require("lgpr")
# Loading required package: lgpr
# Attached lgpr 1.2.5, using rstan 2.32.7. Type ?lgpr to get started.
require("ggplot2")
# Loading required package: ggplot2
require("rstan")
# Loading required package: rstan
# Loading required package: StanHeaders
# 
# rstan version 2.32.7 (Stan version 2.32.2)
# For execution on a local, multicore CPU with excess RAM we recommend calling
# options(mc.cores = parallel::detectCores()).
# To avoid recompilation of unchanged Stan programs, we recommend calling
# rstan_options(auto_write = TRUE)
# For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
# change `threads_per_chain` option:
# rstan_options(threads_per_chain = 1)

1 Introduction

In this tutorial we simulate and analyse count data.

1.1 Binomial data

We first show how to generate binomial data

set.seed(131)
simData <- simulate_data(
  N = 12,
  t_data = seq(12, 72, by = 12),
  covariates = c(2),
  noise_type = "binomial",
  relevances = c(0, 1, 1),
  lengthscales = c(18, 24, 18),
  N_trials = 100,
  t_jitter = 1
)

plot_sim(simData)
# Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
# ℹ Please use tidy evaluation idioms with `aes()`.
# ℹ See also `vignette("ggplot2-in-packages")` for more information.
# ℹ The deprecated feature was likely used in the lgpr package.
#   Please report the issue at <https://github.com/jtimonen/lgpr/issues>.
# This warning is displayed once every 8 hours.
# Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# - Dots are noisy observations of the response var.
# - Line is the true signal mapped through inv. link fun.
#   (and multiplied by number of trials)

1.2 Beta-binomial data

The noise level in the previous example was rather low. To generate more noisy observations, we can draw from the beta-binomial distribution. The overdispersion parameter \(\gamma \in (0,1)\) controls the noise level, so that \(\gamma \rightarrow 0\) would be equivalent to binomial noise (no overdispersion) and larger values mean more noise. Here we set gamma = 0.3.

set.seed(131)
simData <- simulate_data(
  N = 12,
  t_data = seq(12, 72, by = 12),
  covariates = c(2),
  noise_type = "bb",
  gamma = 0.3,
  relevances = c(0, 1, 1),
  lengthscales = c(18, 24, 18),
  N_trials = 100,
  t_jitter = 1
)

plot_sim(simData)
# - Dots are noisy observations of the response var.
# - Line is the true signal mapped through inv. link fun.
#   (and multiplied by number of trials)

dat <- simData@data
str(dat)
# 'data.frame': 72 obs. of  4 variables:
#  $ id : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 2 2 2 2 ...
#  $ age: num  11.2 23.5 37 48 59.5 ...
#  $ z  : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 1 1 1 1 ...
#  $ y  : int  32 26 36 52 99 80 0 21 96 80 ...

1.3 Defining model

To use beta-binomial observation model in our analysis, we use the likelihood argument of lgp(). Here we define a \(\text{Beta}(2,2)\) prior for the gamma parameter.

my_prior <- list(
  alpha = student_t(20),
  gamma = bet(2, 2)
)

2 Fitting the model

We fit a model using the generated beta-binomial data.

fit <- lgp(y ~ age + age | id + age | z,
  data = dat,
  likelihood = "bb",
  num_trials = 100,
  prior = my_prior,
  refresh = 300,
  chains = 3,
  cores = 3,
  control = list(adapt_delta = 0.95),
  iter = 3000
)
print(fit)
# An object of class lgpfit. See ?lgpfit for more info.
# Inference for Stan model: lgp_latent.
# 3 chains, each with iter=3000; warmup=1500; thin=1; 
# post-warmup draws per chain=1500, total post-warmup draws=4500.
# 
#          mean se_mean   sd 2.5%  25%  50%  75% 97.5% n_eff Rhat
# alpha[1] 1.11    0.01 0.49 0.44 0.75 1.01 1.37  2.31  2026    1
# alpha[2] 0.37    0.01 0.28 0.01 0.15 0.31 0.53  1.04   530    1
# alpha[3] 1.30    0.01 0.47 0.63 0.97 1.22 1.55  2.40  1868    1
# ell[1]   0.85    0.01 0.46 0.21 0.51 0.75 1.12  1.88  1119    1
# ell[2]   1.10    0.03 1.49 0.13 0.44 0.69 1.18  4.54  2023    1
# ell[3]   0.74    0.01 0.28 0.26 0.55 0.73 0.91  1.34  1326    1
# gamma[1] 0.27    0.00 0.05 0.14 0.24 0.27 0.30  0.36   631    1
# 
# Samples were drawn using NUTS(diag_e) at Thu Oct 30 23:09:01 2025.
# For each parameter, n_eff is a crude measure of effective sample size,
# and Rhat is the potential scale reduction factor on split chains (at 
# convergence, Rhat=1).

We see that the posterior mean of the gamma parameter is close to the value used in simulation (0.3). We visualize its posterior draws for each MCMC chain:

plot_draws(fit, type = "trace", regex_pars = "gamma")

relevances(fit)
#        gp(age) gp(age)*zs(id)  gp(age)*zs(z)          noise 
#     0.14125061     0.04760887     0.25839685     0.55274366

3 Visualising inferred signal and its components

We subsample 50 draws from the posterior of each component \(f^{(j)}\), \(j=1,2,3\)

draws <- sample.int(1000, 50)

and visualize them

p <- pred(fit, draws = draws)
plot_components(fit, pred = p, alpha = 0.1, color_by = c(NA, NA, "z", "z"))

as well as the sum \(f = f^{(1)} + f^{(2)} + f^{(3)}\)

plot_pred(fit, pred = p, alpha = 0.3)

4 Out-of-sample prediction

t <- seq(2, 100, by = 2)
x_pred <- new_x(dat, t)
p <- pred(fit, x_pred, draws = draws)
# Extrapolating function posterior draws... 
# |   10%|   20%|   30%|   40%|   50%|   60%|   70%|   80%|   90%|  100%| 
# ======================================================================
# c_hat_pred not given, using constant c_hat_pred = 0.14581335876668
# Done.
plot_pred(fit, p, alpha = 0.3)

plot_components(fit, pred = p, alpha = 0.1, color_by = c(NA, NA, "z", "z"))

5 Computing environment

sessionInfo()
# R version 4.4.1 (2024-06-14)
# Platform: aarch64-apple-darwin20
# Running under: macOS Sonoma 14.6.1
# 
# Matrix products: default
# BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
# LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
# 
# locale:
# [1] C/UTF-8/C/C/C/C
# 
# time zone: Europe/Helsinki
# tzcode source: internal
# 
# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
# [1] rstan_2.32.7        StanHeaders_2.32.10 ggplot2_4.0.0      
# [4] lgpr_1.2.5          rmarkdown_2.30     
# 
# loaded via a namespace (and not attached):
#  [1] tensorA_0.36.2.1      sass_0.4.10           generics_0.1.4       
#  [4] stringi_1.8.7         digest_0.6.37         magrittr_2.0.4       
#  [7] evaluate_1.0.5        grid_4.4.1            RColorBrewer_1.1-3   
# [10] fastmap_1.2.0         plyr_1.8.9            jsonlite_2.0.0       
# [13] pkgbuild_1.4.8        backports_1.5.0       gridExtra_2.3        
# [16] QuickJSR_1.8.1        scales_1.4.0          codetools_0.2-20     
# [19] jquerylib_0.1.4       abind_1.4-8           cli_3.6.5            
# [22] crayon_1.5.3          rlang_1.1.6           withr_3.0.2          
# [25] cachem_1.1.0          yaml_2.3.10           tools_4.4.1          
# [28] inline_0.3.21         parallel_4.4.1        reshape2_1.4.4       
# [31] checkmate_2.3.3       rstantools_2.5.0      dplyr_1.1.4          
# [34] colorspace_2.1-2      curl_6.2.3            posterior_1.6.1      
# [37] vctrs_0.6.5           R6_2.6.1              matrixStats_1.5.0    
# [40] stats4_4.4.1          lifecycle_1.0.4       stringr_1.5.2        
# [43] V8_6.0.0              MASS_7.3-60.2         pkgconfig_2.0.3      
# [46] RcppParallel_5.1.11-1 pillar_1.11.1         bslib_0.9.0          
# [49] gtable_0.3.6          loo_2.8.0             glue_1.8.0           
# [52] Rcpp_1.1.0            xfun_0.53             tibble_3.3.0         
# [55] tidyselect_1.2.1      knitr_1.50            farver_2.1.2         
# [58] bayesplot_1.14.0      htmltools_0.5.8.1     labeling_0.4.3       
# [61] compiler_4.4.1        S7_0.2.0              distributional_0.5.0