require("lgpr")
# Loading required package: lgpr
# Attached lgpr 1.2.4, using rstan 2.26.23. Type ?lgpr to get started.
require("ggplot2")
# Loading required package: ggplot2
require("rstan")
# Loading required package: rstan
# Loading required package: StanHeaders
# 
# rstan version 2.26.23 (Stan version 2.26.1)
# For execution on a local, multicore CPU with excess RAM we recommend calling
# options(mc.cores = parallel::detectCores()).
# To avoid recompilation of unchanged Stan programs, we recommend calling
# rstan_options(auto_write = TRUE)
# For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
# change `threads_per_chain` option:
# rstan_options(threads_per_chain = 1)
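
Following the advice printed above, we could also enable parallel sampling across chains and cache compiled Stan programs (optional here, since lgp() below sets its own cores argument):

options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)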

1 Introduction

In this tutorial we simulate and analyse count data.

1.1 Binomial data

We first show how to generate binomial data:

set.seed(131)
simData   <- simulate_data(N            = 12,
                           t_data       = seq(12, 72, by = 12),
                           covariates   = c(2),
                           noise_type   = "binomial",
                           relevances   = c(0,1,1),
                           lengthscales = c(18,24,18),
                           N_trials     = 100,
                           t_jitter     = 1)

plot_sim(simData)
# - Dots are noisy observations of the response var.
# - Line is the true signal mapped through inv. link fun.
#   (and multiplied by number of trials)

1.2 Beta-binomial data

The noise level in the previous example was rather low. To generate noisier observations, we can draw from the beta-binomial distribution. The overdispersion parameter \(\gamma \in (0,1)\) controls the noise level: \(\gamma \rightarrow 0\) corresponds to binomial noise (no overdispersion), while larger values mean more noise. Here we set gamma = 0.3.
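
To get intuition for what \(\gamma\) does, we can draw beta-binomial counts by hand using a common mean/overdispersion parameterization, in which the per-observation success probability is drawn from \(\text{Beta}\big(\rho(1/\gamma - 1),\,(1-\rho)(1/\gamma - 1)\big)\). This is only an illustration and not necessarily the exact parameterization that lgpr uses internally.

# Illustration only (assumed parameterization, see text above): draw counts with
# a given number of trials, mean success probability rho, and overdispersion gamma.
rbetabinom <- function(n, size, rho, gamma) {
  a <- rho * (1 / gamma - 1)
  b <- (1 - rho) * (1 / gamma - 1)
  p <- rbeta(n, a, b)  # per-observation success probability
  rbinom(n, size, p)
}
set.seed(1)
sd(rbinom(1e4, 100, 0.5))            # pure binomial noise
sd(rbetabinom(1e4, 100, 0.5, 0.3))   # gamma = 0.3: clearly more spread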

set.seed(131)
simData   <- simulate_data(N            = 12,
                           t_data       = seq(12, 72, by = 12),
                           covariates   = c(2),
                           noise_type   = "bb",
                           gamma        = 0.3,
                           relevances   = c(0,1,1),
                           lengthscales = c(18,24,18),
                           N_trials     = 100,
                           t_jitter     = 1)

plot_sim(simData)
# - Dots are noisy observations of the response var.
# - Line is the true signal mapped through inv. link fun.
#   (and multiplied by number of trials)

dat <- simData@data
str(dat)
# 'data.frame': 72 obs. of  4 variables:
#  $ id : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 2 2 2 2 ...
#  $ age: num  11.2 23.5 37 48 59.5 ...
#  $ z  : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 1 1 1 1 ...
#  $ y  : int  29 34 30 45 100 84 1 55 96 77 ...
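
The str() output suggests that z is an individual-level covariate (constant within each id), which we can confirm with a quick cross-tabulation:

table(dat$id, dat$z)  # each id should have all of its 6 observations in a single z level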

1.3 Defining the model

To use the beta-binomial observation model in our analysis, we set the likelihood argument of lgp() to "bb". Here we also define a \(\text{Beta}(2,2)\) prior for the \(\gamma\) parameter and a Student-\(t\) prior with 20 degrees of freedom for the \(\alpha\) (magnitude) parameters.

my_prior <- list(
  alpha = student_t(20),  # Student-t prior (20 df) for component magnitudes
  gamma = bet(2, 2)       # Beta(2,2) prior for the overdispersion parameter
)
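
To see what this prior implies for \(\gamma\), we can plot the Beta(2,2) density over its support \((0,1)\) using base R; it is symmetric around 0.5 and places little mass near the endpoints:

curve(dbeta(x, 2, 2), from = 0, to = 1,
      xlab = "gamma", ylab = "prior density")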

2 Fitting the model

We fit a model using the generated beta-binomial data.

fit <- lgp(y ~ age + age|id + age|z,  # shared age effect + id- and z-specific age effects
           data = dat,
           likelihood = "bb",         # beta-binomial observation model
           num_trials = 100,          # number of trials for each observation
           prior = my_prior,
           refresh = 300,
           chains = 3,
           cores = 3,
           control = list(adapt_delta = 0.95),
           iter = 3000)
print(fit)
# An object of class lgpfit. See ?lgpfit for more info.
# Inference for Stan model: lgp_latent.
# 3 chains, each with iter=3000; warmup=1500; thin=1; 
# post-warmup draws per chain=1500, total post-warmup draws=4500.
# 
#          mean se_mean   sd 2.5%  25%  50%  75% 97.5% n_eff Rhat
# alpha[1] 1.13    0.01 0.48 0.48 0.79 1.04 1.39  2.30  2001    1
# alpha[2] 0.34    0.01 0.26 0.01 0.14 0.29 0.48  0.99   609    1
# alpha[3] 1.17    0.01 0.45 0.53 0.85 1.09 1.40  2.24  2011    1
# ell[1]   0.83    0.01 0.42 0.21 0.52 0.77 1.09  1.77  1398    1
# ell[2]   1.41    0.04 2.10 0.11 0.37 0.78 1.69  6.39  2312    1
# ell[3]   0.73    0.01 0.29 0.24 0.53 0.71 0.89  1.37  1320    1
# gamma[1] 0.30    0.00 0.05 0.18 0.27 0.31 0.34  0.40   793    1
# 
# Samples were drawn using NUTS(diag_e) at Sun Sep 24 10:17:54 2023.
# For each parameter, n_eff is a crude measure of effective sample size,
# and Rhat is the potential scale reduction factor on split chains (at 
# convergence, Rhat=1).
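
Because adapt_delta was raised to 0.95, it is also worth verifying that sampling finished without divergent transitions or other HMC warnings. A minimal check, assuming the underlying rstan fit is stored in the stan_fit slot of the lgpfit object:

# Assumption: fit@stan_fit holds the rstan stanfit object.
rstan::check_hmc_diagnostics(fit@stan_fit)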

We see that the posterior mean of the gamma parameter is close to the value used in simulation (0.3). We visualize its posterior draws for each MCMC chain:

plot_draws(fit, type = "trace", regex_pars = "gamma")

relevances(fit)
#        gp(age) gp(age)*zs(id)  gp(age)*zs(z)          noise 
#      0.1751555      0.0460864      0.2219434      0.5568147
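
Most of the non-noise relevance goes to the shared age effect and the age-z interaction, while the id-specific component is small, consistent with the simulation where the id component was given zero relevance. lgpr can also turn these relevances into a formal component selection; a minimal sketch, assuming the select() function exported by lgpr with its default threshold:

# Sketch: select components based on their posterior relevances (see ?select).
select(fit)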

3 Visualising the inferred signal and its components

We subsample 50 draws from the posterior of each component \(f^{(j)}\), \(j=1,2,3\)

draws <- sample.int(4500, 50)  # 50 random indices out of the 4500 post-warmup draws

and visualize them

p <- pred(fit, draws = draws)
plot_components(fit, pred = p, alpha = 0.1, color_by = c(NA, NA, "z", "z"))

as well as the sum \(f = f^{(1)} + f^{(2)} + f^{(3)}\)

plot_pred(fit, pred = p, alpha = 0.3)

4 Out-of-sample prediction

We then compute predictions on a denser time grid that also extends beyond the observed time points.

t <- seq(2, 100, by = 2)   # denser time grid from 2 to 100
x_pred <- new_x(dat, t)    # create prediction inputs at these time points
p <- pred(fit, x_pred, draws = draws)
# Extrapolating function posterior draws... 
# |   10%|   20%|   30%|   40%|   50%|   60%|   70%|   80%|   90%|  100%| 
# ======================================================================
# c_hat_pred not given, using constant c_hat_pred = 0.0155558692386378
# Done.
plot_pred(fit, p, alpha = 0.3)

plot_components(fit, pred = p, alpha = 0.1, color_by = c(NA, NA, "z", "z"))

5 Computing environment

sessionInfo()
# R version 4.1.2 (2021-11-01)
# Platform: x86_64-pc-linux-gnu (64-bit)
# Running under: Linux Mint 21.1
# 
# Matrix products: default
# BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
# LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
# 
# locale:
#  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#  [5] LC_MONETARY=fi_FI.UTF-8    LC_MESSAGES=en_US.UTF-8   
#  [7] LC_PAPER=fi_FI.UTF-8       LC_NAME=C                 
#  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
# [11] LC_MEASUREMENT=fi_FI.UTF-8 LC_IDENTIFICATION=C       
# 
# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
# [1] rstan_2.26.23       StanHeaders_2.26.28 ggplot2_3.4.3      
# [4] lgpr_1.2.4          rmarkdown_2.23     
# 
# loaded via a namespace (and not attached):
#  [1] tidyselect_1.2.0     xfun_0.39            QuickJSR_1.0.6      
#  [4] bslib_0.5.0          reshape2_1.4.4       colorspace_2.1-0    
#  [7] vctrs_0.6.3          generics_0.1.3       htmltools_0.5.5     
# [10] stats4_4.1.2         loo_2.6.0            yaml_2.3.7          
# [13] utf8_1.2.3           rlang_1.1.1          pkgbuild_1.4.2      
# [16] jquerylib_0.1.4      pillar_1.9.0         glue_1.6.2          
# [19] withr_2.5.0          distributional_0.3.2 plyr_1.8.8          
# [22] matrixStats_1.0.0    lifecycle_1.0.3      stringr_1.5.0       
# [25] posterior_1.4.1      munsell_0.5.0        gtable_0.3.4        
# [28] codetools_0.2-18     evaluate_0.21        labeling_0.4.3      
# [31] inline_0.3.19        knitr_1.43           callr_3.7.3         
# [34] fastmap_1.1.1        ps_1.7.5             parallel_4.1.2      
# [37] fansi_1.0.4          bayesplot_1.10.0     highr_0.10          
# [40] rstantools_2.3.1.1   Rcpp_1.0.11          backports_1.4.1     
# [43] checkmate_2.2.0      scales_1.2.1         cachem_1.0.8        
# [46] RcppParallel_5.1.7   jsonlite_1.8.7       abind_1.4-5         
# [49] farver_2.1.1         gridExtra_2.3        tensorA_0.36.2      
# [52] digest_0.6.33        stringi_1.7.12       processx_3.8.2      
# [55] dplyr_1.1.3          grid_4.1.2           cli_3.6.1           
# [58] tools_4.1.2          magrittr_2.0.3       sass_0.4.6          
# [61] tibble_3.2.1         crayon_1.5.2         pkgconfig_2.0.3     
# [64] MASS_7.3-55          prettyunits_1.1.1    R6_2.5.1            
# [67] compiler_4.1.2