require("lgpr")
# Loading required package: lgpr
# Attached lgpr 1.2.4, using rstan 2.26.23. Type ?lgpr to get started.
require("ggplot2")
# Loading required package: ggplot2
require("rstan")
# Loading required package: rstan
# Loading required package: StanHeaders
# 
# rstan version 2.26.23 (Stan version 2.26.1)
# For execution on a local, multicore CPU with excess RAM we recommend calling
# options(mc.cores = parallel::detectCores()).
# To avoid recompilation of unchanged Stan programs, we recommend calling
# rstan_options(auto_write = TRUE)
# For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
# change `threads_per_chain` option:
# rstan_options(threads_per_chain = 1)

1 Introduction

In this tutorial we simulate and analyse a test data set which contains 8 case and 8 control individuals, and the disease effect on case individuals is modeled using the disease-related age (diseaseAge) as a covariate. The disease-related age is defined as age relative to the observed disease initiation. The disease effect is simulated so that only 4 of the 8 case individuals are affected.

set.seed(1213)
simData   <- simulate_data(N            = 16,
                           t_data       = seq(12, 72, by = 12),
                           covariates   = c(    0, 2,2,2),
                           relevances   = c(1,1,1, 1,0,0),
                           lengthscales = c(18,24, 1, 18,18,18),
                           t_effect_range  = c(46,48),
                           snr          = 5,
                           N_affected   = 4,
                           t_jitter     = 0)

plot_sim(simData)
# - Dots are noisy observations of the response var.
# - Line is the true signal mapped through inv. link fun.
# - Solid vert. line is the real effect time (used to generate signal) 
# - Dashed vert. line is the 'observed' disease initiation time

#plot_sim.component(simData,3)

Above, the blue line represents the data-generating signal and black dots are noisy observations of the response variable.

dat <- simData@data
str(dat)
# 'data.frame': 96 obs. of  7 variables:
#  $ id        : Factor w/ 16 levels "1","2","3","4",..: 1 1 1 1 1 1 2 2 2 2 ...
#  $ age       : num  12 24 36 48 60 72 12 24 36 48 ...
#  $ diseaseAge: num  -36 -24 -12 0 12 24 -36 -24 -12 0 ...
#  $ z1        : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 1 1 1 1 ...
#  $ z2        : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
#  $ z3        : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 1 1 1 1 ...
#  $ y         : num  1.8159 0.6226 -0.0437 -1.4838 -3.0214 ...
simData@effect_times
# $true
#        1        2        3        4        5        6        7        8 
# 46.26274 46.28976 47.70218 46.76020 46.01252 46.17979 46.02512 47.86529 
#        9       10       11       12       13       14       15       16 
#      NaN      NaN      NaN      NaN      NaN      NaN      NaN      NaN 
# 
# $observed
#   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
#  48  48  48  48  48  48  48  48 NaN NaN NaN NaN NaN NaN NaN NaN

2 Declaring a heterogeneous component

We will define a formula where the term het(id)*gp_vm(diseaseAge) declares that the gp_vm term is heterogeneous and one level-specific magnitude parameter is needed for each level of id.

formula <- y ~ zs(id)*gp(age) + gp(age) + het(id)*gp_vm(diseaseAge) + zs(z1)*gp(age) + zs(z2)*gp(age) + zs(z3)*gp(age)

Because diseaseAge is NaN for the control individuals, it is automatically taken into account that an individual-specific magnitude parameter is actually needed just for each case individual.

3 Fitting the model

fit <- lgp(formula       = formula,
            data         = dat,
            prior        = list(wrp = igam(14,5)),
            iter         = 2000,
            chains       = 4,
            cores        = 4)
# Computing analytic function posteriors... 
# |   10%|   20%|   30%|   40%|   50%|   60%|   70%|   80%|   90%|  100%| 
# ======================================================================  
# Done.

Printing the fit object summarizes the posterior

print(fit)
# An object of class lgpfit. See ?lgpfit for more info.
# Inference for Stan model: lgp.
# 4 chains, each with iter=2000; warmup=1000; thin=1; 
# post-warmup draws per chain=1000, total post-warmup draws=4000.
# 
#           mean se_mean   sd 2.5%  25%  50%  75% 97.5% n_eff Rhat
# alpha[1]  0.51    0.00 0.13 0.28 0.41 0.50 0.59  0.81  1553 1.00
# alpha[2]  0.70    0.01 0.36 0.29 0.45 0.61 0.85  1.58  2920 1.00
# alpha[3]  1.30    0.01 0.56 0.30 0.92 1.23 1.60  2.59  2094 1.00
# alpha[4]  0.42    0.00 0.31 0.04 0.20 0.34 0.54  1.23  3949 1.00
# alpha[5]  0.43    0.01 0.34 0.03 0.19 0.33 0.56  1.33  4341 1.00
# alpha[6]  0.27    0.00 0.27 0.01 0.09 0.19 0.35  1.01  4170 1.00
# ell[1]    1.71    0.02 1.11 0.66 1.09 1.47 2.00  4.29  2068 1.00
# ell[2]    0.72    0.01 0.35 0.15 0.48 0.70 0.93  1.50  2253 1.00
# ell[3]    1.23    0.02 1.28 0.14 0.52 0.94 1.54  3.95  2847 1.00
# ell[4]    2.69    0.05 2.91 0.35 1.05 1.78 3.25 10.07  3421 1.00
# ell[5]    3.00    0.04 2.82 0.32 1.26 2.25 3.79 10.05  4169 1.00
# ell[6]    2.13    0.04 2.23 0.22 0.86 1.48 2.58  7.92  3620 1.00
# wrp[1]    0.39    0.00 0.11 0.23 0.31 0.37 0.44  0.66  3220 1.00
# beta[1,1] 0.87    0.01 0.23 0.02 0.86 0.98 1.00  1.00  1122 1.00
# beta[1,2] 0.71    0.01 0.34 0.00 0.46 0.87 1.00  1.00  2497 1.00
# beta[1,3] 0.80    0.01 0.28 0.02 0.71 0.96 1.00  1.00  2161 1.00
# beta[1,4] 0.64    0.01 0.36 0.00 0.30 0.77 0.99  1.00  2993 1.00
# beta[1,5] 0.06    0.01 0.19 0.00 0.00 0.00 0.01  0.95   694 1.00
# beta[1,6] 0.10    0.01 0.24 0.00 0.00 0.00 0.05  0.99  1003 1.00
# beta[1,7] 0.07    0.01 0.20 0.00 0.00 0.00 0.02  0.96   842 1.00
# beta[1,8] 0.06    0.01 0.19 0.00 0.00 0.00 0.01  0.94   749 1.01
# sigma[1]  0.48    0.00 0.05 0.38 0.44 0.48 0.51  0.60  3936 1.00
# 
# Samples were drawn using NUTS(diag_e) at Sun Sep 24 09:28:54 2023.
# For each parameter, n_eff is a crude measure of effective sample size,
# and Rhat is the potential scale reduction factor on split chains (at 
# convergence, Rhat=1).

Printing the model information clarifies the model and priors

model_summary(fit)
# Formula: y ~ zs(id) * gp(age) + gp(age) + het(id) * gp_vm(diseaseAge) + zs(z1) * gp(age) + zs(z2) * gp(age) + zs(z3) * gp(age)
# Likelihood: gaussian
# Data: 96 observations, 7 variables
# 
#                   Component type ker het ns vm unc cat cont
# 1            zs(id)*gp(age)    2   0   0  0  0   0   1    1
# 2                   gp(age)    1   0   0  0  0   0   0    1
# 3 het(id)*gp_vm(diseaseAge)    1   0   1  1  1   0   0    2
# 4            zs(z1)*gp(age)    2   0   0  0  0   0   2    1
# 5            zs(z2)*gp(age)    2   0   0  0  0   0   3    1
# 6            zs(z3)*gp(age)    2   0   0  0  0   0   4    1
# 
#     Variable #Missing
# 1        age        0
# 2 diseaseAge       48
# 
#   Factor #Levels Values
# 1     id      16    ...
# 2     z1       2   1, 2
# 3     z2       2   1, 2
# 4     z3       2   1, 2
# 
#    Parameter   Bounds                         Prior
# 1   alpha[1] [0, Inf)      alpha[1] ~ student-t(20)
# 2   alpha[2] [0, Inf)      alpha[2] ~ student-t(20)
# 3   alpha[3] [0, Inf)      alpha[3] ~ student-t(20)
# 4   alpha[4] [0, Inf)      alpha[4] ~ student-t(20)
# 5   alpha[5] [0, Inf)      alpha[5] ~ student-t(20)
# 6   alpha[6] [0, Inf)      alpha[6] ~ student-t(20)
# 7     ell[1] [0, Inf)      ell[1] ~ log-normal(0,1)
# 8     ell[2] [0, Inf)      ell[2] ~ log-normal(0,1)
# 9     ell[3] [0, Inf)      ell[3] ~ log-normal(0,1)
# 10    ell[4] [0, Inf)      ell[4] ~ log-normal(0,1)
# 11    ell[5] [0, Inf)      ell[5] ~ log-normal(0,1)
# 12    ell[6] [0, Inf)      ell[6] ~ log-normal(0,1)
# 13    wrp[1] [0, Inf)      wrp[1] ~ inv-gamma(14,5)
# 14  sigma[1] [0, Inf) (sigma[1])^2 ~ inv-gamma(2,1)
# 15 beta[1-8]   [0, 1]    beta[1-8] ~ beta(0.2, 0.2)
# 
#                                       
# id                     1 2 3 4 5 6 7 8
# beta_or_teff_param_idx 1 2 3 4 5 6 7 8
# 
# Created on Sun Sep 24 09:09:06 2023 with lgpr 1.2.4.

3.1 Visualizing the individual-specific effect magnitudes

We can visualize the posterior distribution of the individual-specific disease effect magnitude parameters for each case individual. We see that for individuals 1-4 the parameters are close to 1 (affected individuals) and for individuals 5-8 there is considerable posterior mass close to 0.

plot_beta(fit)
#                                       
# id                     1 2 3 4 5 6 7 8
# beta_or_teff_param_idx 1 2 3 4 5 6 7 8

4 Visualizing the inferred heterogeneous effect

Finally we plot the inferred disease component, using posterior median hyperparameters, and see that the inferred effects have the same shape for all individuals, but are heterogeneous in magnitude.

t <- seq(0, 100, by = 1)
x_pred <- new_x(dat, t, x_ns = "diseaseAge")
p <- pred(fit, x_pred, verbose = FALSE, reduce = stats::median)
plot_f(fit, pred = p, comp_idx = 3, color_by = 'diseaseAge')

5 Computing environment

sessionInfo()
# R version 4.1.2 (2021-11-01)
# Platform: x86_64-pc-linux-gnu (64-bit)
# Running under: Linux Mint 21.1
# 
# Matrix products: default
# BLAS:   /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
# LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
# 
# locale:
#  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#  [5] LC_MONETARY=fi_FI.UTF-8    LC_MESSAGES=en_US.UTF-8   
#  [7] LC_PAPER=fi_FI.UTF-8       LC_NAME=C                 
#  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
# [11] LC_MEASUREMENT=fi_FI.UTF-8 LC_IDENTIFICATION=C       
# 
# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods   base     
# 
# other attached packages:
# [1] rstan_2.26.23       StanHeaders_2.26.28 ggplot2_3.4.3      
# [4] lgpr_1.2.4          rmarkdown_2.23     
# 
# loaded via a namespace (and not attached):
#  [1] tidyselect_1.2.0     xfun_0.39            QuickJSR_1.0.6      
#  [4] bslib_0.5.0          reshape2_1.4.4       colorspace_2.1-0    
#  [7] vctrs_0.6.3          generics_0.1.3       htmltools_0.5.5     
# [10] stats4_4.1.2         loo_2.6.0            yaml_2.3.7          
# [13] utf8_1.2.3           rlang_1.1.1          pkgbuild_1.4.2      
# [16] jquerylib_0.1.4      pillar_1.9.0         glue_1.6.2          
# [19] withr_2.5.0          distributional_0.3.2 plyr_1.8.8          
# [22] matrixStats_1.0.0    lifecycle_1.0.3      stringr_1.5.0       
# [25] posterior_1.4.1      munsell_0.5.0        gtable_0.3.4        
# [28] codetools_0.2-18     evaluate_0.21        labeling_0.4.3      
# [31] inline_0.3.19        knitr_1.43           callr_3.7.3         
# [34] fastmap_1.1.1        ps_1.7.5             parallel_4.1.2      
# [37] fansi_1.0.4          bayesplot_1.10.0     highr_0.10          
# [40] rstantools_2.3.1.1   Rcpp_1.0.11          backports_1.4.1     
# [43] checkmate_2.2.0      scales_1.2.1         cachem_1.0.8        
# [46] RcppParallel_5.1.7   jsonlite_1.8.7       abind_1.4-5         
# [49] farver_2.1.1         gridExtra_2.3        tensorA_0.36.2      
# [52] digest_0.6.33        stringi_1.7.12       processx_3.8.2      
# [55] dplyr_1.1.3          grid_4.1.2           cli_3.6.1           
# [58] tools_4.1.2          magrittr_2.0.3       sass_0.4.6          
# [61] tibble_3.2.1         crayon_1.5.2         pkgconfig_2.0.3     
# [64] MASS_7.3-55          prettyunits_1.1.1    R6_2.5.1            
# [67] compiler_4.1.2

Disease effect heterogeneity modeling in lgpr

Juho Timonen

2023-09-24