require("lgpr")
# Loading required package: lgpr
# Attached lgpr 1.2.4, using rstan 2.26.23. Type ?lgpr to get started.
require("ggplot2")
# Loading required package: ggplot2
require("rstan")
# Loading required package: rstan
# Loading required package: StanHeaders
#
# rstan version 2.26.23 (Stan version 2.26.1)
# For execution on a local, multicore CPU with excess RAM we recommend calling
# options(mc.cores = parallel::detectCores()).
# To avoid recompilation of unchanged Stan programs, we recommend calling
# rstan_options(auto_write = TRUE)
# For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
# change `threads_per_chain` option:
# rstan_options(threads_per_chain = 1)
In this tutorial we simulate and analyse a test data set which contains 8 case and 8 control individuals, and the disease effect on case individuals is modeled using the disease-related age (diseaseAge
) as a covariate. The disease-related age is defined as age relative to the observed disease initiation. The disease effect is simulated so that only 4 of the 8 case individuals are affected.
set.seed(1213)
simData <- simulate_data(N = 16,
t_data = seq(12, 72, by = 12),
covariates = c( 0, 2,2,2),
relevances = c(1,1,1, 1,0,0),
lengthscales = c(18,24, 1, 18,18,18),
t_effect_range = c(46,48),
snr = 5,
N_affected = 4,
t_jitter = 0)
plot_sim(simData)
# - Dots are noisy observations of the response var.
# - Line is the true signal mapped through inv. link fun.
# - Solid vert. line is the real effect time (used to generate signal)
# - Dashed vert. line is the 'observed' disease initiation time
#plot_sim.component(simData,3)
Above, the blue line represents the data-generating signal and black dots are noisy observations of the response variable.
dat <- simData@data
str(dat)
# 'data.frame': 96 obs. of 7 variables:
# $ id : Factor w/ 16 levels "1","2","3","4",..: 1 1 1 1 1 1 2 2 2 2 ...
# $ age : num 12 24 36 48 60 72 12 24 36 48 ...
# $ diseaseAge: num -36 -24 -12 0 12 24 -36 -24 -12 0 ...
# $ z1 : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 1 1 1 1 ...
# $ z2 : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
# $ z3 : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 1 1 1 1 ...
# $ y : num 1.8159 0.6226 -0.0437 -1.4838 -3.0214 ...
simData@effect_times
# $true
# 1 2 3 4 5 6 7 8
# 46.26274 46.28976 47.70218 46.76020 46.01252 46.17979 46.02512 47.86529
# 9 10 11 12 13 14 15 16
# NaN NaN NaN NaN NaN NaN NaN NaN
#
# $observed
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
# 48 48 48 48 48 48 48 48 NaN NaN NaN NaN NaN NaN NaN NaN
We will define a formula where the term het(id)*gp_vm(diseaseAge)
declares that the gp_vm
term is heterogeneous and one level-specific magnitude parameter is needed for each level of id
.
formula <- y ~ zs(id)*gp(age) + gp(age) + het(id)*gp_vm(diseaseAge) + zs(z1)*gp(age) + zs(z2)*gp(age) + zs(z3)*gp(age)
Because diseaseAge
is NaN
for the control individuals, it is automatically taken into account that an individual-specific magnitude parameter is actually needed just for each case individual.
fit <- lgp(formula = formula,
data = dat,
prior = list(wrp = igam(14,5)),
iter = 2000,
chains = 4,
cores = 4)
# Computing analytic function posteriors...
# | 10%| 20%| 30%| 40%| 50%| 60%| 70%| 80%| 90%| 100%|
# ======================================================================
# Done.
Printing the fit object summarizes the posterior
print(fit)
# An object of class lgpfit. See ?lgpfit for more info.
# Inference for Stan model: lgp.
# 4 chains, each with iter=2000; warmup=1000; thin=1;
# post-warmup draws per chain=1000, total post-warmup draws=4000.
#
# mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
# alpha[1] 0.51 0.00 0.13 0.28 0.41 0.50 0.59 0.81 1553 1.00
# alpha[2] 0.70 0.01 0.36 0.29 0.45 0.61 0.85 1.58 2920 1.00
# alpha[3] 1.30 0.01 0.56 0.30 0.92 1.23 1.60 2.59 2094 1.00
# alpha[4] 0.42 0.00 0.31 0.04 0.20 0.34 0.54 1.23 3949 1.00
# alpha[5] 0.43 0.01 0.34 0.03 0.19 0.33 0.56 1.33 4341 1.00
# alpha[6] 0.27 0.00 0.27 0.01 0.09 0.19 0.35 1.01 4170 1.00
# ell[1] 1.71 0.02 1.11 0.66 1.09 1.47 2.00 4.29 2068 1.00
# ell[2] 0.72 0.01 0.35 0.15 0.48 0.70 0.93 1.50 2253 1.00
# ell[3] 1.23 0.02 1.28 0.14 0.52 0.94 1.54 3.95 2847 1.00
# ell[4] 2.69 0.05 2.91 0.35 1.05 1.78 3.25 10.07 3421 1.00
# ell[5] 3.00 0.04 2.82 0.32 1.26 2.25 3.79 10.05 4169 1.00
# ell[6] 2.13 0.04 2.23 0.22 0.86 1.48 2.58 7.92 3620 1.00
# wrp[1] 0.39 0.00 0.11 0.23 0.31 0.37 0.44 0.66 3220 1.00
# beta[1,1] 0.87 0.01 0.23 0.02 0.86 0.98 1.00 1.00 1122 1.00
# beta[1,2] 0.71 0.01 0.34 0.00 0.46 0.87 1.00 1.00 2497 1.00
# beta[1,3] 0.80 0.01 0.28 0.02 0.71 0.96 1.00 1.00 2161 1.00
# beta[1,4] 0.64 0.01 0.36 0.00 0.30 0.77 0.99 1.00 2993 1.00
# beta[1,5] 0.06 0.01 0.19 0.00 0.00 0.00 0.01 0.95 694 1.00
# beta[1,6] 0.10 0.01 0.24 0.00 0.00 0.00 0.05 0.99 1003 1.00
# beta[1,7] 0.07 0.01 0.20 0.00 0.00 0.00 0.02 0.96 842 1.00
# beta[1,8] 0.06 0.01 0.19 0.00 0.00 0.00 0.01 0.94 749 1.01
# sigma[1] 0.48 0.00 0.05 0.38 0.44 0.48 0.51 0.60 3936 1.00
#
# Samples were drawn using NUTS(diag_e) at Sun Sep 24 09:28:54 2023.
# For each parameter, n_eff is a crude measure of effective sample size,
# and Rhat is the potential scale reduction factor on split chains (at
# convergence, Rhat=1).
Printing the model information clarifies the model and priors
model_summary(fit)
# Formula: y ~ zs(id) * gp(age) + gp(age) + het(id) * gp_vm(diseaseAge) + zs(z1) * gp(age) + zs(z2) * gp(age) + zs(z3) * gp(age)
# Likelihood: gaussian
# Data: 96 observations, 7 variables
#
# Component type ker het ns vm unc cat cont
# 1 zs(id)*gp(age) 2 0 0 0 0 0 1 1
# 2 gp(age) 1 0 0 0 0 0 0 1
# 3 het(id)*gp_vm(diseaseAge) 1 0 1 1 1 0 0 2
# 4 zs(z1)*gp(age) 2 0 0 0 0 0 2 1
# 5 zs(z2)*gp(age) 2 0 0 0 0 0 3 1
# 6 zs(z3)*gp(age) 2 0 0 0 0 0 4 1
#
# Variable #Missing
# 1 age 0
# 2 diseaseAge 48
#
# Factor #Levels Values
# 1 id 16 ...
# 2 z1 2 1, 2
# 3 z2 2 1, 2
# 4 z3 2 1, 2
#
# Parameter Bounds Prior
# 1 alpha[1] [0, Inf) alpha[1] ~ student-t(20)
# 2 alpha[2] [0, Inf) alpha[2] ~ student-t(20)
# 3 alpha[3] [0, Inf) alpha[3] ~ student-t(20)
# 4 alpha[4] [0, Inf) alpha[4] ~ student-t(20)
# 5 alpha[5] [0, Inf) alpha[5] ~ student-t(20)
# 6 alpha[6] [0, Inf) alpha[6] ~ student-t(20)
# 7 ell[1] [0, Inf) ell[1] ~ log-normal(0,1)
# 8 ell[2] [0, Inf) ell[2] ~ log-normal(0,1)
# 9 ell[3] [0, Inf) ell[3] ~ log-normal(0,1)
# 10 ell[4] [0, Inf) ell[4] ~ log-normal(0,1)
# 11 ell[5] [0, Inf) ell[5] ~ log-normal(0,1)
# 12 ell[6] [0, Inf) ell[6] ~ log-normal(0,1)
# 13 wrp[1] [0, Inf) wrp[1] ~ inv-gamma(14,5)
# 14 sigma[1] [0, Inf) (sigma[1])^2 ~ inv-gamma(2,1)
# 15 beta[1-8] [0, 1] beta[1-8] ~ beta(0.2, 0.2)
#
#
# id 1 2 3 4 5 6 7 8
# beta_or_teff_param_idx 1 2 3 4 5 6 7 8
#
# Created on Sun Sep 24 09:09:06 2023 with lgpr 1.2.4.
We can visualize the posterior distribution of the individual-specific disease effect magnitude parameters for each case individual. We see that for individuals 1-4 the parameters are close to 1 (affected individuals) and for individuals 5-8 there is considerable posterior mass close to 0.
plot_beta(fit)
#
# id 1 2 3 4 5 6 7 8
# beta_or_teff_param_idx 1 2 3 4 5 6 7 8
Finally we plot the inferred disease component, using posterior median hyperparameters, and see that the inferred effects have the same shape for all individuals, but are heterogeneous in magnitude.
t <- seq(0, 100, by = 1)
x_pred <- new_x(dat, t, x_ns = "diseaseAge")
p <- pred(fit, x_pred, verbose = FALSE, reduce = stats::median)
plot_f(fit, pred = p, comp_idx = 3, color_by = 'diseaseAge')
sessionInfo()
# R version 4.1.2 (2021-11-01)
# Platform: x86_64-pc-linux-gnu (64-bit)
# Running under: Linux Mint 21.1
#
# Matrix products: default
# BLAS: /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
# LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
#
# locale:
# [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
# [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
# [5] LC_MONETARY=fi_FI.UTF-8 LC_MESSAGES=en_US.UTF-8
# [7] LC_PAPER=fi_FI.UTF-8 LC_NAME=C
# [9] LC_ADDRESS=C LC_TELEPHONE=C
# [11] LC_MEASUREMENT=fi_FI.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
#
# other attached packages:
# [1] rstan_2.26.23 StanHeaders_2.26.28 ggplot2_3.4.3
# [4] lgpr_1.2.4 rmarkdown_2.23
#
# loaded via a namespace (and not attached):
# [1] tidyselect_1.2.0 xfun_0.39 QuickJSR_1.0.6
# [4] bslib_0.5.0 reshape2_1.4.4 colorspace_2.1-0
# [7] vctrs_0.6.3 generics_0.1.3 htmltools_0.5.5
# [10] stats4_4.1.2 loo_2.6.0 yaml_2.3.7
# [13] utf8_1.2.3 rlang_1.1.1 pkgbuild_1.4.2
# [16] jquerylib_0.1.4 pillar_1.9.0 glue_1.6.2
# [19] withr_2.5.0 distributional_0.3.2 plyr_1.8.8
# [22] matrixStats_1.0.0 lifecycle_1.0.3 stringr_1.5.0
# [25] posterior_1.4.1 munsell_0.5.0 gtable_0.3.4
# [28] codetools_0.2-18 evaluate_0.21 labeling_0.4.3
# [31] inline_0.3.19 knitr_1.43 callr_3.7.3
# [34] fastmap_1.1.1 ps_1.7.5 parallel_4.1.2
# [37] fansi_1.0.4 bayesplot_1.10.0 highr_0.10
# [40] rstantools_2.3.1.1 Rcpp_1.0.11 backports_1.4.1
# [43] checkmate_2.2.0 scales_1.2.1 cachem_1.0.8
# [46] RcppParallel_5.1.7 jsonlite_1.8.7 abind_1.4-5
# [49] farver_2.1.1 gridExtra_2.3 tensorA_0.36.2
# [52] digest_0.6.33 stringi_1.7.12 processx_3.8.2
# [55] dplyr_1.1.3 grid_4.1.2 cli_3.6.1
# [58] tools_4.1.2 magrittr_2.0.3 sass_0.4.6
# [61] tibble_3.2.1 crayon_1.5.2 pkgconfig_2.0.3
# [64] MASS_7.3-55 prettyunits_1.1.1 R6_2.5.1
# [67] compiler_4.1.2