Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.
# S3 method for class 'lvm'
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)Model object
Additional arguments to be passed to the low level functions
Number of simulated values/individuals
Parameter value (optional)
Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition
for internal use
Default residual variance (1)
Default covariance parameter (0.5)
Optional matrix of fixed values of variables (manipulation)
Return Inverse link transformed data
Include latent variables (default TRUE)
convert categorical variables to factors before applying transformation
Random seed
##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#> y x z
#> 1 1 2.5577940 1
#> 2 1 0.6289062 1
#> 3 0 -2.5114518 0
#> 4 0 -0.6517820 0
#> 5 1 1.9549094 1
#> 6 0 0.3479851 0
e <- estimate(m,d,estimator="glm")
e
#> Estimate Std. Error Z-value P-value
#> Regressions:
#> y~x 0.98403 0.08625 11.40896 <1e-12
#> y~z 1.16438 0.16804 6.92926 4.23e-12
#> x~z 1.00992 0.06547 15.42639 <1e-12
#> Intercepts:
#> y -0.16286 0.09836 -1.65584 0.09775
#> x -0.00032 0.04669 -0.00675 0.9946
#> Dispersion:
#> x 1.07372
## Simulate a few observation from estimated model
sim(e,n=5)
#> y x z
#> 1 1 1.4902578 1
#> 2 0 -1.4551789 1
#> 3 1 1.8096418 1
#> 4 0 -1.4846894 0
#> 5 1 0.9795036 1
##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#> y x z
#> 1 14 1.0060825 1
#> 2 4 0.6183958 1
#> 3 7 0.4521507 1
#> 4 2 0.2320851 1
#> 5 121 2.4243668 1
#> 6 0 -1.2190155 1
estimate(m,d,estimator="glm")
#> Estimate Std. Error Z-value P-value
#> Regressions:
#> y~x 1.99872 0.00200 1001.16217 <1e-12
#> y~z 0.98636 0.01193 82.70802 <1e-12
#> x~z 1.01006 0.02265 44.59182 <1e-12
#> Intercepts:
#> y -0.98042 0.01237 -79.23054 <1e-12
#> x -0.00907 0.01940 -0.46750 0.6401
#> Dispersion:
#> x 1.00426
mean(d$z); lava:::expit(1)
#> [1] 0.7256
#> [1] 0.7310586
summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#>
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -5.2252 -0.9823 0.0325 0.9402 5.4313
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.98956 0.04439 22.29 <2e-16 ***
#> x 4.03106 0.04417 91.25 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 1.402 on 998 degrees of freedom
#> Multiple R-squared: 0.893, Adjusted R-squared: 0.8929
#> F-statistic: 8327 on 1 and 998 DF, p-value: < 2.2e-16
#>
##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#>
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.499081 0.005034 99.14 <2e-16 ***
#> x 1.036826 0.016498 62.85 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> (Dispersion parameter for Gamma family taken to be 0.5158464)
#>
#> Null deviance: 8541.5 on 9999 degrees of freedom
#> Residual deviance: 5551.2 on 9998 degrees of freedom
#> AIC: 20680
#>
#> Number of Fisher Scoring iterations: 6
#>
if (FALSE) MASS::gamma.shape(g) # \dontrun{}
args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE,
#> log = FALSE, ...)
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#> [1] -0.06146023 -0.15297105 -1.55859772 -2.33899184 -0.13261446 -0.22364401
#> [7] -0.13945111 0.31993208 0.19293624 0.17868933
##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#> y
#> y 1.187236
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#> y
#> y 0.05303781
##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#>
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -2.9770 -0.6711 -0.0585 0.7070 3.6556
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.007315 0.061044 0.120 0.905
#> x 0.972089 0.047008 20.679 <2e-16 ***
#> z 0.995315 0.054426 18.287 <2e-16 ***
#> I(z > 0)TRUE 0.025211 0.105441 0.239 0.811
#> x:I(z > 0)TRUE 1.021637 0.065233 15.661 <2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 1.014 on 995 degrees of freedom
#> Multiple R-squared: 0.7703, Adjusted R-squared: 0.7694
#> F-statistic: 834.4 on 4 and 995 DF, p-value: < 2.2e-16
#>
##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
Binary.lvm(), ## Vector of ones
Binary.lvm(0.5), ## 0.5n 0, 0.5n 1
Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#> x z v w
#> 1 0.0000000 1 0 0
#> 2 0.5555556 1 0 0
#> 3 1.1111111 1 0 1
#> 4 1.6666667 1 0 1
#> 5 2.2222222 1 0 1
#> 6 2.7777778 1 1 0
#> 7 3.3333333 1 1 0
#> 8 3.8888889 1 1 1
#> 9 4.4444444 1 1 1
#> 10 5.0000000 1 1 1
##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)
if (FALSE) { # \dontrun{
d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$t)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
method="linear")
curve(L,0,100,add=TRUE,col="blue")
} # }
##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
loggamma.lvm(rate=1,shape=1))
if (FALSE) { # \dontrun{
d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
resample.iid=0,robust=0),spec=1)
L <- approxfun(c(cuts,max(d$y)),f=1,
cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
method="linear")
curve(L,0,100,add=TRUE,col="blue")
} # }
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#> y x z t
#> 1 0.4641322 0.90979684 0.9450201 4.386143e-01
#> 2 0.5780113 1.11989582 0.3499724 2.022880e-01
#> 3 0.1906364 0.01722635 0.7479131 1.425794e-01
#> 4 1.0765679 -0.04989872 0.2386936 2.569699e-01
#> 5 1.6427577 0.54597959 0.8221244 1.350551e+00
#> 6 -41.9514857 -0.31551318 4.8776866 -2.046262e+02
#> 7 1.2838462 1.25150386 2.0931514 2.687285e+00
#> 8 1.6555105 0.40068497 0.7282096 1.205559e+00
#> 9 1.4386263 0.07996742 10.6085516 1.526174e+01
#> 10 0.0657008 1.61233073 0.1113936 7.318652e-03
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) { # \dontrun{
mets::fast.reshape(sim(m,100),varying="t")
} # }
##################################################
### General multivariate distributions
##################################################
if (FALSE) { # \dontrun{
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)
m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
} # }
m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#> y1 y2 y3
#> y1 1.7879796 0.8986911 0.7334187
#> y2 0.8986911 1.9711804 0.7104499
#> y3 0.7334187 0.7104499 1.7411134
## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#> y x z
#> 1 1002.5079 1.9003038 1.3207004
#> 2 1001.5635 -0.6600688 0.6022015
#> 3 999.8989 0.0284914 0.5194575
#> 4 997.8830 -1.0465310 -0.6467731
#> 5 1000.2193 0.5358772 0.3953164
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#> y x z
#> 1 1.4379189 -0.5940062 0.2075005
#> 2 0.2787988 0.1730358 -0.2492215
#> 3 0.1981616 0.7085807 -0.5910783
#> 4 -1.2663856 -1.4670154 0.2479143
#> 5 -1.8845258 -1.2254667 1.0021884
sim(m,5,p=c(m1=100))
#> y x z
#> 1 101.61422 1.2893309 0.1184869
#> 2 97.25223 -1.5376382 -1.2306975
#> 3 100.27654 -0.8698709 0.9848374
#> 4 102.17923 1.1440416 0.3391601
#> 5 97.62066 -1.0514781 -0.4981777
##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))
## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)
if (interactive()) {
d <- sim(m,1e3)
require(survival)
(cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}
##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"
regression(m,additive=FALSE) <- y~v
if (FALSE) { # \dontrun{
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
} # }
m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#>
#> A B C
#> 5008 3013 1979
glm(y~v, data=sim(m,1e4))
#>
#> Call: glm(formula = y ~ v, data = sim(m, 10000))
#>
#> Coefficients:
#> (Intercept) vB vC
#> 0.02614 1.99230 -1.00798
#>
#> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
#> Null Deviance: 22520
#> Residual Deviance: 9971 AIC: 28360
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#>
#> Call: glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#>
#> Coefficients:
#> (Intercept) vB vC
#> 0.00163 2.98778 -1.00968
#>
#> Degrees of Freedom: 9999 Total (i.e. Null); 9997 Residual
#> Null Deviance: 34460
#> Residual Deviance: 10310 AIC: 28690
transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#> v y z v2
#> 1 A -0.2367056 -0.6076527 TRUE
#> 2 B 1.8585600 3.3210809 FALSE
#> 3 A 0.9679083 -2.1797290 TRUE
#> 4 B 2.8827087 1.8835530 FALSE
#> 5 A -1.3114255 1.6318984 TRUE
#> 6 C -1.2983199 -0.4557392 FALSE
#> 7 B 0.1109659 5.3286566 FALSE
#> 8 B 2.9988011 4.7953034 FALSE
#> 9 A -0.4407522 0.6218654 TRUE
#> 10 A -0.1266534 0.5531538 TRUE
##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#> y x
#> 1 0.81233476 -0.97131096
#> 2 -0.64987060 0.20623939
#> 3 -2.01949824 -0.53276023
#> 4 4.48232406 2.44643312
#> 5 2.17489768 -0.08444089
#> 6 -1.39811580 -0.81595413
#> 7 2.19349947 0.96794188
#> 8 -0.03079853 0.05816394
#> 9 1.07318999 1.10372137
#> 10 -2.22883211 -1.61141878
sim(m2,10) ## Faster
#> y x
#> 1 -3.2338549 -1.7731064
#> 2 0.3875014 0.1975684
#> 3 3.2081255 1.5528790
#> 4 -2.0997483 -0.3177804
#> 5 2.2187278 0.6917520
#> 6 1.1863361 0.4877073
#> 7 2.5965386 1.4051690
#> 8 1.4001423 0.6924867
#> 9 -0.4259144 0.6072422
#> 10 -1.5955365 -1.6338646