Simulate model — sim • lava

Simulate data from a general SEM model including non-linear effects and general link and distribution of variables.

# S3 method for class 'lvm'
sim(x, n = NULL, p = NULL, normal = FALSE, cond = FALSE,
sigma = 1, rho = 0.5, X = NULL, unlink=FALSE, latent=TRUE,
use.labels = TRUE, seed=NULL, ...)

Arguments

x: Model object
...: Additional arguments to be passed to the low level functions
n: Number of simulated values/individuals
p: Parameter value (optional)
normal: Logical indicating whether to simulate data from a multivariate normal distribution conditional on exogenous variables hence ignoring functional/distribution definition
cond: for internal use
sigma: Default residual variance (1)
rho: Default covariance parameter (0.5)
X: Optional matrix of fixed values of variables (manipulation)
unlink: Return Inverse link transformed data
latent: Include latent variables (default TRUE)
use.labels: convert categorical variables to factors before applying transformation
seed: Random seed

Author

Klaus K. Holst

Examples

##################################################
## Logistic regression
##################################################
m <- lvm(y~x+z)
regression(m) <- x~z
distribution(m,~y+z) <- binomial.lvm("logit")
d <- sim(m,1e3)
head(d)
#>   y          x z
#> 1 1  2.5577940 1
#> 2 1  0.6289062 1
#> 3 0 -2.5114518 0
#> 4 0 -0.6517820 0
#> 5 1  1.9549094 1
#> 6 0  0.3479851 0

e <- estimate(m,d,estimator="glm")
e
#>              Estimate Std. Error  Z-value  P-value
#> Regressions:                                      
#>    y~x        0.98403    0.08625 11.40896   <1e-12
#>    y~z        1.16438    0.16804  6.92926 4.23e-12
#>     x~z       1.00992    0.06547 15.42639   <1e-12
#> Intercepts:                                       
#>    y         -0.16286    0.09836 -1.65584  0.09775
#>    x         -0.00032    0.04669 -0.00675   0.9946
#> Dispersion:                                       
#>    x          1.07372                             
## Simulate a few observation from estimated model
sim(e,n=5)
#>   y          x z
#> 1 1  1.4902578 1
#> 2 0 -1.4551789 1
#> 3 1  1.8096418 1
#> 4 0 -1.4846894 0
#> 5 1  0.9795036 1

##################################################
## Poisson
##################################################
distribution(m,~y) <- poisson.lvm()
d <- sim(m,1e4,p=c(y=-1,"y~x"=2,z=1))
head(d)
#>     y          x z
#> 1  14  1.0060825 1
#> 2   4  0.6183958 1
#> 3   7  0.4521507 1
#> 4   2  0.2320851 1
#> 5 121  2.4243668 1
#> 6   0 -1.2190155 1
estimate(m,d,estimator="glm")
#>                Estimate Std. Error    Z-value  P-value
#> Regressions:                                          
#>    y~x          1.99872    0.00200 1001.16217   <1e-12
#>    y~z          0.98636    0.01193   82.70802   <1e-12
#>     x~z         1.01006    0.02265   44.59182   <1e-12
#> Intercepts:                                           
#>    y           -0.98042    0.01237  -79.23054   <1e-12
#>    x           -0.00907    0.01940   -0.46750   0.6401
#> Dispersion:                                           
#>    x            1.00426                               
mean(d$z); lava:::expit(1)
#> [1] 0.7256
#> [1] 0.7310586

summary(lm(y~x,sim(lvm(y[1:2]~4*x),1e3)))
#> 
#> Call:
#> lm(formula = y ~ x, data = sim(lvm(y[1:2] ~ 4 * x), 1000))
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -5.2252 -0.9823  0.0325  0.9402  5.4313 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  0.98956    0.04439   22.29   <2e-16 ***
#> x            4.03106    0.04417   91.25   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.402 on 998 degrees of freedom
#> Multiple R-squared:  0.893,	Adjusted R-squared:  0.8929 
#> F-statistic:  8327 on 1 and 998 DF,  p-value: < 2.2e-16
#> 

##################################################
### Gamma distribution
##################################################
m <- lvm(y~x)
distribution(m,~y+x) <- list(Gamma.lvm(shape=2),binomial.lvm())
intercept(m,~y) <- 0.5
d <- sim(m,1e4)
summary(g <- glm(y~x,family=Gamma(),data=d))
#> 
#> Call:
#> glm(formula = y ~ x, family = Gamma(), data = d)
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) 0.499081   0.005034   99.14   <2e-16 ***
#> x           1.036826   0.016498   62.85   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> (Dispersion parameter for Gamma family taken to be 0.5158464)
#> 
#>     Null deviance: 8541.5  on 9999  degrees of freedom
#> Residual deviance: 5551.2  on 9998  degrees of freedom
#> AIC: 20680
#> 
#> Number of Fisher Scoring iterations: 6
#> 
if (FALSE) MASS::gamma.shape(g) # \dontrun{}

args(lava::Gamma.lvm)
#> function (link = "inverse", shape, rate, unit = FALSE, var = FALSE, 
#>     log = FALSE, ...) 
#> NULL
distribution(m,~y) <- Gamma.lvm(shape=2,log=TRUE)
sim(m,10,p=c(y=0.5))[,"y"]
#>  [1] -0.06146023 -0.15297105 -1.55859772 -2.33899184 -0.13261446 -0.22364401
#>  [7] -0.13945111  0.31993208  0.19293624  0.17868933

##################################################
### Beta
##################################################
m <- lvm()
distribution(m,~y) <- beta.lvm(alpha=2,beta=1)
var(sim(m,100,"y,y"=2))
#>          y
#> y 1.187236
distribution(m,~y) <- beta.lvm(alpha=2,beta=1,scale=FALSE)
var(sim(m,100))
#>            y
#> y 0.05303781

##################################################
### Transform
##################################################
m <- lvm()
transform(m,xz~x+z) <- function(x) x[1]*(x[2]>0)
regression(m) <- y~x+z+xz
d <- sim(m,1e3)
summary(lm(y~x+z + x*I(z>0),d))
#> 
#> Call:
#> lm(formula = y ~ x + z + x * I(z > 0), data = d)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -2.9770 -0.6711 -0.0585  0.7070  3.6556 
#> 
#> Coefficients:
#>                Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)    0.007315   0.061044   0.120    0.905    
#> x              0.972089   0.047008  20.679   <2e-16 ***
#> z              0.995315   0.054426  18.287   <2e-16 ***
#> I(z > 0)TRUE   0.025211   0.105441   0.239    0.811    
#> x:I(z > 0)TRUE 1.021637   0.065233  15.661   <2e-16 ***
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 1.014 on 995 degrees of freedom
#> Multiple R-squared:  0.7703,	Adjusted R-squared:  0.7694 
#> F-statistic: 834.4 on 4 and 995 DF,  p-value: < 2.2e-16
#> 

##################################################
### Non-random variables
##################################################
m <- lvm()
distribution(m,~x+z+v+w) <- list(Sequence.lvm(0,5),## Seq. 0 to 5 by 1/n
                               Binary.lvm(),       ## Vector of ones
                               Binary.lvm(0.5),    ##  0.5n 0, 0.5n 1
                               Binary.lvm(interval=list(c(0.3,0.5),c(0.8,1))))
sim(m,10)
#>            x z v w
#> 1  0.0000000 1 0 0
#> 2  0.5555556 1 0 0
#> 3  1.1111111 1 0 1
#> 4  1.6666667 1 0 1
#> 5  2.2222222 1 0 1
#> 6  2.7777778 1 1 0
#> 7  3.3333333 1 1 0
#> 8  3.8888889 1 1 1
#> 9  4.4444444 1 1 1
#> 10 5.0000000 1 1 1

##################################################
### Cox model
### piecewise constant hazard
################################################
m <- lvm(t~x)
rates <- c(1,0.5); cuts <- c(0,5)
## Constant rate: 1 in [0,5), 0.5 in [5,Inf)
distribution(m,~t) <- coxExponential.lvm(rate=rates,timecut=cuts)

if (FALSE) { # \dontrun{
    d <- sim(m,2e4,p=c("t~x"=0.1)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(t,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$t)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$t))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
} # }

##################################################
### Cox model
### piecewise constant hazard, gamma frailty
##################################################
m <- lvm(y~x+z)
rates <- c(0.3,0.5); cuts <- c(0,5)
distribution(m,~y+z) <- list(coxExponential.lvm(rate=rates,timecut=cuts),
                             loggamma.lvm(rate=1,shape=1))
if (FALSE) { # \dontrun{
    d <- sim(m,2e4,p=c("y~x"=0,"y~z"=0)); d$status <- TRUE
    plot(timereg::aalen(survival::Surv(y,status)~x,data=d,
                        resample.iid=0,robust=0),spec=1)
    L <- approxfun(c(cuts,max(d$y)),f=1,
                   cumsum(c(0,rates*diff(c(cuts,max(d$y))))),
                   method="linear")
    curve(L,0,100,add=TRUE,col="blue")
} # }
## Equivalent via transform (here with Aalens additive hazard model)
m <- lvm(y~x)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- Gamma.lvm(rate=1,shape=1)
transform(m,t~y+z) <- prod
sim(m,10)
#>              y           x          z             t
#> 1    0.4641322  0.90979684  0.9450201  4.386143e-01
#> 2    0.5780113  1.11989582  0.3499724  2.022880e-01
#> 3    0.1906364  0.01722635  0.7479131  1.425794e-01
#> 4    1.0765679 -0.04989872  0.2386936  2.569699e-01
#> 5    1.6427577  0.54597959  0.8221244  1.350551e+00
#> 6  -41.9514857 -0.31551318  4.8776866 -2.046262e+02
#> 7    1.2838462  1.25150386  2.0931514  2.687285e+00
#> 8    1.6555105  0.40068497  0.7282096  1.205559e+00
#> 9    1.4386263  0.07996742 10.6085516  1.526174e+01
#> 10   0.0657008  1.61233073  0.1113936  7.318652e-03
## Shared frailty
m <- lvm(c(t1,t2)~x+z)
rates <- c(1,0.5); cuts <- c(0,5)
distribution(m,~y) <- aalenExponential.lvm(rate=rates,timecut=cuts)
distribution(m,~z) <- loggamma.lvm(rate=1,shape=1)
if (FALSE) { # \dontrun{
mets::fast.reshape(sim(m,100),varying="t")
} # }

##################################################
### General multivariate distributions
##################################################
if (FALSE) { # \dontrun{
m <- lvm()
distribution(m,~y1+y2,oratio=4) <- VGAM::rbiplackcop
ksmooth2(sim(m,1e4),rgl=FALSE,theta=-20,phi=25)

m <- lvm()
distribution(m,~z1+z2,"or1") <- VGAM::rbiplackcop
distribution(m,~y1+y2,"or2") <- VGAM::rbiplackcop
sim(m,10,p=c(or1=0.1,or2=4))
} # }

m <- lvm()
distribution(m,~y1+y2+y3,TRUE) <- function(n,...) rmvn0(n,sigma=diag(3)+1)
var(sim(m,100))
#>           y1        y2        y3
#> y1 1.7879796 0.8986911 0.7334187
#> y2 0.8986911 1.9711804 0.7104499
#> y3 0.7334187 0.7104499 1.7411134

## Syntax also useful for univariate generators, e.g.
m <- lvm(y~x+z)
distribution(m,~y,TRUE) <- function(n) rnorm(n,mean=1000)
sim(m,5)
#>           y          x          z
#> 1 1002.5079  1.9003038  1.3207004
#> 2 1001.5635 -0.6600688  0.6022015
#> 3  999.8989  0.0284914  0.5194575
#> 4  997.8830 -1.0465310 -0.6467731
#> 5 1000.2193  0.5358772  0.3953164
distribution(m,~y,"m1",0) <- rnorm
sim(m,5)
#>            y          x          z
#> 1  1.4379189 -0.5940062  0.2075005
#> 2  0.2787988  0.1730358 -0.2492215
#> 3  0.1981616  0.7085807 -0.5910783
#> 4 -1.2663856 -1.4670154  0.2479143
#> 5 -1.8845258 -1.2254667  1.0021884
sim(m,5,p=c(m1=100))
#>           y          x          z
#> 1 101.61422  1.2893309  0.1184869
#> 2  97.25223 -1.5376382 -1.2306975
#> 3 100.27654 -0.8698709  0.9848374
#> 4 102.17923  1.1440416  0.3391601
#> 5  97.62066 -1.0514781 -0.4981777

##################################################
### Regression design in other parameters
##################################################
## Variance heterogeneity
m <- lvm(y~x)
distribution(m,~y) <- function(n,mean,x) rnorm(n,mean,exp(x)^.5)
if (interactive()) plot(y~x,sim(m,1e3))
## Alternaively, calculate the standard error directly
addvar(m) <- ~sd ## If 'sd' should be part of the resulting data.frame
constrain(m,sd~x) <- function(x) exp(x)^.5
distribution(m,~y) <- function(n,mean,sd) rnorm(n,mean,sd)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on variance parameter
m <- lvm()
regression(m) <- y~x
regression(m) <- v~x
##distribution(m,~v) <- 0 # No stochastic term
## Alternative:
## regression(m) <- v[NA:0]~x
distribution(m,~y) <- function(n,mean,v) rnorm(n,mean,exp(v)^.5)
if (interactive()) plot(y~x,sim(m,1e3))

## Regression on shape parameter in Weibull model
m <- lvm()
regression(m) <- y ~ z+v
regression(m) <- s ~ exp(0.6*x-0.5*z)
distribution(m,~x+z) <- binomial.lvm()
distribution(m,~cens) <- coxWeibull.lvm(scale=1)
distribution(m,~y) <- coxWeibull.lvm(scale=0.1,shape=~s)
eventTime(m) <- time ~ min(y=1,cens=0)

if (interactive()) {
    d <- sim(m,1e3)
    require(survival)
    (cc <- coxph(Surv(time,status)~v+strata(x,z),data=d))
    plot(survfit(cc) ,col=1:4,mark.time=FALSE)
}

##################################################
### Categorical predictor
##################################################
m <- lvm()
## categorical(m,K=3) <- "v"
categorical(m,labels=c("A","B","C")) <- "v"

regression(m,additive=FALSE) <- y~v
if (FALSE) { # \dontrun{
plot(y~v,sim(m,1000,p=c("y~v:2"=3)))
} # }

m <- lvm()
categorical(m,labels=c("A","B","C"),p=c(0.5,0.3)) <- "v"
regression(m,additive=FALSE,beta=c(0,2,-1)) <- y~v
## equivalent to:
## regression(m,y~v,additive=FALSE) <- c(0,2,-1)
regression(m,additive=FALSE,beta=c(0,4,-1)) <- z~v
table(sim(m,1e4)$v)
#> 
#>    A    B    C 
#> 5008 3013 1979 
glm(y~v, data=sim(m,1e4))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>     0.02614      1.99230     -1.00798  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    22520 
#> Residual Deviance: 9971 	AIC: 28360
glm(y~v, data=sim(m,1e4,p=c("y~v:1"=3)))
#> 
#> Call:  glm(formula = y ~ v, data = sim(m, 10000, p = c(`y~v:1` = 3)))
#> 
#> Coefficients:
#> (Intercept)           vB           vC  
#>     0.00163      2.98778     -1.00968  
#> 
#> Degrees of Freedom: 9999 Total (i.e. Null);  9997 Residual
#> Null Deviance:	    34460 
#> Residual Deviance: 10310 	AIC: 28690

transform(m,v2~v) <- function(x) x=='A'
sim(m,10)
#>    v          y          z    v2
#> 1  A -0.2367056 -0.6076527  TRUE
#> 2  B  1.8585600  3.3210809 FALSE
#> 3  A  0.9679083 -2.1797290  TRUE
#> 4  B  2.8827087  1.8835530 FALSE
#> 5  A -1.3114255  1.6318984  TRUE
#> 6  C -1.2983199 -0.4557392 FALSE
#> 7  B  0.1109659  5.3286566 FALSE
#> 8  B  2.9988011  4.7953034 FALSE
#> 9  A -0.4407522  0.6218654  TRUE
#> 10 A -0.1266534  0.5531538  TRUE

##################################################
### Pre-calculate object
##################################################
m <- lvm(y~x)
m2 <- sim(m,'y~x'=2)
sim(m,10,'y~x'=2)
#>              y           x
#> 1   0.81233476 -0.97131096
#> 2  -0.64987060  0.20623939
#> 3  -2.01949824 -0.53276023
#> 4   4.48232406  2.44643312
#> 5   2.17489768 -0.08444089
#> 6  -1.39811580 -0.81595413
#> 7   2.19349947  0.96794188
#> 8  -0.03079853  0.05816394
#> 9   1.07318999  1.10372137
#> 10 -2.22883211 -1.61141878
sim(m2,10) ## Faster
#>             y          x
#> 1  -3.2338549 -1.7731064
#> 2   0.3875014  0.1975684
#> 3   3.2081255  1.5528790
#> 4  -2.0997483 -0.3177804
#> 5   2.2187278  0.6917520
#> 6   1.1863361  0.4877073
#> 7   2.5965386  1.4051690
#> 8   1.4001423  0.6924867
#> 9  -0.4259144  0.6072422
#> 10 -1.5955365 -1.6338646