# To install a package, run the following ONCE (and only once on your computer)
# install.packages("psych")
library(here)  # makes reading data more consistent
library(tidyverse)  # for data manipulation and plotting
library(haven)  # for importing SPSS/SAS/Stata data
library(lme4)  # for multilevel analysis
library(glmmTMB)  # for frequentist longitudinal MLM
library(brms)  # for Bayesian longitudinal MLM
library(sjPlot)  # for plotting
library(modelsummary)  # for making tables
# Read in the data from GitHub (need Internet access)
curran_wide <- read_sav("https://github.com/MultiLevelAnalysis/Datasets-third-edition-Multilevel-book/raw/master/chapter%205/Curran/CurranData.sav")
# Make id a factor
curran_wide  # print the data
# Using the new `tidyr::pivot_longer()` function
curran_long <- curran_wide %>%
    pivot_longer(
        c(anti1:anti4, read1:read4),  # variables that are repeated measures
        # Convert 8 columns to 3: 2 columns each for anti/read (.value), and
        # one column for time
        names_to = c(".value", "time"),
        # Extract the names "anti"/"read" from the names of the variables for
        # the value columns, and then the number to the "time" column
        names_pattern = "(anti|read)([1-4])",
        # Convert the "time" column to integers
        names_transform = list(time = as.integer)
    ) %>%
    mutate(time0 = time - 1)
anova(m_pw_tmb, m_pw_ar1_tmb) # Not statistically significant
Intensive Longitudinal Data
The data come from the first wave of the Cognition, Health, and Aging Project.
# Download the data from
# https://www.pilesofvariance.com/Chapter8/SPSS/SPSS_Chapter8.zip, and put it
# into the "data_files" folder
zip_data <- here("data_files", "SPSS_Chapter8.zip")
if (!file.exists(zip_data)) {
    download.file(
        "https://www.pilesofvariance.com/Chapter8/SPSS/SPSS_Chapter8.zip",
        zip_data
    )
}
stress_data <- read_sav(
    unz(zip_data, "SPSS_Chapter8/SPSS_Chapter8.sav")
)
stress_data
The data are already in long format.
Data Exploration
# First, separate the time-varying variables into within-person and
# between-person levels
stress_data <- stress_data %>%
    # Center mood (originally 1-5) at 1 for interpretation (so it becomes 0-4)
    # Also recode women to factor
    mutate(
        mood1 = mood - 1,
        women = factor(women, levels = c(0, 1), labels = c("men", "women"))
    ) %>%
    group_by(PersonID) %>%
    # The `across()` function applies the same procedure on
    # multiple variables
    mutate(across(
        c(symptoms, mood1, stressor),
        # The `.` means the variable to be operated on
        list(
            "pm" = ~ mean(., na.rm = TRUE),
            "pmc" = ~ . - mean(., na.rm = TRUE)
        )
    )) %>%
    ungroup()
stress_data
# Use `datasummary_skim` for a quick summary
datasummary_skim(
    stress_data %>%
        select(
            -PersonID, -session, -studyday, -dayofweek, -weekend, -mood,
            # PMC for a binary variable is not very meaningful
            -stressor_pmc
        )
)
# Plotting mood
p1 <- ggplot(stress_data, aes(x = studyday, y = mood1)) +
    # add lines to connect the data for each person
    geom_line(aes(color = factor(PersonID), group = PersonID)) +
    # add a mean trajectory
    stat_summary(fun = "mean", col = "red", linewidth = 1, geom = "line") +
    # suppress legend
    guides(color = "none")
# Plotting symptoms
p2 <- ggplot(stress_data, aes(x = studyday, y = symptoms)) +
    # add lines to connect the data for each person
    geom_line(aes(color = factor(PersonID), group = PersonID)) +
    # add a mean trajectory
    stat_summary(fun = "mean", col = "red", linewidth = 1, geom = "line") +
    # suppress legend
    guides(color = "none")
gridExtra::grid.arrange(p1, p2, ncol = 2)
We can see that there is no clear trend across time, only fluctuations.
We can also plot the trajectories for a random sample of individuals:
stress_data %>%
    # randomly sample 40 individuals
    filter(PersonID %in% sample(unique(PersonID), 40)) %>%
    ggplot(aes(x = studyday, y = symptoms)) +
    geom_point(size = 0.5) +
    # add lines to connect the data for each person
    geom_line() +
    facet_wrap(~ PersonID, ncol = 10)
# Use glmer(..., family = binomial) for a binary outcome; to be discussed in a
# later week
m0_stre <- glmer(stressor ~ (1 | PersonID), data = stress_data,
                 family = binomial)
performance::icc(m0_stre)
One thing to consider with longitudinal data collected over a short time span is whether there may be trends in the data due to contextual factors (e.g., a weekend effect, or life events shared at a specific time). If so, a common practice is to de-trend the data; otherwise, associations between predictors and the outcome may merely reflect contextual factors happening on certain days.
As shown in the graphs, there is no strong reason to think the data contain trends. We can further test for a linear trend:
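The code for this check is not shown here, so the following is only a sketch (the model name `m_trend` and the choice of `symptoms` as the outcome are ours): include `studyday` as a fixed effect with a random intercept, and see whether its slope is close to zero.

# Sketch: test a linear trend in symptoms across study days
# (random intercept only); a slope close to zero suggests there is
# no systematic trend that needs to be removed
m_trend <- lmer(symptoms ~ studyday + (1 | PersonID), data = stress_data)
summary(m_trend)
confint(m_trend, method = "Wald")  # Wald confidence intervals for fixed effects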
GEE is an alternative estimation method for clustered data, as discussed in Section 12.2 of Snijders and Bosker. It is a popular method, especially for longitudinal data in some research areas, as it offers some robustness against misspecification of the random-effect structure (e.g., autoregressive errors, missing random slopes). It accounts for the covariance structure using a working correlation matrix, such as the exchangeable (aka compound symmetry) structure. On the other hand, it only estimates fixed effects (also called marginal effects), so we cannot know whether some slopes differ across persons, only what the average slopes are. GEE estimates the fixed effects by iteratively calculating the working correlation matrix and updating the fixed-effect estimates with (weighted) least squares. At the end, it computes the standard errors using the cluster-robust sandwich estimator.
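To make the exchangeable structure concrete, here is a purely illustrative example of the working correlation matrix for three repeated measures, with an arbitrary common correlation of .3:

# Exchangeable (compound symmetry) working correlation for 3 time points;
# the value 0.3 is arbitrary and used only for illustration
alpha <- 0.3
diag(3) * (1 - alpha) + alpha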
GEE tends to require a relatively large sample size, and is usually less efficient than MLM for the same model (i.e., GEE gives larger standard errors, wider confidence intervals, and thus has less statistical power), which can be seen in the example below.
Below is a quick demonstration of GEE using the geepack package in R.
stress_data_lw <- stress_data %>%
    drop_na(mood1_pm, mood1_pmc, women, stressor_pm, stressor) %>%
    # Also need to convert to a data frame (instead of tibble)
    as.data.frame()
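The `geeglm()` call itself does not appear above, so here is a sketch of what it could look like (the model name `m2_gee` is ours; the formula mirrors the one used with `GEE.var.md()` below), using an exchangeable working correlation with persons as clusters:

library(geepack)
# Fit the model with GEE; `id` defines the clusters and `corstr` specifies
# the working correlation structure
m2_gee <- geeglm(
    symptoms ~ (mood1_pm + mood1_pmc) * women + stressor_pm + stressor,
    data = stress_data_lw,
    id = PersonID,
    corstr = "exchangeable"
)
summary(m2_gee)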
# Small sample correction (Mancl & DeRouen, 2001) for the robust standard errors
library(geesmv)
m2_vcov_md <- GEE.var.md(
    symptoms ~ (mood1_pm + mood1_pmc) * women + stressor_pm + stressor,
    data = stress_data_lw,
    id = "PersonID",  # needs to be a string
    corstr = "exchangeable"
)
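To see what the correction does, one can take square roots of the returned variance estimates and compare them with the uncorrected robust standard errors from `summary(m2_gee)` above. This assumes, as described in the geesmv documentation, that the `cov.beta` component of the returned list holds the coefficient variances:

# Mancl-DeRouen corrected standard errors (square roots of the variance
# estimates returned by GEE.var.md)
sqrt(m2_vcov_md$cov.beta)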
As shown above, OLS regression, which ignores the temporal dependence in the data, gives standard errors for the between-level coefficients that are too small. The results from MLM and GEE are comparable, although GEE only gives fixed-effect estimates. Overall, GEE tends to have larger standard errors, especially when using the small-sample correction. In sum, the pros and cons of GEE are:
Pros:
Does not require specification of the random part
Robust to a moderate degree of misspecification in the random-effect covariance structure
Likely valid inferences on the fixed effects
Cons:
Loss of statistical power (which may not be a concern in very large samples)
No results and predictions for individual-specific models, as the random effect part is not modeled
No likelihood-based statistical tools (e.g., AIC/BIC)