Dissertation-R-code/LMM _code.R at main · danielhp777/Dissertation-R-code · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
library(nlme)
library(dplyr)
library(ggplot2)

# load and process data
data <- read.table("Data2.txt",header=TRUE)

# separating data (6 measurements per subject for most subjects, m=481)
time <- c(0,2,4,8,16,24)
calwk <- data$calwk
logrna <- as.numeric(data$logrna)
nnrti <- factor(data$nnrti)
txday <- data$txday
cens <- factor(data$cens)
trtarm <- factor(data$trtarm)
patid <- data$patid

hiv <- data.frame(patid, nnrti, calwk, txday, logrna, cens, trtarm)

set.seed(158)
patients <- seq(1:481)
test_ids <- sample(patients, size = 96, replace = FALSE) # choosing the patient ID's for test data
test_ids <- sort(test_ids)
train_ids <- setdiff(patients, test_ids)

hiv_train <- hiv %>% filter(patid %in% train_ids)
hiv_test <- hiv %>% filter(patid %in% test_ids)

# Linear Mixed-Effect Models: choosing the best LMM to use (random slope models)
# independent within subject correlation structure
lme1 <- lme(logrna ~ nnrti + calwk + txday + cens + trtarm,
            data = hiv_train, random = ~ calwk | patid, method = 'ML')
summary(lme1)

# compound symmetry structure for within subject correlation
lme2 <- update(lme1, correlation = corCompSymm(form = ~ 1 | patid))
summary(lme2)

# fewer fixed effects, random slope with independent within subject correlation
lme3 <- lme(logrna ~ trtarm * calwk, data = hiv_train,
            random = ~calwk | patid, method = 'ML')
summary(lme3)

lme4 <- update(lme3, correlation = corCompSymm(form = ~ 1 | patid))
summary(lme4)

anova(lme1, lme2, lme3, lme4) # we choose lme1 based on AIC/BIC

# improve parameter estimates by using REML instead of ML
lme5 <- update(lme1, method = 'REML')
summary(lme5) # shows the fitted parameters for the chosen LMM

# predictions
lme5_pred <- predict(lme5, newdata = hiv_test, level = 0) # level asserts that we only use fixed effects in prediction, since expectation of random components is 0

# plot predictions vs actual responses
D5 <- data.frame(real_values = hiv_test$logrna, Predictions = lme5_pred)

ggplot(D5, aes(x = Predictions, y = real_values)) +
  geom_point(alpha = 0.6, aes(color = 'Data points')) +
  geom_smooth(method = "lm", se = FALSE, aes(color = 'Regression line')) + # best fit line
  geom_line(linewidth = 1, data = data.frame(x = range(D5$Predictions), y = range(D5$Predictions)), aes(x=x, y=y, color = 'Identity line')) +
  scale_color_manual(name = 'Legend', values = c('Data points'= 'deeppink3', 'Regression line' = 'black', 'Identity line' = 'green')) +
  labs(
    x = "Predicted viral load", y = "Actual viral load",
    title =
      "Predicted vs Actual viral loads for the testing data (LMM)") +
  theme_minimal()

# mse
mse_lmm <- mean((lme5_pred - hiv_test$logrna)^2)
mse_lmm

# mae
mae_lmm <- mean(abs(lme5_pred - hiv_test$logrna))
mae_lmm

# correlation between predicted and actual responses
cor(hiv_test$logrna, lme5_pred)