Please download the following files:
We need ‘rms’, ‘ggplot2’, and ‘car’ packages If you haven’t already installed these:
Load the packages
library(rms)
## Loading required package: Hmisc
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: splines
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(ggplot2)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:rms':
##
## vif
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The data should be merged and ready to go from day 1. If not, here’s the code for it:
data1 <- read.csv("~/Downloads/messy_demographic.csv") # put in the location of the downloaded data file 1
data2 <- read.csv("~/Downloads/messy_cognitive.csv") # put in the location of the downloaded data file 2
data3 <- read.csv("~/Downloads/messy_genotype.csv")
data1[data1==""] <- NA
data1[data1=="missing"] <- NA
data1[data1=="9999"] <- NA
data1$age <- as.numeric(as.character(data1$age))
data1$ethnicity <- factor(data1$ethnicity,levels=c("Cauc","AA","As","In","Other"))
data1$sex <- factor(data1$sex, levels=c(0,1), labels=c("Male","Female"))
data1$dx <- factor(data1$dx, levels=c(0,1), labels=c("Control","Case"))
data2[data2==""] <- NA
data2[data2=="missing"] <- NA
data2[data2=="9999"] <- NA
data2$cog1 <- as.numeric(as.character(data2$cog1))
data2$cog2 <- as.numeric(as.character(data2$cog2))
data2$cog3 <- as.numeric(as.character(data2$cog3))
data2$subID <- gsub(data2$subID,pattern="subject",replacement="SUB_")
data3[data3==""] <- NA
data3[data3=="missing"] <- NA
data3[data3=="9999"] <- NA
data3$genotype <- factor(data3$genotype, levels=c(0,1,2), labels=c("AA","AG","GG"))
data3$subID <- gsub(data3$subID,pattern="subject",replacement="SUB_")
alldata <- merge(data1,data2,by.x="subject_ID",by.y="subID")
alldata <- merge(alldata,data3,by.x="subject_ID",by.y="subID")
Calculate a composite variable by combining multiple variables Note: new variables can be made easily (using dplyr’s mutate verb)
alldata$totalcog <- (alldata$cog1 + alldata$cog3) / alldata$cog2
Simple linear regression (two ways: base package and rms)
lm.base <- lm(data=alldata, totalcog ~ age)
lm.rms <- ols(data=alldata, totalcog ~ age)
Let’s compare the output’s
lm.base
##
## Call:
## lm(formula = totalcog ~ age, data = alldata)
##
## Coefficients:
## (Intercept) age
## 1.78587 -0.01219
summary(lm.base)
##
## Call:
## lm(formula = totalcog ~ age, data = alldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0529 -0.5054 -0.1940 0.4352 2.3720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.785875 0.172793 10.335 < 2e-16 ***
## age -0.012192 0.003346 -3.644 0.000313 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6966 on 322 degrees of freedom
## (26 observations deleted due to missingness)
## Multiple R-squared: 0.03961, Adjusted R-squared: 0.03663
## F-statistic: 13.28 on 1 and 322 DF, p-value: 0.0003125
anova(lm.base)
## Analysis of Variance Table
##
## Response: totalcog
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 6.445 6.4446 13.28 0.0003125 ***
## Residuals 322 156.259 0.4853
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Note: to make the most out of rms package functionality, we need to store summary stats using the datadist() function. That way, when we call summary() on an ols() object (we just made one called “lm.rms”) it will give us useful info.
dd.alldata <- datadist(alldata)
options(datadist="dd.alldata")
lm.rms
##
## Linear Regression Model
##
## ols(formula = totalcog ~ age, data = alldata)
##
## Frequencies of Missing Values Due to Each Variable
## totalcog age
## 18 8
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 324 LR chi2 13.09 R2 0.040
## sigma 0.6966 d.f. 1 R2 adj 0.037
## d.f. 322 Pr(> chi2) 0.0003 g 0.160
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.0529 -0.5054 -0.1940 0.4352 2.3720
##
## Coef S.E. t Pr(>|t|)
## Intercept 1.7859 0.1728 10.34 <0.0001
## age -0.0122 0.0033 -3.64 0.0003
summary(lm.rms)
## Effects Response : totalcog
##
## Factor Low High Diff. Effect S.E. Lower 0.95 Upper 0.95
## age 43 58 15 -0.18 0.05 -0.28 -0.08
anova(lm.rms)
## Analysis of Variance Response: totalcog
##
## Factor d.f. Partial SS MS F P
## age 1 6.444638 6.4446382 13.28 3e-04
## REGRESSION 1 6.444638 6.4446382 13.28 3e-04
## ERROR 322 156.258677 0.4852754
ggplot(data=alldata, aes(y=totalcog, x=age)) +
geom_point() +
geom_smooth(method=lm)
## Warning: Removed 26 rows containing missing values (stat_smooth).
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
Visualize predicted results using rms
plot(Predict(lm.rms))
Check regression assumption of normal residuals
hist(resid(lm.rms))
They are not normal! We can look at this formally also:
shapiro.test(resid(lm.rms))
##
## Shapiro-Wilk normality test
##
## data: resid(lm.rms)
## W = 0.9158, p-value = 1.677e-12
ggplot(data=alldata, aes(x=totalcog)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Let’s try transforming our variable using log and sqrt transformations to see if it helps:
alldata$totalcog_log <- log(alldata$totalcog)
alldata$totalcog_sqrt <- sqrt(alldata$totalcog)
The powerTransform function will calculate the best transform for your data. It saves the best exponent as the value “lambda”
# calculate the best exponent using powerTransform:
pT <- powerTransform(alldata$totalcog)
# apply the power transform and save the result to a new variable
alldata$totalcog_pT <- alldata$totalcog^pT$lambda ## note ^ is exponent in r
Note: if we want to use summary() on our ols() object we will have to redefine datadist because we created new variables that were not in the original datadist object
dd.alldata <- datadist(alldata)
options(datadist="dd.alldata")
This is were we start to add covariates and do multiple regression
lm3 <- ols(data=alldata, totalcog_pT ~ age + genotype + sex)
anova(lm3)
## Analysis of Variance Response: totalcog_pT
##
## Factor d.f. Partial SS MS F P
## age 1 8.449950e-04 8.449950e-04 14.68 0.0002
## genotype 2 6.737907e-05 3.368954e-05 0.59 0.5576
## sex 1 2.106451e-04 2.106451e-04 3.66 0.0567
## REGRESSION 4 1.115625e-03 2.789063e-04 4.85 0.0008
## ERROR 309 1.778648e-02 5.756143e-05
summary(lm3)
## Effects Response : totalcog_pT
##
## Factor Low High Diff. Effect S.E. Lower 0.95 Upper 0.95
## age 43 58 15 0 0 0 0
## genotype - AA:AG 2 1 NA 0 0 0 0
## genotype - GG:AG 2 3 NA 0 0 0 0
## sex - Male:Female 2 1 NA 0 0 0 0
We can easily visualise the effects of each variable on the outcome using rms, but this feature does not give us plot points and is not very flexible:
plot(Predict(lm3))
Now let’s say we want to recode out genotype variable so that we have only 2 groups: those who “carry” the G allele, and those who do not carry it:
alldata$riskcarrier[alldata$genotype=="AG" | alldata$genotype=="GG"] <- "carrier"
alldata$riskcarrier[alldata$genotype=="AA"] <- "non-carrier"
alldata$riskcarrier <- as.factor(alldata$riskcarrier)
No we can re-run the model:
dd.alldata <- datadist(alldata)
options(datadist="dd.alldata")
lm3 <- ols(data=alldata, totalcog_pT ~ age + sex + riskcarrier)
lm3
##
## Linear Regression Model
##
## ols(formula = totalcog_pT ~ age + sex + riskcarrier, data = alldata)
##
## Frequencies of Missing Values Due to Each Variable
## totalcog_pT age sex riskcarrier
## 18 8 4 8
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 314 LR chi2 17.93 R2 0.055
## sigma 0.0076 d.f. 3 R2 adj 0.046
## d.f. 310 Pr(> chi2) 0.0005 g 0.002
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.761e-02 -5.765e-03 -7.513e-05 6.148e-03 1.767e-02
##
## Coef S.E. t Pr(>|t|)
## Intercept 1.0080 0.0020 500.39 <0.0001
## age -0.0001 0.0000 -3.83 0.0002
## sex=Female -0.0017 0.0009 -1.93 0.0548
## riskcarrier=non-carrier 0.0001 0.0010 0.12 0.9074
anova(lm3)
## Analysis of Variance Response: totalcog_pT
##
## Factor d.f. Partial SS MS F P
## age 1 8.468547e-04 8.468547e-04 14.70 0.0002
## sex 1 2.140203e-04 2.140203e-04 3.72 0.0548
## riskcarrier 1 7.802082e-07 7.802082e-07 0.01 0.9074
## REGRESSION 3 1.049026e-03 3.496754e-04 6.07 0.0005
## ERROR 310 1.785308e-02 5.759059e-05
summary(lm3)
## Effects Response : totalcog_pT
##
## Factor Low High Diff. Effect S.E. Lower 0.95
## age 43 58 15 0 0 0
## sex - Male:Female 2 1 NA 0 0 0
## riskcarrier - non-carrier:carrier 1 2 NA 0 0 0
## Upper 0.95
## 0
## 0
## 0
The concept of statistical interaction goes by many names and has many definitions. Simply this is the concept that the effect of one variable changes depending on the value of another variable.
Interaction is indicated in R formula syntax with a “:” or *
, depending on if you want to automatically include the main effects of your interacting variables or not. As a general rule, always use *
.
lm4 <- lm(totalcog_pT ~ sex + riskcarrier*age, data=alldata)
summary(lm4)
##
## Call:
## lm(formula = totalcog_pT ~ sex + riskcarrier * age, data = alldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0176521 -0.0056722 -0.0004349 0.0057920 0.0181582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.010e+00 2.301e-03 438.920 < 2e-16 ***
## sexFemale -1.727e-03 8.935e-04 -1.933 0.0541 .
## riskcarriernon-carrier -6.756e-03 4.504e-03 -1.500 0.1346
## age -1.790e-04 4.372e-05 -4.095 5.4e-05 ***
## riskcarriernon-carrier:age 1.312e-04 8.402e-05 1.561 0.1195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007571 on 309 degrees of freedom
## (36 observations deleted due to missingness)
## Multiple R-squared: 0.06289, Adjusted R-squared: 0.05076
## F-statistic: 5.184 on 4 and 309 DF, p-value: 0.0004697
anova(lm4)
## Analysis of Variance Table
##
## Response: totalcog_pT
## Df Sum Sq Mean Sq F value Pr(>F)
## sex 1 0.0001862 0.00018620 3.2482 0.0724769 .
## riskcarrier 1 0.0000160 0.00001597 0.2786 0.5980138
## age 1 0.0008469 0.00084685 14.7729 0.0001472 ***
## riskcarrier:age 1 0.0001397 0.00013968 2.4367 0.1195489
## Residuals 309 0.0177134 0.00005732
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
####…Controlling for the other variables in the model….
To visualize a given effect more informatively, we want to caculate the residuals of the model lacking our co-varitate of interest and plot those residuals as our outcome:
For genotype we want a boxplot of model residuals:
lm3.plot <- ols(data=alldata, totalcog_pT ~ genotype + age)
ggplot(data=alldata, aes(y=resid(lm3.plot), x=sex)) +
geom_boxplot()
## Warning: Removed 33 rows containing non-finite values (stat_boxplot).
See it thinks NA is a value! That is because the ols() object stores NA input values as NA residuals, and ggplot2 sees these as another level to plot. Fix by re-running the model to exclude missing observations and plotting the data subset where NAs are excluded:
lm3.plot <- ols(data=subset(alldata,sex!="NA"), totalcog_pT ~ age + genotype)
ggplot(data=subset(alldata,sex!="NA"), aes(y=resid(lm3.plot), x=sex)) +
geom_boxplot()
## Warning: Removed 32 rows containing non-finite values (stat_boxplot).
Ror age we want a scatterplot of residuals. same subsetting principle applies:
lm3.plot <- ols(data=subset(alldata,age!="NA"), totalcog_pT ~ genotype + sex)
ggplot(data=subset(alldata,age!="NA"), aes(y=resid(lm3.plot), x=age)) +
geom_point() +
geom_smooth(method=lm)
## Warning: Removed 28 rows containing missing values (stat_smooth).
## Warning: Removed 28 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
CONCEPT: notice that if we include age in our model and plot age on the x-axis in our residual plot, the effect is lost - we have modeled it out:
lm3.plot <- ols(data=alldata, totalcog_pT ~ genotype + sex + age)
ggplot(data=alldata, aes(y=resid(lm3.plot), x=age)) +
geom_point() +
geom_smooth(method=lm)
## Warning: Removed 36 rows containing missing values (stat_smooth).
## Warning: Removed 36 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
To plot the interaction we just tell ggplo2 to plot the regression lines for ou
lm4.plot <- ols(data=subset(alldata,age!="NA" & riskcarrier!="NA"), totalcog_pT ~ sex)
ggplot(data=subset(alldata,age!="NA" & riskcarrier!="NA"), aes(y=resid(lm4.plot), x=age, col=riskcarrier)) +
geom_point() +
geom_smooth(method=lm)
## Warning: Removed 10 rows containing missing values (stat_smooth).
## Warning: Removed 11 rows containing missing values (stat_smooth).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page