Please download the following files:
We need ‘ggplot2’, ‘dplyr’, ‘tidyr’, and ‘broom’ packages If you haven’t already installed these:
Load the packages
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(broom)
library(knitr)
The data should be merged and ready to go from day 1. If not, here’s the code for it:
data1 <- read.csv("~/Downloads/messy_demographic.csv") # put in the location of the downloaded data file 1
data2 <- read.csv("~/Downloads/messy_cognitive.csv") # put in the location of the downloaded data file 2
data3 <- read.csv("~/Downloads/messy_genotype.csv")
data1[data1==""] <- NA
data1[data1=="missing"] <- NA
data1[data1=="9999"] <- NA
data1$age <- as.numeric(as.character(data1$age))
data1$ethnicity <- factor(data1$ethnicity,levels=c("Cauc","AA","As","In","Other"))
data1$sex <- factor(data1$sex, levels=c(0,1), labels=c("Male","Female"))
data1$dx <- factor(data1$dx, levels=c(0,1), labels=c("Control","Case"))
data2[data2==""] <- NA
data2[data2=="missing"] <- NA
data2[data2=="9999"] <- NA
data2$cog1 <- as.numeric(as.character(data2$cog1))
data2$cog2 <- as.numeric(as.character(data2$cog2))
data2$cog3 <- as.numeric(as.character(data2$cog3))
data2$subID <- gsub(data2$subID,pattern="subject",replacement="SUB_")
data3[data3==""] <- NA
data3[data3=="missing"] <- NA
data3[data3=="9999"] <- NA
data3$genotype <- factor(data3$genotype, levels=c(0,1,2), labels=c("AA","AG","GG"))
data3$subID <- gsub(data3$subID,pattern="subject",replacement="SUB_")
alldata <- merge(data1,data2,by.x="subject_ID",by.y="subID")
alldata <- merge(alldata,data3,by.x="subject_ID",by.y="subID")
## Write merged dataframe as new .csv
write.csv(alldata, "alldata.csv", row.names=F)
t.test(data=alldata, cog1 ~ dx)
##
## Welch Two Sample t-test
##
## data: cog1 by dx
## t = -6.347, df = 334.916, p-value = 7.133e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.871388 -1.512676
## sample estimates:
## mean in group Control mean in group Case
## 9.940047 12.132079
Wow we have a result!! But one of my favorite things output R is that the statistical output can be saved as an object!!
my.t.result <- t.test(data=alldata, cog1 ~ dx) # saves to output to my.t.result
print(my.t.result) ## prints to output to the console
##
## Welch Two Sample t-test
##
## data: cog1 by dx
## t = -6.347, df = 334.916, p-value = 7.133e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.871388 -1.512676
## sample estimates:
## mean in group Control mean in group Case
## 9.940047 12.132079
my.t.result$statistic ## gets us the t statistic!
## t
## -6.347017
my.t.result$parameter ## the degrees of freedom
## df
## 334.9162
my.t.result$p.value ## gets us the p-value
## [1] 7.13299e-10
round(my.t.result$statistic,2) ## we can these numbers using the "round" function
## t
## -6.35
Let’s put these three together into something we might want to report in our paper
my.t.results.txt = paste0('t(',
round(my.t.result$parameter,1),
') = ',
round(my.t.result$statistic,2), ', p = ',
round(my.t.result$p.value, 7))
my.t.results.txt
## [1] "t(334.9) = -6.35, p = 0"
View this as basic boxplots (two ways: base package and ggplot2)
ggplot(data=alldata, aes(y=cog1,x=dx)) +
geom_boxplot()
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).
First - let’s deal with the NA’s we don’t want to plot - let’s remove them from the plotting dataset
data.toplot <-filter(alldata, !is.na(cog1), !is.na(dx))
ggplot(data.toplot, aes(y=cog1,x=dx)) +
geom_boxplot(outlier.shape=NA) + ## removes outliers from boxplot layer
geom_jitter(alpha=0.5) ## add dots on top
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_points, x$x, x$y, x$pch, x$size): semi-
## transparency is not supported on this device: reported only once per page
## even fancier - let's add a title, and annotation and label the axes
ggplot(data.toplot, aes(y=cog1,x=dx)) +
geom_boxplot(outlier.shape=NA) +
geom_jitter(alpha=0.5) +
labs(title="Effect of Diagnosis on Cog Score #1",
y = "Cognitive Test 1",
x = "Diagnosis") +
annotate("text", label = my.t.results.txt, x = 1, y = 21) +
theme_bw()
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_points, x$x, x$y, x$pch, x$size): semi-
## transparency is not supported on this device: reported only once per page
Note: we can start by using the “Export” button in the plots tab..
ggsave('figure1_ttestresults.pdf', width = 5, height = 5)
my.stats.table <- summarise(alldata,
"Mean" = mean(cog1, na.rm = T),
"St Dev" = sd(cog1, na.rm = T))
my.stats.table <- alldata %>%
group_by(dx,sex) %>%
summarise("Mean" = mean(cog1, na.rm = T),
"St Dev" = sd(cog1, na.rm = T))
kable(my.stats.table)
dx | sex | Mean | St Dev |
---|---|---|---|
Control | Male | 10.476754 | 2.7469625 |
Control | Female | 9.746893 | 2.9514609 |
Control | NA | 12.254830 | 1.5465560 |
Case | Male | 12.215568 | 3.5483599 |
Case | Female | 12.022392 | 3.3859637 |
Case | NA | 13.036707 | 0.1736939 |
NA | Male | 11.868654 | 1.0237882 |
NA | Female | 10.580708 | 2.3376551 |
## let's add a rick carrier
alldata$riskcarrier[alldata$genotype=="AG" | alldata$genotype=="GG"] <- "carrier"
alldata$riskcarrier[alldata$genotype=="AA"] <- "non-carrier"
alldata$riskcarrier <- as.factor(alldata$riskcarrier)
alldata_melted <- gather(alldata, cog_var, cognitive.score,
-subject_ID, -age, -sex, -ethnicity, -dx, -genotype, -riskcarrier)
toplot <- filter(alldata_melted, !is.na(cognitive.score),
!is.na(dx),
!is.na(riskcarrier))
ggplot(toplot, aes(y=cognitive.score,x=age,color=riskcarrier)) +
geom_point() +
geom_smooth(method=lm) +
facet_wrap(~cog_var, scales = "free")
## Warning: Removed 5 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 5 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 5 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning in grid.Call.graphics(L_polygon, x$x, x$y, index): semi-
## transparency is not supported on this device: reported only once per page
LAST BUT NOT LEAST, running a loop to screen for significance at multiple tests (manhattan plot)
## add stargazer descriptive stats table?
cog_dems_table <- alldata_melted %>%
filter(!is.na(cognitive.score), !is.na(dx), !is.na(riskcarrier)) %>%
group_by(dx, cog_var, riskcarrier) %>%
summarise("Mean" = mean(cognitive.score, na.rm = T),
"St Dev" = sd(cognitive.score, na.rm = T))
kable(cog_dems_table)
dx | cog_var | riskcarrier | Mean | St Dev |
---|---|---|---|---|
Control | cog1 | carrier | 9.367999 | 2.745134 |
Control | cog1 | non-carrier | 11.545711 | 2.884425 |
Control | cog2 | carrier | 31.439535 | 4.015106 |
Control | cog2 | non-carrier | 31.845074 | 3.675566 |
Control | cog3 | carrier | 24.442639 | 20.988259 |
Control | cog3 | non-carrier | 25.761328 | 24.178563 |
Case | cog1 | carrier | 11.549411 | 3.450849 |
Case | cog1 | non-carrier | 13.418635 | 3.126957 |
Case | cog2 | carrier | 31.405908 | 3.921310 |
Case | cog2 | non-carrier | 32.569693 | 3.489469 |
Case | cog3 | carrier | 26.178213 | 20.660281 |
Case | cog3 | non-carrier | 23.402200 | 20.367174 |
regress_results <- alldata_melted %>%
filter(!is.na(cognitive.score), !is.na(dx), !is.na(riskcarrier)) %>%
group_by(cog_var) %>%
do(tidy(lm(cognitive.score ~ dx*riskcarrier*age, .)))
kable(regress_results)
cog_var | term | estimate | std.error | statistic | p.value |
---|---|---|---|---|---|
cog1 | (Intercept) | 11.9264294 | 1.1672595 | 10.2174617 | 0.0000000 |
cog1 | dxCase | 2.8394090 | 1.7698966 | 1.6042796 | 0.1096482 |
cog1 | riskcarriernon-carrier | -1.0538028 | 2.6306132 | -0.4005921 | 0.6889903 |
cog1 | age | -0.0520558 | 0.0226907 | -2.2941403 | 0.0224360 |
cog1 | dxCase:riskcarriernon-carrier | -4.6365570 | 3.6489450 | -1.2706569 | 0.2047825 |
cog1 | dxCase:age | -0.0134964 | 0.0351062 | -0.3844435 | 0.7009075 |
cog1 | riskcarriernon-carrier:age | 0.0650805 | 0.0501192 | 1.2985151 | 0.1950542 |
cog1 | dxCase:riskcarriernon-carrier:age | 0.0785818 | 0.0685391 | 1.1465256 | 0.2524424 |
cog2 | (Intercept) | 19.9055562 | 1.1676401 | 17.0476815 | 0.0000000 |
cog2 | dxCase | 0.8053476 | 1.7868704 | 0.4507029 | 0.6525118 |
cog2 | riskcarriernon-carrier | 0.7816939 | 2.6015980 | 0.3004668 | 0.7640181 |
cog2 | age | 0.2308120 | 0.0226252 | 10.2015568 | 0.0000000 |
cog2 | dxCase:riskcarriernon-carrier | 1.1478888 | 3.6265195 | 0.3165263 | 0.7518114 |
cog2 | dxCase:age | -0.0128973 | 0.0353055 | -0.3653057 | 0.7151268 |
cog2 | riskcarriernon-carrier:age | -0.0156561 | 0.0493271 | -0.3173933 | 0.7511541 |
cog2 | dxCase:riskcarriernon-carrier:age | -0.0217776 | 0.0679281 | -0.3205975 | 0.7487267 |
cog3 | (Intercept) | 33.6200213 | 8.2169195 | 4.0915602 | 0.0000546 |
cog3 | dxCase | -10.1585421 | 12.4897860 | -0.8133480 | 0.4166368 |
cog3 | riskcarriernon-carrier | 22.2201568 | 18.6311019 | 1.1926378 | 0.2339146 |
cog3 | age | -0.1834078 | 0.1600371 | -1.1460333 | 0.2526568 |
cog3 | dxCase:riskcarriernon-carrier | -34.6621556 | 25.8192409 | -1.3424932 | 0.1804090 |
cog3 | dxCase:age | 0.2483313 | 0.2484239 | 0.9996272 | 0.3182631 |
cog3 | riskcarriernon-carrier:age | -0.4091669 | 0.3562594 | -1.1485084 | 0.2516357 |
cog3 | dxCase:riskcarriernon-carrier:age | 0.5742907 | 0.4857384 | 1.1823044 | 0.2379823 |
#Let's just look at the carier by age interaction term
age_regress_results <- regress_results %>% filter(term == "age")
age_regress_results$p.fdr = p.adjust(age_regress_results$p.value, method = "fdr")
kable(age_regress_results)
cog_var | term | estimate | std.error | statistic | p.value | p.fdr |
---|---|---|---|---|---|---|
cog1 | age | -0.0520558 | 0.0226907 | -2.294140 | 0.0224360 | 0.0336540 |
cog2 | age | 0.2308120 | 0.0226252 | 10.201557 | 0.0000000 | 0.0000000 |
cog3 | age | -0.1834078 | 0.1600371 | -1.146033 | 0.2526568 | 0.2526568 |
age_regress_results$p.bonferonni = p.adjust(age_regress_results$p.value, method = "bonferroni")
kable(age_regress_results)
cog_var | term | estimate | std.error | statistic | p.value | p.fdr | p.bonferonni |
---|---|---|---|---|---|---|---|
cog1 | age | -0.0520558 | 0.0226907 | -2.294140 | 0.0224360 | 0.0336540 | 0.0673081 |
cog2 | age | 0.2308120 | 0.0226252 | 10.201557 | 0.0000000 | 0.0000000 | 0.0000000 |
cog3 | age | -0.1834078 | 0.1600371 | -1.146033 | 0.2526568 | 0.2526568 | 0.7579704 |