# Read in an example gene expression data file
# Modify this to be wherever you are keeping the data on your computer
setwd("~/OneDrive - Indiana State University/Courses/cs618-s2022/")
fileName <- "GSE85331_all.gene.FPKM.output.replicates.txt"
# Read in the data
data <- read.csv(fileName, sep='\t')
dim(data) # how many rows and columns
summary(data) # summary of each column
#View(data) # GUI viewing the data - don't do this if data is too huge
# pick out just the data (not the metadata - gene symbols, etc.)
numeric_cols <- unlist(lapply(data, is.numeric))
data_nums <- data[,numeric_cols]
# boxplot of each sample - plotted with log2 because the data had not been
# log-scaled (which we normally do with gene expression data)
boxplot(log2(data_nums+1), las=2)
# a factor giving the t groups for each of the columns (from data_nums)
# data frame with information about the samples
samples <- data.frame(time = factor(rep(c("d0","d0","d2","d2","d4","d4","CM","CM"),4)),
cell_line = factor(c(rep("H1",8), rep("H9", 8), rep("C15", 8), rep("C20", 8))))
samples
# Identify differentially expressed genes
# From the supplementary information from - https://pubmed.ncbi.nlm.nih.gov/28663367/
# "Statistical analysis was performed for each cell line individually by pairwise
# comparisons across time-points and day 0 (control)."
data_H1 <- data_nums[,samples$cell_line == "H1"]
#View(data_H1)
# Not sure what exactly they mean by the above. We are going to do an ANOVA** for
# each gene (row) independently. Let's do just the first row to get the hang of it.
row <- as.vector(t(data_H1[1,]))
d <- data.frame(expr = row, group = samples$time[samples$cell_line == "H1"])
colnames(d) <- c("expr", "group")
results_aov <- aov(expr ~ group, d)
summary(results_aov)[[1]]$'Pr(>F)' # we just want the p-value
# ** - why do we use aov rather than t.test? We could use t.test to
# compare two of the sample groups (e.g., d0 versus d2), and do t.test
# for all pairs of groups. That would be tedious. aov with the way we
# have it set up should take care of all of the comparisons, and give a
# p value indicating the probability that the group influences the expression
# for at least some of the groups.
# Now let's do that for every gene (row) by using sapply
aov_by_row <- sapply(1:(nrow(data_H1)), function(i) {
row <- as.vector(t(data_H1[i,]))
d <- data.frame(expr=row, group=samples$time[samples$cell_line == "H1"])
colnames(d) <- c("expr", "group")
results_aov <- aov(expr ~ group, d)
summary(results_aov)[[1]]$'Pr(>F)' # we just want the p-value
})
# pull out p value from list
aov_by_row_p <- aov_by_row[1,]
# adjust for running multiple tests
aov_by_row_p_adj <- p.adjust(aov_by_row_p, method="fdr")
# put it together with gene symbol
p_values <- data.frame(p_value_adj = aov_by_row_p_adj)
rownames(p_values) <- data$X
#View(p_values)
# Can put the p values in with the data
data_H1_p <- cbind(data_H1, p_values, data$gene_id)
# Just look at ones that have <= .001 (an arbitrary cutoff)
significant <- data_H1_p[p_values$p_value_adj <= 0.001 & !is.na(p_values$p_value_adj),1,drop=FALSE]
View(significant)