# Goal of this file
# - take cleaned csv file and do some simple analysis
# - take the average of each of the two replicates, so we'll have 4 number columns
# - normalize so that all columns have a median of 10
# - look for gene transcripts up at day2 compared to day0 and day10
data <- read.csv("GSE69618_data.cleaned.csv")
View(data)
boxplot(data[,-c(1,2,3)])
dim(data)
# data frame with averages of the replicates
colnames(data)
data.avg <- cbind(data[,1:3], rowMeans(data[,4:5]), rowMeans(data[,6:7]),
rowMeans(data[,8:9]), rowMeans(data[,10:11]))
# and fix the column names
colnames(data.avg) <- c(colnames(data)[1:3], "WT.day0", "WT.day2", "WT.day6", "WT.day10")
View(data.avg)
# now normalize so each number column has median 10
medians <- c(median(data.avg$WT.day0), median(data.avg$WT.day2),
median(data.avg$WT.day6), median(data.avg$WT.day10))
data.avg$WT.day0 <- data.avg$WT.day0 * 10.0 / medians[1]
data.avg$WT.day2 <- data.avg$WT.day2 * 10.0 / medians[2]
data.avg$WT.day6 <- data.avg$WT.day6 * 10.0 / medians[3]
data.avg$WT.day10 <- data.avg$WT.day10 * 10.0 / medians[4]
# and boxplot again, medians should all be 10
boxplot(data.avg[,-(1:3)])
# now let's look at things up at day2 compared to day0, and also up at day2 compared to day10
day2_day0 <- order(data.avg$WT.day2 - data.avg$WT.day0, decreasing = TRUE)
day2_day10 <- order(data.avg$WT.day2 - data.avg$WT.day10, decreasing = TRUE)
top.results <- intersect(day2_day0[1:200], day2_day10[1:200])
View(data.avg[top.results,])
# and do we think there is a bigger change from day0-day2, day2-day6, or day6-day10
var(data.avg$WT.day2-data.avg$WT.day0)
var(data.avg$WT.day6-data.avg$WT.day2)
var(data.avg$WT.day10-data.avg$WT.day6)
hist(data.avg$WT.day2-data.avg$WT.day0,breaks=20,xlim=c(-20,20), ylim=c(0,20000))
hist(data.avg$WT.day6-data.avg$WT.day2,breaks=20,xlim=c(-20,20), ylim=c(0,20000))
hist(data.avg$WT.day10-data.avg$WT.day6,breaks=20,xlim=c(-20,20), ylim=c(0,20000))
# looks to me like day2 to day6 is the biggest change
# More questions to answer, things to do...
# 1. For each of the transitions (day0-day2, day2-day6, day6-day10), how many gene transcripts
# have the following change: at least 1.5x higher, 1.5x lower, 2x higher, 2x lower,
# 5x higher, 5x lower, 10x higher, 10x lower
# Once you have all of those counts, put them into a table or data frame so we
# can visualize it.
# 2. How many gene transcripts fit into each of the following categories
# A) numbers at each time point are all with factor 1.5 of each other
# B) day10 is at least 2x day0, and number never goes down from one time point to the next
# C) day10 is at most 0.5x day0, and number never goes up from one time point to the next
# D) day0 and day10 are within factor 1.5 of each other, and day2 or day6 is at least 2x higher
# E) day0 and day10 are within factor 1.5 of each other, and day2 or day6 is at least 2x lower
# Once you have all of these counts, put them into a table or data frame for visualization.
# 3. For one of the groups A-E, give a boxplot of just that group of rows
# 4. For each of the samples, does the data look like it's normally distributed?
# Try boxplot of a single column, does it look "bell-shaped"
# 5. What does a t-test say about whether each of the following differ from
# more than just random chance?
# i) day2 versus day0, day6 versus day2, day10 versus day6
# ii) day0 replicate 1 versus day0 replicate 2, and comparing other replicates as well