# Goal of this file 
# - take cleaned csv file and do some simple analysis
# - take the average of each of the two replicates, so we'll have 4 number columns
# - normalize so that all columns have a median of 10
# - look for gene transcripts up at day2 compared to day0 and day10

data <- read.csv("GSE69618_data.cleaned.csv")
View(data)
boxplot(data[,-c(1,2,3)])
dim(data)

# data frame with averages of the replicates
colnames(data)
data.avg <- cbind(data[,1:3], rowMeans(data[,4:5]), rowMeans(data[,6:7]),
                  rowMeans(data[,8:9]), rowMeans(data[,10:11]))
# and fix the column names
colnames(data.avg) <- c(colnames(data)[1:3], "WT.day0", "WT.day2", "WT.day6", "WT.day10")
View(data.avg)

# now normalize so each number column has median 10
medians <- c(median(data.avg$WT.day0), median(data.avg$WT.day2),
             median(data.avg$WT.day6), median(data.avg$WT.day10))
data.avg$WT.day0 <- data.avg$WT.day0 * 10.0 / medians[1]
data.avg$WT.day2 <- data.avg$WT.day2 * 10.0 / medians[2]
data.avg$WT.day6 <- data.avg$WT.day6 * 10.0 / medians[3]
data.avg$WT.day10 <- data.avg$WT.day10 * 10.0 / medians[4]

# and boxplot again, medians should all be 10
boxplot(data.avg[,-(1:3)])

# now let's look at things up at day2 compared to day0, and also up at day2 compared to day10
day2_day0 <- order(data.avg$WT.day2 - data.avg$WT.day0, decreasing = TRUE)
day2_day10 <- order(data.avg$WT.day2 - data.avg$WT.day10, decreasing = TRUE)

top.results <- intersect(day2_day0[1:200], day2_day10[1:200])
View(data.avg[top.results,])

# and do we think there is a bigger change from day0-day2, day2-day6, or day6-day10
var(data.avg$WT.day2-data.avg$WT.day0)
var(data.avg$WT.day6-data.avg$WT.day2)
var(data.avg$WT.day10-data.avg$WT.day6)
hist(data.avg$WT.day2-data.avg$WT.day0,breaks=20,xlim=c(-20,20), ylim=c(0,20000))
hist(data.avg$WT.day6-data.avg$WT.day2,breaks=20,xlim=c(-20,20), ylim=c(0,20000))
hist(data.avg$WT.day10-data.avg$WT.day6,breaks=20,xlim=c(-20,20), ylim=c(0,20000))
# looks to me like day2 to day6 is the biggest change


# More questions to answer, things to do...

# 1. For each of the transitions (day0-day2, day2-day6, day6-day10), how many gene transcripts 
#    have the following change: at least 1.5x higher, 1.5x lower, 2x higher, 2x lower, 
#    5x higher, 5x lower, 10x higher, 10x lower
#    Once you have all of those counts, put them into a table or data frame so we 
#    can visualize it.

# 2. How many gene transcripts fit into each of the following categories
#    A) numbers at each time point are all with factor 1.5 of each other
#    B) day10 is at least 2x day0, and number never goes down from one time point to the next
#    C) day10 is at most 0.5x day0, and number never goes up from one time point to the next
#    D) day0 and day10 are within factor 1.5 of each other, and day2 or day6 is at least 2x higher
#    E) day0 and day10 are within factor 1.5 of each other, and day2 or day6 is at least 2x lower
#    Once you have all of these counts, put them into a table or data frame for visualization.

# 3. For one of the groups A-E, give a boxplot of just that group of rows

# 4. For each of the samples, does the data look like it's normally distributed?
#    Try boxplot of a single column, does it look "bell-shaped"

# 5. What does a t-test say about whether each of the following differ from 
#     more than just random chance?
#    i) day2 versus day0, day6 versus day2, day10 versus day6
#    ii) day0 replicate 1 versus day0 replicate 2, and comparing other replicates as well