# Goal of this file 
# - take cleaned csv file and do some simple analysis
# - take the average of each of the two replicates, so we'll have 4 number columns
# - normalize so that all columns have a median of 10
# - look for gene transcripts up at day2 compared to day0 and day10

setwd("~/Documents/bd4isu")
data <- read.csv("GSE69618_data.cleaned.csv")
#View(data)
#boxplot(data[,-c(1,2,3)])
#dim(data)

# data frame with averages of the replicates
#colnames(data)
data.avg <- cbind(data[,1:3], 
                  rowMeans(data[,4:5]), rowMeans(data[,6:7]),
                  rowMeans(data[,8:9]), rowMeans(data[,10:11]))
# and fix the column names
colnames(data.avg) <- 
  c(colnames(data)[1:3], "WT.day0", "WT.day2", "WT.day6", "WT.day10")
#View(data.avg)

# now normalize so each number column has median 10
medians <- c(median(data.avg$WT.day0), median(data.avg$WT.day2),
             median(data.avg$WT.day6), median(data.avg$WT.day10))
#medians
data.avg$WT.day0 <- data.avg$WT.day0 * 10.0 / medians[1]
data.avg$WT.day2 <- data.avg$WT.day2 * 10.0 / medians[2]
data.avg$WT.day6 <- data.avg$WT.day6 * 10.0 / medians[3]
data.avg$WT.day10 <- data.avg$WT.day10 * 10.0 / medians[4]

# and boxplot again, medians should all be 10
#boxplot(data.avg[,-(1:3)])

# now let's look at things up at day2 compared to day0, and also up at day2 compared to day10
day2_day0 <- order(data.avg$WT.day2 - data.avg$WT.day0, decreasing = TRUE)
day2_day10 <- order(data.avg$WT.day2 - data.avg$WT.day10, decreasing = TRUE)

# look at the day2-day0 top 100
#View(data.avg[day2_day0[1:100],])
write.csv(data.avg[day2_day0[1:100],], "top100_day2-day0.csv")

# how about rows that are in top of both
top.results <- intersect(day2_day0[1:200], day2_day10[1:200])
#length(top.results)

#View(data.avg[top.results,])

# and do we think there is a bigger change from day0-day2, day2-day6, or day6-day10
var(data.avg$WT.day2 - data.avg$WT.day0)
var(data.avg$WT.day6-data.avg$WT.day2)
var(data.avg$WT.day10-data.avg$WT.day6)
hist(data.avg$WT.day2-data.avg$WT.day0,
     breaks=20,xlim=c(-20,20), ylim=c(0,20000))
hist(data.avg$WT.day6-data.avg$WT.day2,
     breaks=20,xlim=c(-20,20), ylim=c(0,20000))
hist(data.avg$WT.day10-data.avg$WT.day6,breaks=20,
     xlim=c(-20,20), ylim=c(0,20000))
# looks to me like day2 to day6 is the biggest change


# More questions to answer, things to do...

# 1. For each of the transitions (day0-day2, day2-day6, day6-day10), how many gene transcripts 
#    have the following change: at least 1.5x higher, 1.5x lower, 2x higher, 2x lower, 
#    5x higher, 5x lower, 10x higher, 10x lower
#    Once you have all of those counts, put them into a table or data frame so we 
#    can visualize it.
b <- c(-20, -10, -1.5, 0, 1.5, 2, 10, 20)
day2_day0.count <- hist(data.avg$WT.day2 - data.avg$WT.day0, breaks=b, plot=FALSE)
day6_day2.count <- hist(data.avg$WT.day6 - data.avg$WT.day2, breaks=b, plot=FALSE)
day10_day6.count <- hist(data.avg$WT.day10 - data.avg$WT.day6, breaks=b, plot=FALSE)
change.counts <- rbind(day2_day0.count$counts, 
                       day6_day2.count$counts, 
                       day10_day6.count$counts) * 100.0 / nrow(data.avg)
colnames(change.counts) <- paste(as.character(b[-8]),"to",as.character(b[-1]),sep="")
rownames(change.counts) <- c("day2-day0", "day6-day2", "day10-day6")
change.counts

# 2. How many gene transcripts fit into each of the following categories

#    A) numbers at each time point are all within (factor) 1.5 of each other
row.ranges <- t(apply(data.avg[,-(1:3)],1,range))
#View(row.ranges)
groupA <-abs(row.ranges[,1]-row.ranges[,2]) <= 1.5
sum(groupA)
boxplot(data.avg[groupA, -(1:3)])
hist(row.ranges[,2],breaks=20) # note this looks like 6000 of the gene transcripts are 0 or 1 for all samples

#    B) day10 is at least +2 day0, and number never goes down from one time point to the next
groupB <- data.avg$WT.day10 >= 2 + data.avg$WT.day0 & 
      data.avg$WT.day2 >= data.avg$WT.day0 & 
      data.avg$WT.day6 >= data.avg$WT.day2 &
      data.avg$WT.day10 >= data.avg$WT.day6
sum(groupB)
boxplot(data.avg[groupB, -(1:3)])


#    C) day10 is at most -2+ day0, and number never goes up from one time point to the next
groupC <- data.avg$WT.day10 <= -2 + data.avg$WT.day0 & 
      data.avg$WT.day2 <= data.avg$WT.day0 & 
      data.avg$WT.day6 <= data.avg$WT.day2 &
      data.avg$WT.day10 <= data.avg$WT.day6
sum(groupC)
boxplot(data.avg[groupC, -(1:3)])


#    D) day0 and day10 are within (factor) 1.5 of each other, and day2 or day6 is at least 3 higher
groupD <- data.avg$WT.day0 <= data.avg$WT.day10 + 1.5 &
      data.avg$WT.day10 <= data.avg$WT.day0 + 1.5 &
      (data.avg$WT.day2 >= 3 + data.avg$WT.day0 |
         data.avg$WT.day6 >= 3 + data.avg$WT.day0)
sum(groupD)
boxplot(data.avg[groupD, -(1:3)])


#    E) day0 and day10 are within  1.5 of each other, and day2 or day6 is at least 2 lower
groupE <- data.avg$WT.day0 <= data.avg$WT.day10 + 1.5 &
      data.avg$WT.day10 <= data.avg$WT.day0 + 1.5 &
      (data.avg$WT.day2 <= data.avg$WT.day0 - 2 |
         data.avg$WT.day6 <= data.avg$WT.day0 -2)
sum(groupE)
boxplot(data.avg[groupE, -(1:3)])

#    Once you have all of these counts, put them into a table or data frame for visualization.

# 3. For one of the groups A-E, give a boxplot of just that group of rows
# above

# 4. For each of the samples, does the data look like it's normally distributed?
#    Try hist of a single column, does it look "bell-shaped"
hist(data.avg[,4], breaks=20)
# looks bell-shaped except for around 0, maybe something scewed towards 0


# 5. What does a t-test say about whether each of the following differ from 
#     more than just random chance?
#    i) day2 versus day0, day6 versus day2, day10 versus day6
result <- t.test(data.avg$WT.day2, data.avg$WT.day0)
result$p.value
result <- t.test(data.avg$WT.day6, data.avg$WT.day2)
result$p.value
result <- t.test(data.avg$WT.day10, data.avg$WT.day6)
result$p.value
# seems to say biggest difference is from day0-day2, and least is day6-day10

#    ii) day0 replicate 1 versus day0 replicate 2, and comparing other replicates as well
result <- t.test(data$WT.day0.Rep1, data$WT.day0.Rep2)
result$p.value
result <- t.test(data$WT.day2.Rep1, data$WT.day2.Rep2)
result$p.value
result <- t.test(data$WT.day6.Rep1, data$WT.day6.Rep2)
result$p.value
result <- t.test(data$WT.day10.Rep3, data$WT.day10.Rep4)
result$p.value

# those seem to be saying the replicates are not similar, so ...
hist(data$WT.day2.Rep1-data$WT.day2.Rep2, breaks=100)
# rep1 is higher in general, so normalizing might be good
result <- t.test(data$WT.day2.Rep1 * 10.0 / median(data$WT.day2.Rep1),
                 data$WT.day2.Rep2 * 10.0 / median(data$WT.day2.Rep2))
result$p.value
# and p value more like we expect.  so normalizing yes indeed needs to happen