# Goal of this file # - take cleaned csv file and do some simple analysis # - take the average of each of the two replicates, so we'll have 4 number columns # - normalize so that all columns have a median of 10 # - look for gene transcripts up at day2 compared to day0 and day10 setwd("~/Documents/bd4isu") data <- read.csv("GSE69618_data.cleaned.csv") #View(data) #boxplot(data[,-c(1,2,3)]) #dim(data) # data frame with averages of the replicates #colnames(data) data.avg <- cbind(data[,1:3], rowMeans(data[,4:5]), rowMeans(data[,6:7]), rowMeans(data[,8:9]), rowMeans(data[,10:11])) # and fix the column names colnames(data.avg) <- c(colnames(data)[1:3], "WT.day0", "WT.day2", "WT.day6", "WT.day10") #View(data.avg) # now normalize so each number column has median 10 medians <- c(median(data.avg$WT.day0), median(data.avg$WT.day2), median(data.avg$WT.day6), median(data.avg$WT.day10)) #medians data.avg$WT.day0 <- data.avg$WT.day0 * 10.0 / medians[1] data.avg$WT.day2 <- data.avg$WT.day2 * 10.0 / medians[2] data.avg$WT.day6 <- data.avg$WT.day6 * 10.0 / medians[3] data.avg$WT.day10 <- data.avg$WT.day10 * 10.0 / medians[4] # and boxplot again, medians should all be 10 #boxplot(data.avg[,-(1:3)]) # now let's look at things up at day2 compared to day0, and also up at day2 compared to day10 day2_day0 <- order(data.avg$WT.day2 - data.avg$WT.day0, decreasing = TRUE) day2_day10 <- order(data.avg$WT.day2 - data.avg$WT.day10, decreasing = TRUE) # look at the day2-day0 top 100 #View(data.avg[day2_day0[1:100],]) write.csv(data.avg[day2_day0[1:100],], "top100_day2-day0.csv") # how about rows that are in top of both top.results <- intersect(day2_day0[1:200], day2_day10[1:200]) #length(top.results) #View(data.avg[top.results,]) # and do we think there is a bigger change from day0-day2, day2-day6, or day6-day10 var(data.avg$WT.day2 - data.avg$WT.day0) var(data.avg$WT.day6-data.avg$WT.day2) var(data.avg$WT.day10-data.avg$WT.day6) hist(data.avg$WT.day2-data.avg$WT.day0, breaks=20,xlim=c(-20,20), ylim=c(0,20000)) hist(data.avg$WT.day6-data.avg$WT.day2, breaks=20,xlim=c(-20,20), ylim=c(0,20000)) hist(data.avg$WT.day10-data.avg$WT.day6,breaks=20, xlim=c(-20,20), ylim=c(0,20000)) # looks to me like day2 to day6 is the biggest change # More questions to answer, things to do... # 1. For each of the transitions (day0-day2, day2-day6, day6-day10), how many gene transcripts # have the following change: at least 1.5x higher, 1.5x lower, 2x higher, 2x lower, # 5x higher, 5x lower, 10x higher, 10x lower # Once you have all of those counts, put them into a table or data frame so we # can visualize it. b <- c(-20, -10, -1.5, 0, 1.5, 2, 10, 20) day2_day0.count <- hist(data.avg$WT.day2 - data.avg$WT.day0, breaks=b, plot=FALSE) day6_day2.count <- hist(data.avg$WT.day6 - data.avg$WT.day2, breaks=b, plot=FALSE) day10_day6.count <- hist(data.avg$WT.day10 - data.avg$WT.day6, breaks=b, plot=FALSE) change.counts <- rbind(day2_day0.count$counts, day6_day2.count$counts, day10_day6.count$counts) * 100.0 / nrow(data.avg) colnames(change.counts) <- paste(as.character(b[-8]),"to",as.character(b[-1]),sep="") rownames(change.counts) <- c("day2-day0", "day6-day2", "day10-day6") change.counts # 2. How many gene transcripts fit into each of the following categories # A) numbers at each time point are all within (factor) 1.5 of each other row.ranges <- t(apply(data.avg[,-(1:3)],1,range)) #View(row.ranges) groupA <-abs(row.ranges[,1]-row.ranges[,2]) <= 1.5 sum(groupA) boxplot(data.avg[groupA, -(1:3)]) hist(row.ranges[,2],breaks=20) # note this looks like 6000 of the gene transcripts are 0 or 1 for all samples # B) day10 is at least +2 day0, and number never goes down from one time point to the next groupB <- data.avg$WT.day10 >= 2 + data.avg$WT.day0 & data.avg$WT.day2 >= data.avg$WT.day0 & data.avg$WT.day6 >= data.avg$WT.day2 & data.avg$WT.day10 >= data.avg$WT.day6 sum(groupB) boxplot(data.avg[groupB, -(1:3)]) # C) day10 is at most -2+ day0, and number never goes up from one time point to the next groupC <- data.avg$WT.day10 <= -2 + data.avg$WT.day0 & data.avg$WT.day2 <= data.avg$WT.day0 & data.avg$WT.day6 <= data.avg$WT.day2 & data.avg$WT.day10 <= data.avg$WT.day6 sum(groupC) boxplot(data.avg[groupC, -(1:3)]) # D) day0 and day10 are within (factor) 1.5 of each other, and day2 or day6 is at least 3 higher groupD <- data.avg$WT.day0 <= data.avg$WT.day10 + 1.5 & data.avg$WT.day10 <= data.avg$WT.day0 + 1.5 & (data.avg$WT.day2 >= 3 + data.avg$WT.day0 | data.avg$WT.day6 >= 3 + data.avg$WT.day0) sum(groupD) boxplot(data.avg[groupD, -(1:3)]) # E) day0 and day10 are within 1.5 of each other, and day2 or day6 is at least 2 lower groupE <- data.avg$WT.day0 <= data.avg$WT.day10 + 1.5 & data.avg$WT.day10 <= data.avg$WT.day0 + 1.5 & (data.avg$WT.day2 <= data.avg$WT.day0 - 2 | data.avg$WT.day6 <= data.avg$WT.day0 -2) sum(groupE) boxplot(data.avg[groupE, -(1:3)]) # Once you have all of these counts, put them into a table or data frame for visualization. # 3. For one of the groups A-E, give a boxplot of just that group of rows # above # 4. For each of the samples, does the data look like it's normally distributed? # Try hist of a single column, does it look "bell-shaped" hist(data.avg[,4], breaks=20) # looks bell-shaped except for around 0, maybe something scewed towards 0 # 5. What does a t-test say about whether each of the following differ from # more than just random chance? # i) day2 versus day0, day6 versus day2, day10 versus day6 result <- t.test(data.avg$WT.day2, data.avg$WT.day0) result$p.value result <- t.test(data.avg$WT.day6, data.avg$WT.day2) result$p.value result <- t.test(data.avg$WT.day10, data.avg$WT.day6) result$p.value # seems to say biggest difference is from day0-day2, and least is day6-day10 # ii) day0 replicate 1 versus day0 replicate 2, and comparing other replicates as well result <- t.test(data$WT.day0.Rep1, data$WT.day0.Rep2) result$p.value result <- t.test(data$WT.day2.Rep1, data$WT.day2.Rep2) result$p.value result <- t.test(data$WT.day6.Rep1, data$WT.day6.Rep2) result$p.value result <- t.test(data$WT.day10.Rep3, data$WT.day10.Rep4) result$p.value # those seem to be saying the replicates are not similar, so ... hist(data$WT.day2.Rep1-data$WT.day2.Rep2, breaks=100) # rep1 is higher in general, so normalizing might be good result <- t.test(data$WT.day2.Rep1 * 10.0 / median(data$WT.day2.Rep1), data$WT.day2.Rep2 * 10.0 / median(data$WT.day2.Rep2)) result$p.value # and p value more like we expect. so normalizing yes indeed needs to happen