# Goal of this file
#  - take the cleaned csv file and do some simple analysis
#  - take the average of each of the two replicates, so we'll have 4 number columns
#  - normalize so that all columns have a median of 10
#  - look for gene transcripts up at day2 compared to day0 and day10

data <- read.csv("GSE69618_data.cleaned.csv")

# View() opens a viewer window, so only call it in an interactive session;
# this keeps the script runnable in batch mode.
if (interactive()) View(data)

# Columns 1-3 are excluded from the plot; the remaining columns are plotted.
boxplot(data[, -c(1, 2, 3)])

# print() explicitly so the values are also shown when the file is source()d.
print(dim(data))

# data frame with averages of the replicates
print(colnames(data))
# Columns 1-3 are carried over unchanged; columns 4-11 are averaged in
# consecutive pairs (4:5, 6:7, 8:9, 10:11), one pair per time point.
data.avg <- cbind(
  data[, 1:3],
  rowMeans(data[, 4:5]),
  rowMeans(data[, 6:7]),
  rowMeans(data[, 8:9]),
  rowMeans(data[, 10:11])
)
# and fix the column names
colnames(data.avg) <- c(
  colnames(data)[1:3],
  "WT.day0", "WT.day2", "WT.day6", "WT.day10"
)
if (interactive()) View(data.avg)

# now normalize so each number column has median 10
medians <- c(
  median(data.avg$WT.day0),
  median(data.avg$WT.day2),
  median(data.avg$WT.day6),
  median(data.avg$WT.day10)
)
data.avg$WT.day0 <- data.avg$WT.day0 * 10.0 / medians[1]
data.avg$WT.day2 <- data.avg$WT.day2 * 10.0 / medians[2]
data.avg$WT.day6 <- data.avg$WT.day6 * 10.0 / medians[3]
data.avg$WT.day10 <- data.avg$WT.day10 * 10.0 / medians[4]

# and boxplot again, medians should all be 10
boxplot(data.avg[, -(1:3)])

# now let's look at things up at day2 compared to day0,
# and also up at day2 compared to day10
# order(..., decreasing = TRUE) gives row indices sorted by largest
# difference first.
day2_day0 <- order(data.avg$WT.day2 - data.avg$WT.day0, decreasing = TRUE)
day2_day10 <- order(data.avg$WT.day2 - data.avg$WT.day10, decreasing = TRUE)
# Rows that are in the top 200 of BOTH rankings. head() is used instead of
# [1:200] so that a table with fewer than 200 rows does not produce NA indices.
top.results <- intersect(head(day2_day0, 200), head(day2_day10, 200))
if (interactive()) View(data.avg[top.results, ])

# and do we think there is a bigger change from day0-day2, day2-day6,
# or day6-day10
print(var(data.avg$WT.day2 - data.avg$WT.day0))
print(var(data.avg$WT.day6 - data.avg$WT.day2))
print(var(data.avg$WT.day10 - data.avg$WT.day6))
# Same breaks and axis limits on all three histograms so they can be
# compared side by side.
hist(data.avg$WT.day2 - data.avg$WT.day0,
     breaks = 20, xlim = c(-20, 20), ylim = c(0, 20000))
hist(data.avg$WT.day6 - data.avg$WT.day2,
     breaks = 20, xlim = c(-20, 20), ylim = c(0, 20000))
hist(data.avg$WT.day10 - data.avg$WT.day6,
     breaks = 20, xlim = c(-20, 20), ylim = c(0, 20000))
# looks to me like day2 to day6 is the biggest change

# More questions to answer, things to do...
# 1. For each of the transitions (day0-day2, day2-day6, day6-day10), how many
#    gene transcripts have the following change: at least 1.5x higher,
#    1.5x lower, 2x higher, 2x lower, 5x higher, 5x lower, 10x higher,
#    10x lower
#    Once you have all of those counts, put them into a table or data frame
#    so we can visualize it.
# 2. How many gene transcripts fit into each of the following categories
#    A) numbers at each time point are all within a factor of 1.5 of each other
#    B) day10 is at least 2x day0, and number never goes down from one time
#       point to the next
#    C) day10 is at most 0.5x day0, and number never goes up from one time
#       point to the next
#    D) day0 and day10 are within factor 1.5 of each other, and day2 or day6
#       is at least 2x higher
#    E) day0 and day10 are within factor 1.5 of each other, and day2 or day6
#       is at least 2x lower
#    Once you have all of these counts, put them into a table or data frame
#    for visualization.
# 3. For one of the groups A-E, give a boxplot of just that group of rows
# 4. For each of the samples, does the data look like it's normally
#    distributed?
#    Try boxplot of a single column, does it look "bell-shaped"
# 5. What does a t-test say about whether each of the following differ by
#    more than just random chance?
#    i)  day2 versus day0, day6 versus day2, day10 versus day6
#    ii) day0 replicate 1 versus day0 replicate 2, and comparing other
#        replicates as well