# Note that lines starting with # are comments
# uncomment the following line, and put the directory where you are keeping these files
# setwd("~/Downloads")
# read the csv file into a data frame and save it in a variable named data
data <- read.csv("GSE69618_data.csv")
# print a summary of every column - max, min, mean, etc.
summary(data)
# number of rows and columns. Note that #rows in this file is number of
# gene transcripts that were measured, and #cols is the number of different
# samples that were measured.
dim(data)
# view the data graphically, similar to how you would view it in excel.
# View() opens a viewer window, so only call it when R is being used
# interactively; this lets the script also run from the command line
if (interactive()) {
  View(data)
}
# which columns actually have numbers we're interested in (since the first 2
# don't). 3:ncol(data) would count backwards (3, 2) if there were fewer than
# 3 columns, so stop with a clear error in that case
stopifnot(ncol(data) >= 3)
numberColumns <- 3:ncol(data)
# note that the range on the data is huge - from 0 up to over 1 million
# that is bad for plotting, and in general strange to think about
# we want to take log of the values to make the range smaller, but we
# also need to not have 0's in the data if we're taking log's
# the following commands make log2 data for us
# pull the measurements out as a matrix (drop = FALSE keeps it a table even
# when there is only one measurement column)
measurements <- as.matrix(data[, numberColumns, drop = FALSE])
# TRUE for every cell that should be transformed. Missing values (NA) are
# excluded explicitly: an NA in this mask would make the replacement fail
nonzero <- !is.na(measurements) & measurements != 0
# zeros (and NAs) are left unchanged; note that a 0 left unchanged looks the
# same as log2(1), which is also 0
measurements[nonzero] <- log2(measurements[nonzero])
# copy the original table so the first 2 columns are kept, then overwrite
# the measurement columns with the transformed values
data.log2 <- data
data.log2[, numberColumns] <- measurements
# summary the log2 data, should be smaller range now
summary(data.log2)
# and a box plot of the different samples, should look okay since we log2'd it
boxplot(data.log2[, numberColumns])
# graphically view the log2 data (viewer window, interactive sessions only)
if (interactive()) {
  View(data.log2)
}
# Compare pairs of samples on the log2 scale. Check first that the columns
# exist: extracting a missing column gives NULL, and NULL - NULL is an empty
# vector, so a misspelled name would otherwise only show up as an empty
# summary and a confusing error from hist()
comparisonColumns <- c("H1.WT.day0.Rep1", "H1.WT.day0.Rep2", "H1.WT.day2.Rep2")
stopifnot(all(comparisonColumns %in% names(data.log2)))

# take a summary of two of the samples that are replicates of the same
# treatment, in particular the two replicates of wild-type at day0.
# [[ ]] is used instead of $ because it only matches the exact column name
replicateDifference <-
  data.log2[["H1.WT.day0.Rep2"]] - data.log2[["H1.WT.day0.Rep1"]]
summary(replicateDifference)
# another view, histogram
hist(
  replicateDifference,
  main = "WT day0: Rep2 - Rep1 (log2)",
  xlab = "difference in log2 value"
)
# it looks like a difference of about 2 between the replicates could be just
# due to random chance

# now compare two samples that are not from the same treatment, in particular
# a wild-type day2 sample versus a wild-type day 0 sample
treatmentDifference <-
  data.log2[["H1.WT.day2.Rep2"]] - data.log2[["H1.WT.day0.Rep2"]]
summary(treatmentDifference)
hist(
  treatmentDifference,
  main = "WT Rep2: day2 - day0 (log2)",
  xlab = "difference in log2 value"
)