# Read in an example gene expression data file
# Modify this to be wherever you are keeping the data on your computer
setwd("~/OneDrive - Indiana State University/Courses/cs618-s2022/")
fileName <- "GSE85331_all.gene.FPKM.output.replicates.txt"
# Read in the data
data <- read.csv(fileName, sep='\t')
dim(data) # how many rows and columns
summary(data) # summary of each column
View(data) # GUI viewing the data - don't do this if data is too huge
# pick out just the data (not the metadata - gene symbols, etc.)
numeric_cols <- unlist(lapply(data, is.numeric))
data_nums <- data[,numeric_cols]
# boxplot of each sample - plotted with log2 because the data had not been
# log-scaled (which we normally do with gene expression data)
boxplot(log2(data_nums+1), las=2)
# a factor giving the sample groups for each of the columns (from data_nums)
sample_groups <- factor(rep(c("d0","d0","d2","d2","d4","d4","CM","CM"),4))
length(sample_groups)
sample_groups
# verify some values that we pulled out using Excel
# Genes with highest H1_day0_0 values: SNORD97, SNHG25, EEF1A1, RPL38, RPS27.
H1_day0_0_max <- order(data_nums$H1_day0_0, decreasing = TRUE)[1:5]
data_nums$H1_day0_0[H1_day0_0_max] # the values
data$X[H1_day0_0_max] # the gene symbols
# Genes with highest H1_CM_0 values: H19, MYL7, RPL31, SNORD9, RPS27.
H1_CM_0_max <- order(data_nums$H1_CM_0, decreasing = TRUE)[1:5]
data_nums$H1_CM_0[H1_CM_0_max] # the values
data$X[H1_CM_0_max] # the gene symbols
# Number of genes (#rows - 1): 26257
dim(data)
# That gave 26256, so probably was off by one when took it from Excel - can
# check for sure by looking at first and last rows here, in Excel, and in a
# text viewer to see if something is missing.
# Median value for H1_day0_0: 0.539942
median(data_nums$H1_day0_0)
# Median value for H1_CM_0: 1.246015
median(data_nums$H1_CM_0)
# Average value for H1_day0_0: 15.86772859
mean(data_nums$H1_day0_0)
# Average value for H1_CM_0: 16.4574767
mean(data_nums$H1_CM_0)
# Note that Excel and R are different on the last decimal point or two
# on some of those. Looks like R is printing to a smaller # of decimal
# places. Can change that with the following -
options(digits=10)
mean(data_nums$H1_CM_0)
# and now they print the same