--- title: "GSE69618 Gene Fun" author: "Jeff Kinne" date: "February 22, 2019" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ## Fun with Gene Expression Data - GSE69618 Let's load some data. It is gene expression data. Google for GSE69618 to read about it and download it. ```{r} setwd("~jkinne/public_html/cs459-bd4isu-s2019/code/") #setwd("~/Documents") data <- read.csv("GSE69618_data.csv") ``` Now we clean our data a little - remove (NA-ize) values less than 10, log2 things, clean up the column names to make them shorter for displaying on figures ```{r} tooSmall <- data[,3:ncol(data)] <= 10 NA_count <- sum(is.na(data[,3:ncol(data)])) # how many values were NA already small_count <- sum(tooSmall) # how many are too small and being turned into NA total_count <- prod(dim(data[,3:ncol(data)])) # how many points total print(paste("Percent of values that were read in as NA: ", NA_count/total_count*100)) print(paste("Percent that were <= 10 so setting to NA: ", small_count/total_count*100)) data[,3:ncol(data)][tooSmall] <- NA data[,3:ncol(data)] <- log2(data[,3:ncol(data)]) #data[is.na(data[,3:ncol(data)])] <- 0 temp <- colnames(data) temp <- gsub("shRNA.", "", temp) temp <- gsub("H1.", "", temp) colnames(data) <- temp # average expression value over entire dataset. note this is with the NA's (which includes <= 10 values that were removed) removed already. mean(as.matrix(data[,3:ncol(data)]), na.rm=TRUE) ``` Now we boxplot to look at things. Note that this shows the median, max, min for each sample. ```{r, fig.height=7} temp <- par()$mai par(mai=c(1,2,1,1)) # bottom, left, top, right boxplot(data[,3:ncol(data)], main="GSE69618, <=10 removed, log2", xlab="log2", las=2, ylab="", horizontal=TRUE, varwidth=TRUE, col=c(2,3)) par(mai=temp) ``` Heh data, how many NA's - both per sample (column) and per gene (row)? ```{r} percentNA <- function(x) { return(sum(is.na(x))/length(x)*100) } na_by_col <- apply(data[3:ncol(data)], 2, percentNA) # apply per column na_by_row <- apply(data[3:ncol(data)], 1, percentNA) # apply per row # could have done this for na_by_col... #na_by_col <- apply(data[3:ncol(data)], 2, function(x) { sum(is.na(x))/length(x)*100}) plot(na_by_col, main="Percent NA, per sample", ylab="%NA") # Heh genes, how many NA's? How many genes are at least 50% NAs, 20%, 10%, >0? hist(na_by_row, main="Percent NA, per gene", xlab="%NA") ``` Top 10 highest expressed transcript. Bottom 10. Note - sum the rows, then ... ```{r} stats <- data.frame(RefSeqID=data$RefSeqID) stats$total <- apply(data[,3:ncol(data)], 1, sum, na.rm = TRUE) stats$avg = stats$total/(ncol(data)-2) dec_total <- order(stats$total, decreasing=TRUE, na.last=TRUE) z <- stats[head(dec_total, n=10),] print(z) zeros <- sum(stats$total == 0) print(paste("Number of genes with no expression in any sample (or <= 10):",zeros)) hist(stats$avg) ``` WT samples at days 0, 2, 6, and 10. Create a data frame that has the two replicates for each of those averaged, and still has the RefSeqID column ```{r} WTcolumns <- grep("WT", colnames(data)) dataWT <- data[,WTcolumns] rownames(dataWT) <- data$RefSeqID dataWTa <- data.frame(ID=data$RefSeqID, day0 = rowMeans(dataWT[,1:2]), day2 = rowMeans(dataWT[,3:4]), day6 = rowMeans(dataWT[,5:6]), day10 = rowMeans(dataWT[,7:10])) View(dataWT) ``` Let's focus on days 0 and 2. Create a new column (or a new data frame) that has day2 - day0. Give a boxplot of this (so we can see the range on this over all genes). Do a similar thing for day6-day2, day10-day6. ```{r} dataWTa$day2_0 <- dataWTa$day2 - dataWTa$day0 dataWTa$day6_2 <- dataWTa$day6 - dataWTa$day2 dataWTa$day10_6 <- dataWTa$day10 - dataWTa$day6 boxplot(dataWTa[,2:5]) boxplot(dataWTa[,6:ncol(dataWTa)]) #View(dataWTa) ``` Pick out the top 10 genes that have the highest difference for each of these comparisons. Note - you can do this by using View and clicking on the column header to sort; you can/should also do it in your script automatically. And let's compare the same set of samples that the summer group did, and let's come up with a gene list. ```{r} top_n <- 100 dec_total <- order(dataWTa$day6_2, decreasing=TRUE, na.last=TRUE) z <- dataWTa[head(dec_total, n=top_n),] summary(z$day6_2) other_analysis.genes <- read.csv("busser_human up at day 6.csv") View(other_analysis.genes) print(paste("Number of genes/rows in the other analysis - ", nrow(other_analysis.genes))) print(paste("Number of our significant genes in that list - ", length(unique(intersect(other_analysis.genes$row.names, z$ID))))) ``` Some annotations for the genes... ```{r} library("org.Hs.eg.db") cols <- c("REFSEQ","ENTREZID","GENENAME","SYMBOL", "GO") geneData <- select(org.Hs.eg.db, keys=as.character(data$RefSeqID), columns=cols, keytype="REFSEQ") #View(columns(org.Hs.eg.db)) View(geneData) library("GO.db") columns(GO.db) View(select(GO.db, keys=c("GO:0001869"), columns=c("GOID", "DEFINITION", "ONTOLOGY", "TERM"))) ```