---
title: "GSE69618 Gene Fun"
author: "Jeff Kinne"
date: "February 22, 2019"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Fun with Gene Expression Data - GSE69618

Let's load some data.  It is gene expression data.  Google for GSE69618 to read about it and download it.

```{r}
setwd("~jkinne/public_html/cs459-bd4isu-s2019/code/")
#setwd("~/Documents")
data <- read.csv("GSE69618_data.csv")
```

Now we clean our data a little - remove (NA-ize) values less than 10, log2 things, clean up the column names to make them shorter for displaying on figures

```{r}
tooSmall <- data[,3:ncol(data)] <= 10

NA_count <- sum(is.na(data[,3:ncol(data)])) # how many values were NA already
small_count <- sum(tooSmall) # how many are too small and being turned into NA
total_count <- prod(dim(data[,3:ncol(data)])) # how many points total
print(paste("Percent of values that were read in as NA:  ", NA_count/total_count*100))
print(paste("Percent that were <= 10 so setting to NA: ", small_count/total_count*100))

data[,3:ncol(data)][tooSmall] <- NA
data[,3:ncol(data)] <- log2(data[,3:ncol(data)])
#data[is.na(data[,3:ncol(data)])] <- 0
temp <- colnames(data)
temp <- gsub("shRNA.", "", temp)
temp <- gsub("H1.", "", temp)
colnames(data) <- temp

# average expression value over entire dataset.  note this is with the NA's (which includes <= 10 values that were removed) removed already.
mean(as.matrix(data[,3:ncol(data)]), na.rm=TRUE)
```

Now we boxplot to look at things.  Note that this shows the median, max, min for each sample.
```{r, fig.height=7}
temp <- par()$mai
par(mai=c(1,2,1,1)) # bottom, left, top, right
boxplot(data[,3:ncol(data)], 
        main="GSE69618, <=10 removed, log2", xlab="log2", las=2, ylab="",
        horizontal=TRUE, varwidth=TRUE, col=c(2,3))
par(mai=temp)
```


Heh data, how many NA's - both per sample (column) and per gene (row)?  

```{r}
percentNA <- function(x) {
  return(sum(is.na(x))/length(x)*100)
}
na_by_col <- apply(data[3:ncol(data)], 2, percentNA) # apply per column
na_by_row <- apply(data[3:ncol(data)], 1, percentNA) # apply per row

# could have done this for na_by_col...
#na_by_col <- apply(data[3:ncol(data)], 2, function(x) { sum(is.na(x))/length(x)*100})

plot(na_by_col, main="Percent NA, per sample", ylab="%NA")

# Heh genes, how many NA's?  How many genes are at least 50% NAs, 20%, 10%, >0?
hist(na_by_row, main="Percent NA, per gene", xlab="%NA")
```


Top 10 highest expressed transcript.  Bottom 10.  Note - sum the rows, then ...
```{r}
stats <- data.frame(RefSeqID=data$RefSeqID)
stats$total <- apply(data[,3:ncol(data)], 1, sum, na.rm = TRUE)

stats$avg = stats$total/(ncol(data)-2)

dec_total <- order(stats$total, decreasing=TRUE, na.last=TRUE)

z <- stats[head(dec_total, n=10),]
print(z)

zeros <- sum(stats$total == 0)
print(paste("Number of genes with no expression in any sample (or <= 10):",zeros))
hist(stats$avg)
```

WT samples at days 0, 2, 6, and 10. Create a data frame that has the two replicates for each of those averaged, and still has the RefSeqID column
```{r}
WTcolumns <- 
 grep("WT", colnames(data))
dataWT <- data[,WTcolumns]
rownames(dataWT) <- data$RefSeqID
dataWTa <- data.frame(ID=data$RefSeqID, 
                      day0 = rowMeans(dataWT[,1:2]), 
                      day2 = rowMeans(dataWT[,3:4]),
                      day6 = rowMeans(dataWT[,5:6]),
                      day10 = rowMeans(dataWT[,7:10]))
View(dataWT)
```

Let's focus on days 0 and 2. Create a new column (or a new data frame) that has day2 - day0. Give a boxplot of this (so we can see the range on this over all genes).
Do a similar thing for day6-day2, day10-day6.
```{r}
dataWTa$day2_0 <- dataWTa$day2 - dataWTa$day0
dataWTa$day6_2 <- dataWTa$day6 - dataWTa$day2
dataWTa$day10_6 <- dataWTa$day10 - dataWTa$day6
boxplot(dataWTa[,2:5])
boxplot(dataWTa[,6:ncol(dataWTa)])
#View(dataWTa)
```

Pick out the top 10 genes that have the highest difference for each of these comparisons. Note - you can do this by using View and clicking on the column header to sort; you can/should also do it in your script automatically.  And let's compare the same set of samples that the summer group did, and let's come up with a gene list.

```{r}
top_n <- 100
dec_total <- order(dataWTa$day6_2, decreasing=TRUE, na.last=TRUE)
z <- dataWTa[head(dec_total, n=top_n),]
summary(z$day6_2)
other_analysis.genes <- read.csv("busser_human up at day 6.csv")
View(other_analysis.genes)
print(paste("Number of genes/rows in the other analysis - ", nrow(other_analysis.genes)))
print(paste("Number of our significant genes in that list - ",
            length(unique(intersect(other_analysis.genes$row.names, z$ID)))))

```


Some annotations for the genes...
```{r}
library("org.Hs.eg.db")
cols <- c("REFSEQ","ENTREZID","GENENAME","SYMBOL", "GO")
geneData <- select(org.Hs.eg.db, keys=as.character(data$RefSeqID), columns=cols, keytype="REFSEQ")
#View(columns(org.Hs.eg.db))
View(geneData)

library("GO.db")
columns(GO.db)
View(select(GO.db, keys=c("GO:0001869"), columns=c("GOID", "DEFINITION", "ONTOLOGY", "TERM")))
```