# Created by Jeff Kinne, jkinne@cs.indstate.edu # Weather data is from https://mrcc.illinois.edu/CLIMATE/welcome.jsp # Step 1... # First, get your working directory set properly so you'll be able to import # files without having to put the full path into your commands. # On your CS computers, uncomment the following if you store your csv files in your home directory #setwd("~/") # On a windows computer, something like the following (depends where you keep your files) #setwd("C:/Users/jkinne/Documents/cs459") # On my macbook, I am using this - setwd("~/Documents") # When I am working on my office computer, I am using this - #setwd("~/public_html/cs459-bd4isu-s2019/code") # Step 2... # Import the csv file into a data frame. # header=TRUE because there is a header in the csv file. na.strings so the numbers will properly be read as numbers # (and M or T as missing data) # Try using the following file as well - Champaign-Weather-Station-USC00118740-1950-2018.csv data_indy <- read.csv("Indianapolis-Weather-Station-USW00093819-1950-2018.csv", header = TRUE, na.strings=c("M", "T")) data_champ <- read.csv("Champaign-Weather-Station-USC00118740-1950-2018.csv", header = TRUE, na.strings=c("M", "T")) processWeather <- function(data) { # Fix a few things with the data. This is a process of looking at it to make sure # things are right, and fixing anything that is wrong. Note - after working through issues # with weather data downloaded for Terre Haute, I decided it wasn't good enough so I switched # to data from the Indy airport. # Add a "Year" column so we can do averages and such based on that data$Year <- as.numeric(substr(data$Date, 1, 4)) data$Month <- as.numeric(substr(data$Date, 6,7)) data$Day <- as.numeric(substr(data$Date, 9,10)) data$Day365 <- as.numeric(format.Date(data$Date, "%j")) # Summary of all of the columns. Check out the output #print(summary(data)) # Check to make sure things look right... #View(data) return(data) } data_indy <- processWeather(data_indy) data_champ <- processWeather(data_champ) # Step 3... # Some analysis and/or plotting # Note, if we want the average PRCP for 1950, we could do ... # summary(data$PRCP[1:365]) # mean(data$PRCP[1:365], na.rm=TRUE) makeYearSummaries <- function(data) { # Data frame that will have one row for each year of our data # Note - unique does what it sounds like yearSummaries <- data.frame(year=unique(data$Year)) # Add a column with a count of how many entries there are for each year (hopefully 365 or 366) yearSummaries$count <- table(as.factor(data$Year)) #plot(yearSummaries$year, yearSummaries$count) # Add some other yearly data - average max, average min, average mean, total prcp, total snow yearSummaries$tmax <- tapply(data$TMAX, data$Year, mean, na.rm=TRUE) yearSummaries$tmin <- tapply(data$TMIN, data$Year, mean, na.rm=TRUE) yearSummaries$mean <- tapply(data$MEAN, data$Year, mean, na.rm=TRUE) yearSummaries$prcp <- tapply(data$PRCP, data$Year, sum, na.rm=TRUE) yearSummaries$snow <- tapply(data$SNOW, data$Year, sum, na.rm=TRUE) # last day of measurable snowfall yearSummaries$lastSnow <- as.numeric( lapply(unique(data$Year), function(y) { max(data[data$SNOW > 0 & data$Year == y & data$Day365 < 180,"Day365"],na.rm=TRUE) }) ) # first day of measurable snowfall yearSummaries$firstSnow <- as.numeric( lapply(unique(data$Year), function(y) { min(data[data$SNOW > 0 & data$Year == y & data$Day365 > 180,"Day365"],na.rm=TRUE) }) ) # Last, only keep years with a full set of data yearSummaries <- yearSummaries[yearSummaries$count >= 365,] # And take a look at the summary print(summary(yearSummaries)) return(yearSummaries) } yearSummaries_indy <- makeYearSummaries(data_indy) yearSummaries_champ <- makeYearSummaries(data_champ) # convert day-of-year number to approximate month-day monthDay <- function(x) { x <- as.integer(x) substring(as.Date(x, origin="2001-01-01"),6) } # average first/last days of snowfall in indy first <- max(yearSummaries_indy$firstSnow) last <- max(yearSummaries_indy$lastSnow) print(paste("Average first/last day of snowfall in indy, roughly (mm-dd)", monthDay(first),"and",monthDay(last))) # Now for some plotting # Plot of all of the years, with a point for the average max, average min, average mean plot(yearSummaries_indy$year, yearSummaries_indy$tmax, # x coords, y coords col="red", ylim=c(30,80), pch=1, xlab="Year", ylab="Temperature (F)") points(yearSummaries_indy$year, yearSummaries_indy$mean, col="green", pch=2) points(yearSummaries_indy$year, yearSummaries_indy$tmin, col="blue", pch=3) points(yearSummaries_champ$year, yearSummaries_champ$mean, col="orange", pch=4) legend("top", legend=c("avg daily max temp for the year", "avg mean", "avg min", "avg mean Champ."), col=c("red","green","blue", "orange"), pch=c(1,2,3,4), cex=.7) # And we also compute a linear fit for the max temperature. Does it look like it's trending up? fit_max <- lm(tmax ~ year, data=yearSummaries_indy) #plot(fit_max) # Use the fit to draw a line of what it would predict predicted <- fit_max$coefficients[1] + fit_max$coefficients[2] * yearSummaries_indy$year # would do the same thing, # predicted <- predict(fit_max) # And add that to the plot points(yearSummaries_indy$year, predicted, col="black", type="l") fit_min <- lm(tmin ~ year, data=yearSummaries_indy) #plot(fit_max) # Use the fit to draw a line of what it would predict predicted_min <- fit_min$coefficients[1] + fit_min$coefficients[2] * yearSummaries_indy$year # And add that to the plot points(yearSummaries_indy$year, predicted_min, col="black", type="l") # save the figure dev.copy(pdf, "yearly_temperatures.pdf"); dev.off() # last 10 years from yearSummaries n <- nrow(yearSummaries_indy) print(yearSummaries_indy[(n-9):n,]) print(tail(yearSummaries_indy, n=10)) # another way # first 10 years print(yearSummaries_indy[1:10,]) print(head(yearSummaries_indy, n=10)) # another way # differences between indy and champaign, out of the year summaries differences <- data.frame(year=yearSummaries_indy$year, diff=(yearSummaries_indy$mean - yearSummaries_champ$mean)) plot(differences$year, differences$diff) data_indy$MEAN_DIFF <- data_indy$MEAN - data_champ$MEAN plot(data_indy$MEAN_DIFF) hist(data_indy$MEAN_DIFF, breaks=100, xlab="difference in degF", ylab="how many days", main="Daily temperature difference Indy-Champ.") points(c(0,0),c(0,2000), type="l", col="green") dev.copy(pdf, "daily_temperature_difference.pdf"); dev.off() # average daily temperatures, boxplots plot(data_indy$Day365, data_indy$MEAN) boxplot(data_indy$MEAN ~ data_indy$Day365, xlab="day of the year", ylab="degF", main="Mean daily temperature, Indy") dev.copy(pdf,"daily-temperature-indy.pdf"); dev.off() # plotting the mean plot(1:366, tapply(data_indy$MEAN, data_indy$Day365, mean), type="l", col="green", main="Average daily temperature, Indy", xlab="day of the year", ylab="degF", ylim=c(0,90)) points(1:366, tapply(data_indy$TMAX, data_indy$Day365, mean), type="l", col="red") points(1:366, tapply(data_indy$TMIN, data_indy$Day365, mean), type="l", col="blue") legend("bottom", legend=c("max", "mean", "min"), text.col=c("red","green","blue")) dev.copy(pdf,"daily-temperature-indy2.pdf"); dev.off() # record high and low plot(1:366, tapply(data_indy$TMAX, data_indy$Day365, max), type="l", col="red", main="Record daily temperature, Indy", xlab="day of the year", ylab="degF", ylim=c(-30,115)) points(1:366, tapply(data_indy$TMIN, data_indy$Day365, min), type="l", col="blue") legend("bottom", legend=c("record high", "record low"), text.col=c("red", "blue")) dev.copy(pdf,"daily-temperature-records.pdf"); dev.off() # highest temperature of the year, lowest temperature of the year plot(unique(data_indy$Year), tapply(data_indy$TMAX, data_indy$Year, max), col="red", main="Highest/lowest temp of the year, Indy", xlab="year", ylab="degF", ylim=c(-30,115), type="l") points(unique(data_indy$Year), tapply(data_indy$TMIN, data_indy$Year, min), col="blue", type="l") legend("center", legend=c("highest of the year", "lowest of the year"), text.col=c("red", "blue")) dev.copy(pdf,"annual-temperature-extremes.pdf"); dev.off() # temperature variation over time plot(unique(data_indy$Year), tapply(data_indy$TMAX-data_indy$TMIN, data_indy$Year, mean), main="Average high-low temp by year", xlab="year", ylab="degF", type="l", ylim=c(15,25)) dev.copy(pdf, "daily-temperature-variation.pdf"); dev.off() # last day of measureable snowfall