# Created by Jeff Kinne, jkinne@cs.indstate.edu
# Weather data is from https://mrcc.illinois.edu/CLIMATE/welcome.jsp

# Step 1 ----
# Set the working directory so the csv files below can be imported by bare
# file name instead of by full path. Use whichever line matches your machine.
#   CS computers, csv files stored in your home directory:
#     setwd("~/")
#   Windows, something like this (depends on where you keep your files):
#     setwd("C:/Users/jkinne/Documents/cs459")
#   Macbook:
#     setwd("~/Documents")
#   Office computer (the line that is active right now):
setwd("~/public_html/cs459-bd4isu-s2019/code")

# Step 2 ----
# Import each station's csv file into a data frame.
#   header = TRUE  - the csv files start with a header row.
#   na.strings     - "M" and "T" entries are read as missing data, so the
#                    remaining values in those columns parse as numbers.
data_indy <- read.csv(
  "Indianapolis-Weather-Station-USW00093819-1950-2018.csv",
  header = TRUE,
  na.strings = c("M", "T")
)
data_champ <- read.csv(
  "Champaign-Weather-Station-USC00118740-1950-2018.csv",
  header = TRUE,
  na.strings = c("M", "T")
)

# Clean up an imported weather data frame.
#
# This is the place for fixes discovered while inspecting the data. (Data
# downloaded for Terre Haute needed too many such fixes, which is why the
# script uses the Indy airport station instead.)
#
# data: data frame with a Date column whose first four characters are the
#       year.
# Returns the data frame with a numeric Year column added, so later steps can
# compute per-year averages and totals.
processWeather <- function(data) {
  year_text <- substr(data$Date, 1, 4)
  data$Year <- as.numeric(year_text)

  # Uncomment while checking that the import looks right:
  # print(summary(data))   # summary of every column
  # View(data)             # browse the rows

  data
}

data_indy <- processWeather(data_indy)
data_champ <- processWeather(data_champ)

# Step 3 ----
# Some analysis and/or plotting
# Note, if we want the average PRCP for 1950, we could do ...
#   summary(data$PRCP[1:365])
#   mean(data$PRCP[1:365], na.rm=TRUE)

# Build a table with one row per year from daily weather records.
#
# data:     data frame of daily records with columns Year, TMAX, TMIN, MEAN,
#           PRCP and SNOW.
# min_days: a year is kept only if it has at least this many daily rows.
#           The default of 365 keeps only years with a full set of data.
#
# Returns a data frame with columns
#   year  - the year
#   count - number of daily rows for that year
#   tmax, tmin, mean - yearly averages of TMAX, TMIN, MEAN (NAs ignored)
#   prcp, snow       - yearly totals of PRCP, SNOW (NAs ignored)
# As a side effect, prints summary() of the returned data frame.
makeYearSummaries <- function(data, min_days = 365) {
  # table() and tapply() group by the sorted, non-NA values of Year. The year
  # column must be built the same way: unique() on its own keeps file order
  # and an NA entry, which would misalign rows or fail on a length mismatch.
  years <- sort(unique(data$Year))
  year_groups <- factor(data$Year, levels = years)

  # Apply fun to values within each year; as.vector() drops the 1-d array
  # wrapper so the result is a plain column.
  per_year <- function(values, fun) {
    as.vector(tapply(values, year_groups, fun, na.rm = TRUE))
  }

  yearSummaries <- data.frame(year = years)
  # Number of entries per year (hopefully 365 or 366)
  yearSummaries$count <- as.vector(table(year_groups))
  # plot(yearSummaries$year, yearSummaries$count)

  # Average max, average min, average mean, total prcp, total snow
  yearSummaries$tmax <- per_year(data$TMAX, mean)
  yearSummaries$tmin <- per_year(data$TMIN, mean)
  yearSummaries$mean <- per_year(data$MEAN, mean)
  yearSummaries$prcp <- per_year(data$PRCP, sum)
  yearSummaries$snow <- per_year(data$SNOW, sum)

  # Only keep years with enough daily rows
  yearSummaries <- yearSummaries[yearSummaries$count >= min_days, ]

  # And take a look at the summary
  print(summary(yearSummaries))

  yearSummaries
}

yearSummaries_indy <- makeYearSummaries(data_indy)
yearSummaries_champ <- makeYearSummaries(data_champ)

# Now for some plotting
# One point per year for the Indy average max, average mean and average min,
# plus the Champaign average mean for comparison.
plot(yearSummaries_indy$year, yearSummaries_indy$tmax,   # x coords, y coords
     col = "red", ylim = c(30, 80), pch = 1,
     xlab = "Year", ylab = "Temperature (F)")
points(yearSummaries_indy$year, yearSummaries_indy$mean, col = "green", pch = 2)
points(yearSummaries_indy$year, yearSummaries_indy$tmin, col = "blue", pch = 3)
points(yearSummaries_champ$year, yearSummaries_champ$mean,
       col = "orange", pch = 4)
# The legend lists every plotted series, including the Champaign points.
legend("top",
       legend = c("avg daily max temp for the year", "avg mean", "avg min",
                  "avg mean (Champaign)"),
       col = c("red", "green", "blue", "orange"),
       pch = c(1, 2, 3, 4),
       cex = .7)

# And we also compute a linear fit for the max temperature. Does it look like
# it's trending up?
fit_max <- lm(tmax ~ year, data = yearSummaries_indy)
# plot(fit_max)

# Use the fit to draw a line of what it would predict:
# intercept + slope * year. (predict(fit_max) would do the same thing.)
predicted <- coef(fit_max)[[1]] + coef(fit_max)[[2]] * yearSummaries_indy$year
# And add that to the plot
points(yearSummaries_indy$year, predicted, col = "black", type = "l")

# Same again for the average daily minimum temperature
fit_min <- lm(tmin ~ year, data = yearSummaries_indy)
# plot(fit_min)
predicted_min <- coef(fit_min)[[1]] +
  coef(fit_min)[[2]] * yearSummaries_indy$year
points(yearSummaries_indy$year, predicted_min, col = "black", type = "l")

# Last 10 years from yearSummaries. The row indices are clamped so this still
# works when fewer than 10 years are available.
n <- nrow(yearSummaries_indy)
last_rows <- seq_len(n)[seq_len(n) > n - 10]
print(yearSummaries_indy[last_rows, ])
print(tail(yearSummaries_indy, n = 10))   # another way

# First 10 years
first_rows <- seq_len(min(10, n))
print(yearSummaries_indy[first_rows, ])
print(head(yearSummaries_indy, n = 10))   # another way

# Differences between indy and champaign, out of the year summaries.
# Each summary was filtered to its own complete years, so the two tables can
# hold different years. Pair rows by year rather than by position.
shared_years <- intersect(yearSummaries_indy$year, yearSummaries_champ$year)
indy_rows <- match(shared_years, yearSummaries_indy$year)
champ_rows <- match(shared_years, yearSummaries_champ$year)
differences <- data.frame(
  year = shared_years,
  diff = as.vector(yearSummaries_indy$mean[indy_rows]) -
    as.vector(yearSummaries_champ$mean[champ_rows])
)
plot(differences$year, differences$diff)

# Daily difference in MEAN, paired by Date rather than by row position, so a
# missing or extra day in either file cannot shift the comparison. Days with
# no Champaign record get NA.
# NOTE(review): assumes both files write Date in the same text format - confirm.
champ_day <- match(data_indy$Date, data_champ$Date)
data_indy$MEAN_DIFF <- data_indy$MEAN - data_champ$MEAN[champ_day]
plot(data_indy$MEAN_DIFF)
hist(data_indy$MEAN_DIFF, breaks = 100)