# CSE 446 Spring 2013 # R Tutorial # TA: James McQueen # This file contains some basic information about R ################## # Getting Started# ################## # download R from: http://www.r-project.org/ # R is a scripting language, no compiling necessary # R is case sensitive # R requires no semi-colons and ignores whitespace, though you can write multiple # commands on a line and separate them by semi-colons # R has file extension .R, but you can feed in text # R has some missing data values, NA and NULL # read more here: http://www.r-bloggers.com/r-na-vs-null/ #### hash/pound signs comment out a line # if you enter an unfinished command into an R line # (e.g. you forget a parenth), R goes to the next line # and starts it with a + ################### # R environments: # ################### # 1) R-Studio (This is what I use) # 2) Tinn-R ################################## # setting your working directory # ################################## # All file path names need to use the forward slash / (as in Unix) not the windows backward slash \. # to set wd to my dropbox setwd("C:/Users/James/Dropbox/Documents/Spring 2013/CSE 446") # to find my working directory getwd() # lists everything stored in the R workspace ls() # clears the workspace rm(list=ls()) ls() ############## # Help in R: # ############## # if you eveer need help about something type: # ?something search for help file among installed packages ?mean # ??something search results on CRAN ?cluster ??cluster # view and set options for the session help(options) # learn about available options options() # view current option settings options(digits=3) # number of digits to print on output ############ # Packages # ############ # There are tons of packages containing functions that people have written for R # to download a package type: # install.packages("package.name") install.packages("robustbase") # to use a package you must load it either with: # library(package.name) # require(package.name) # require() should be used if calling inside a clause # require() returns a logical value by default # TRUE if the package is loaded FALSE if not require(robustbase) test <- require(ggplot2) test2 <- require(abcde) ##################### # R as a calculator # ##################### 2+3 2+3*4 ##################### # Variable Creation # ##################### # <- is used for creating variables and functions x <- 2 x y <- 3 y z <- 4 z # you can also use = z = 2 z x+y*z result <- x+y*z result # Everything given a name is an object and is stored in the workspace ls() #################### # creating vectors # #################### x <- 1:5 x y <- 11:15 y # use c() to make a vector containing anything z <- c(10,6,3,5,1) z # if one entry is a string, all entries become strings zz <- c("blah", 2, 5) zz #################### # indexing vectors # #################### # indices start at 1, not 0! y[2] z[4] y[2:4] z[c(5, 1, 2)] ################## # seq() function # ################## ?seq help(seq) a <- seq(10,20) a a <- seq(20,10) a b <- seq(from=17,to=30,by=2) b b <- seq(17,30,2) b b <- seq(by=2,to=30,from=17) b c <- seq(2,1,-.1) c ######################## # functions on vectors # ######################## vec <- c(10,5,1,12,4) vec vec^2 vec + vec sum(vec) mean(vec) sum(vec)/length(vec) vec2 <- c(vec,10,10,4) vec2 # Counting things in vectors: use comparison operators (==, <, >, <=, >=, !=), # boolean operators ( |, & )and sum() function vec2 == 1 sum(vec2 == 1) vec2 == 10 # trues are encoded as 1 and false as 0 # this returns the number of entries equal to 10 sum(vec2 == 10) sum(vec2 < 10) sum(vec2 <= 10) # | is or vec2 == 10 | vec2 == 1 vec2 sum(vec2 == 10 | vec2 == 1) vec2 > 1 & vec2 < 12 ########################################################### # Creating arrays (matrices) by using the matrix function # ########################################################### ?matrix x <- matrix(1:12,nrow=3) x dim(x) dim(x)[1] # number of rows dim(x)[2] # number of cols length(x) # array is a vector y <- matrix(1:12,nrow=4) y dim(y) z <- matrix(1:12,nrow=3,byrow=T) z temp <- c(5,10,12,24,42,60,63,72) w <- matrix(temp,ncol=2) w dim(w) ###################### # indexing in arrays # ###################### w[4,2] w[1,2] z[3,3] z z[1:2, 1:2] z[,1] z[1,] # Calculating row and column means colMeans(z) rowMeans(z) colSums(z)/dim(z)[1] rowSums(z)/dim(z)[2] ############# ## SUbsets ## ############# z ## subset function selecting rows by condition ?subset subset(z, z[,1] < 9) subset(z, z[,1] < 9 & z[,1] > 1) ## subset function selecting columns subset(z, select = 1) subset(z, select = c(1, 2)) ## Alternative method z[z[,1] < 9,] z[, z[1,] < 3] z[, c(1, 2)] z[z[,1] < 9, z[1,] < 3] # which function returns index when argument is true x <- c(1, 2, 5, 3, 6) which(x == 3, x) ## from above z[z[,1] < 9, ] ## can be done also condition <- which(z[,1] < 9) z[condition,] ## another useful which which.max(x) x[5] ########### ## Lists ## ########### ## Lists are also useful. They can store objects as well as values z <- list(1, 2, 3) z # you can unlist them as z <- unlist(z) z a = c(1, 2, 3, 5, 6) b = c(45,3, 5, 2, 1) c = c(2, 59, 381, 60) z <- list(summary(a), summary(b), summary(c)) z ## to access a particular entry you must use two sets of square brackets z[[2]] # you can access entries within the list: z[[2]][2] #################### # making functions # #################### fun1 <- function(vec){ mean <- mean(vec) sd <- sd(vec) min <- min(vec) max <- max(vec) return(c(mean, sd, min, max)) } a <- 1:4 fun1(a) ########### # clauses # ########### for(i in 1:length(a)) { cat(a[i], "\n") } ## lists and vectors can be created without ## defining length. This can be useful with loops y <- c() y z <- list() z for( i in 1:10){ y[i] <- i^2 z[[i]] <- summary(1:i) } y z if(length(a)>2){ cat("the length of vector a is greater than 2") } ###################### # Data import/export # ###################### # You can read in text files, csv files, ... # Reading in data from a text file using the read.table function. # Note I’m reading this as a local file in the same directory as my R session (.RData file). # A full path name will also work, but you need to use front slashes (as in Unix) # not back slashes (as in Windows). R can also read web address path names. ?read.table example.data <- read.table('example.txt', header=T)[sample(1:15, 5),] example.data <- read.csv('example.csv', sep = ",", header=T) fpe <- read.table("http://data.princeton.edu/wws509/datasets/effort.dat") head(example.data) tail(example.data) # rename the columns names1 <- names(example.data) names1 names2 <- c("a", "b", "c") names(example.data) <- names2 names(example.data) head(example.data) colnames(example.data) <- names1 example.data[1:10,] dim(example.data) nrow(example.data) ncol(example.data) example.data[1,1] example.data[8,3] # can call rows and columns in a data.frame by their name is.vector(example.data$x1) example.data$x1[1:10] example.data$y[3] length(example.data$x1) mean(example.data$x1) mean(example.data[,1]) sum(example.data[2,]) min(example.data$y) max(example.data$y) summary(example.data) ################ # writing data # ################ ?write.table() write.table(example.data, file="writetest1.dat") write.table(example.data, file="writetest2.txt") write.table(example.data, file="writetest3.csv") ####################### # easy visualizations # ####################### # make histograms ?hist hist(example.data$x1) hist(example.data$y) # make scatterplots ?plot plot(example.data$x1,example.data$y) ##################### # linear regression # ##################### # Let's consider fitting a model using the example data. # Say we want to fit Y = beta_0 + beta_1*x1 + beta_2 * x2 # Using standard linear regression we want to use (X^TX)^(-1)X^TY # first let's set up our design matrix (don't forget the column of 1s) intercept <- rep(1, dim(example.data)[1]) X <- as.matrix(cbind(intercept,example.data[,c(1,2)])) Y <- as.vector(example.data[,3]) ## t(X) transposes ## solve(X) calculates the inverse ## use %*% for matrix multiplication Beta <- solve(t(X)%*%X)%*%t(X)%*%Y ## Fitted values: y.hat <- X %*% Beta head(y.hat) ## Residuals resid <- Y - y.hat head(resid) ## Regression standard error sigma.hat <- sqrt(sum(resid^2)/(dim(X)[1] - dim(X)[2])) sigma.hat ## Coefficient Standard Errors se.beta <- sqrt(diag(sigma.hat^2 * solve(t(X)%*%X))) se.beta ## Coefficient t-statistics beta.tstat <- Beta/se.beta beta.tstat ## Coefficient p-values p.vals <- 2* pt(beta.tstat, df = dim(X)[1] - dim(X)[2], lower.tail = FALSE) p.vals ## R^2 value # This is how most people think of it: proportion of explained deviance R.squared1 <- 1 - sum(resid^2)/sum((Y - mean(Y))^2) R.squared1 # Can also be calculated this way R.squared2<- cor(Y, y.hat)^2 R.squared2 # Using lm() to fit a linear regression model to the data. ?lm # Note that lm expects the first parameter to be a formula. # In this example, the formula is y~x1+x2. This indicates that y is the dependent variable # and x1 and x2 are the independent variables (covariates). # By default an intercept term is used. # To eliminate the intercept term use y~-1+x1+x2. lm.results <- lm(y~x1+x2, data=example.data) # Note that “lm.results” is the name of the object that stores all the information # from the lm function. lm.results # use str() to see what names the object has lm.results$coefficients # for more info, use summary summary(lm.results) # to get predicted values from the regression predict(lm.results)[1:10] # you can also predict from new data with this function # fitted values lm.results$fitted.values[1:10] # residuals lm.results$residuals[1:10] ############################ ## Other useful functions ## ############################ ### tapply ?tapply ## Useful when calculating summaries of data split by group ## assign the 100 observations to a random group group <- as.factor(rbinom(100,5,0.6)) new.data <- as.data.frame(cbind(example.data, group)) ## say you want the mean of x1 split by group: tapply(new.data$x1, INDEX = new.data$group, FUN = mean) # or the SD tapply(new.data$x2, new.data$group, sd) ### paste concatonates things together, variables or strings cor = cor(example.data$x1, example.data$y) paste("the correlation between X1 and X2 is ", round(cor, 3), sep = "") # collapse can be useful too paste("x",1:5, sep="", collapse=" + ") # in particular: model <- lm(as.formula(paste("y ~ ",paste("x",1:2,sep="", collapse = " + "))), data = example.data) ### assign: helps with dynamic variable names ?assign for (i in 1:5) { assign(paste("z", i, sep = ""), i^2) } z3 ################## # your R session # ################## # save the workspace to the file .RData in the cwd save.image() # save specific objects to a file # if you don't specify the path, the cwd is assumed save(object list,file="myfile.RData") # load a workspace into the current session # if you don't specify the path, the cwd is assumed load("myfile.RData") ############## # quitting R # ############## q() # when you quit R, you have the option to save your workspace. if you do, # it will be saved in your working directory, a .RData extension. # you can open R next time by clicking on this file, or by loading the workspace # once R is open