# CSE 446 Spring 2013
# R Tutorial
# TA: James McQueen

# This file contains some basic information about R

##################
# Getting Started#
##################

# download R from: http://www.r-project.org/

# R is a scripting language, no compiling necessary

# R is case sensitive

# R requires no semi-colons and ignores whitespace, though you can write multiple 
# commands on a line and separate them by semi-colons

# R has file extension .R, but you can feed in text

# R has some missing data values, NA and NULL
# read more here: http://www.r-bloggers.com/r-na-vs-null/

#### hash/pound signs comment out a line

# if you enter an unfinished command into an R line
# (e.g. you forget a parenth), R goes to the next line
# and starts it with a +

###################
# R environments: #
###################
# 1) R-Studio (This is what I use)
# 2) Tinn-R


##################################
# setting your working directory #
##################################
# All file path names need to use the forward slash / (as in Unix) not the windows backward slash \.

# to set wd to my dropbox
setwd("C:/Users/James/Dropbox/Documents/Spring 2013/CSE 446")

# to find my working directory

getwd()

# lists everything stored in the R workspace
ls()

# clears the workspace 
rm(list=ls())
ls()

##############
# Help in R: #
##############
# if you eveer need help about something type:

# ?something search for help file among installed packages
?mean

# ??something search results on CRAN
?cluster
??cluster

# view and set options for the session

help(options) # learn about available options
options() # view current option settings
options(digits=3) # number of digits to print on output 

############ 
# Packages #
############
# There are tons of packages containing functions that people have written for R
# to download a package type:
# install.packages("package.name")
install.packages("robustbase")

# to use a package you must load it either with:
# library(package.name)
# require(package.name)

# require() should be used if calling inside a clause
# require() returns a logical value by default
# TRUE if the package is loaded FALSE if not

require(robustbase)
test <- require(ggplot2)
test2 <- require(abcde)

#####################
# R as a calculator #
#####################
2+3

2+3*4

#####################
# Variable Creation #
#####################
# <- is used for creating variables and functions
x <- 2
x
y <- 3
y
z <- 4
z

# you can also use = 

z = 2
z

x+y*z

result <- x+y*z
result 

# Everything given a name is an object and is stored in the workspace
ls()

####################
# creating vectors #
####################
x <- 1:5
x

y <- 11:15
y

# use c() to make a vector containing anything
z <- c(10,6,3,5,1)
z

# if one entry is a string, all entries become strings
zz <- c("blah", 2, 5)
zz

####################
# indexing vectors #
####################
# indices start at 1, not 0!

y[2]

z[4]

y[2:4]

z[c(5, 1, 2)]

##################
# seq() function #
##################
?seq
help(seq)

a <- seq(10,20)
a

a <- seq(20,10)
a

b <- seq(from=17,to=30,by=2)
b

b <- seq(17,30,2)

b

b <- seq(by=2,to=30,from=17)
b

c <- seq(2,1,-.1)
c


########################
# functions on vectors #
########################
vec <- c(10,5,1,12,4)
vec

vec^2

vec + vec

sum(vec)

mean(vec)

sum(vec)/length(vec)

vec2 <- c(vec,10,10,4)
vec2

# Counting things in vectors: use comparison operators (==, <, >, <=, >=, !=),
# boolean operators ( |, & )and sum() function

vec2 == 1

sum(vec2 == 1)

vec2 == 10

# trues are encoded as 1 and false as 0 

# this returns the number of entries equal to 10
sum(vec2 == 10)

sum(vec2 < 10)

sum(vec2 <= 10)

# | is or 

vec2 == 10 | vec2 == 1

vec2

sum(vec2 == 10 | vec2 == 1)

vec2 > 1 & vec2 < 12


###########################################################
# Creating arrays (matrices) by using the matrix function #
###########################################################
?matrix
x <- matrix(1:12,nrow=3)
x

dim(x)
dim(x)[1] # number of rows
dim(x)[2] # number of cols

length(x) # array is a vector

y <- matrix(1:12,nrow=4)
y
dim(y)

z <- matrix(1:12,nrow=3,byrow=T)
z

temp <- c(5,10,12,24,42,60,63,72)
w <- matrix(temp,ncol=2)
w
dim(w)

######################
# indexing in arrays #
######################
w[4,2]

w[1,2]

z[3,3]

z

z[1:2, 1:2]

z[,1]

z[1,]

# Calculating row and column means

colMeans(z)
rowMeans(z)
colSums(z)/dim(z)[1]
rowSums(z)/dim(z)[2]

#############
## SUbsets ##
#############

z

## subset function selecting rows by condition
?subset
subset(z, z[,1] < 9)
subset(z, z[,1] < 9 & z[,1] > 1)

## subset function selecting columns

subset(z, select = 1)
subset(z, select = c(1, 2))

## Alternative method

z[z[,1] < 9,]
z[, z[1,] < 3]
z[, c(1, 2)]
z[z[,1] < 9, z[1,] < 3]

# which function returns index when argument is true

x <- c(1, 2, 5, 3, 6)

which(x == 3, x)

## from above

z[z[,1] < 9, ]

## can be done also

condition <- which(z[,1] < 9)
z[condition,]

## another useful which

which.max(x)
x[5]

###########
## Lists ##
###########

## Lists are also useful. They can store objects as well as values

z <- list(1, 2, 3)
z

# you can unlist them as

z <- unlist(z)
z

a = c(1, 2, 3, 5, 6)
b = c(45,3, 5, 2, 1)
c = c(2, 59, 381, 60)

z <- list(summary(a), summary(b), summary(c))
z

## to access a particular entry you must use two sets of square brackets

z[[2]]

# you can access entries within the list:

z[[2]][2]

####################
# making functions #
####################

fun1 <- function(vec){
  mean <- mean(vec)
  sd <- sd(vec)
  min <- min(vec)
  max <- max(vec)
  
  return(c(mean, sd, min, max))
  
}

a <- 1:4

fun1(a)

###########
# clauses #
###########

for(i in 1:length(a)) {
  cat(a[i], "\n")
}

## lists and vectors can be created without
## defining length. This can be useful with loops

y <- c()
y
z <- list()
z

for( i in 1:10){
  y[i] <- i^2
  z[[i]] <- summary(1:i)
}

y
z

if(length(a)>2){
  cat("the length of vector a is greater than 2")
}

######################
# Data import/export #
######################
# You can read in text files, csv files, ...

# Reading in data from a text file using the read.table function.  
# Note I’m reading this as a local file in the same directory as my R session (.RData file).  
# A full path name will also work, but you need to use front slashes (as in Unix) 
# not back slashes (as in Windows). R can also read web address path names.

?read.table

example.data <- read.table('example.txt', header=T)[sample(1:15, 5),]
example.data <- read.csv('example.csv', sep = ",", header=T)
fpe <- read.table("http://data.princeton.edu/wws509/datasets/effort.dat")

head(example.data)
tail(example.data)

# rename the columns
names1 <- names(example.data)
names1
names2 <- c("a", "b", "c")
names(example.data) <- names2
names(example.data)
head(example.data)

colnames(example.data) <- names1

example.data[1:10,]

dim(example.data)

nrow(example.data)

ncol(example.data)

example.data[1,1]

example.data[8,3]

# can call rows and columns in a data.frame by their name
is.vector(example.data$x1)

example.data$x1[1:10]

example.data$y[3]

length(example.data$x1)

mean(example.data$x1)

mean(example.data[,1])

sum(example.data[2,])

min(example.data$y)

max(example.data$y)

summary(example.data)

################
# writing data #
################
?write.table()
write.table(example.data, file="writetest1.dat")
write.table(example.data, file="writetest2.txt")
write.table(example.data, file="writetest3.csv")


#######################
# easy visualizations #
#######################
# make histograms

?hist
hist(example.data$x1)
hist(example.data$y)

# make scatterplots
?plot
plot(example.data$x1,example.data$y)


#####################
# linear regression #
#####################

# Let's consider fitting a model using the example data.
# Say we want to fit Y = beta_0 + beta_1*x1 + beta_2 * x2
# Using standard linear regression we want to use (X^TX)^(-1)X^TY

# first let's set up our design matrix (don't forget the column of 1s)
intercept <- rep(1, dim(example.data)[1])
X <- as.matrix(cbind(intercept,example.data[,c(1,2)]))
Y <- as.vector(example.data[,3])

## t(X) transposes
## solve(X) calculates the inverse 
## use %*% for matrix multiplication

Beta <- solve(t(X)%*%X)%*%t(X)%*%Y

## Fitted values:

y.hat <- X %*% Beta
head(y.hat)

## Residuals

resid <- Y - y.hat
head(resid)

## Regression standard error

sigma.hat <- sqrt(sum(resid^2)/(dim(X)[1] - dim(X)[2]))
sigma.hat

## Coefficient Standard Errors

se.beta <- sqrt(diag(sigma.hat^2 * solve(t(X)%*%X)))
se.beta

## Coefficient t-statistics

beta.tstat <- Beta/se.beta
beta.tstat

## Coefficient p-values

p.vals <- 2* pt(beta.tstat, df = dim(X)[1] - dim(X)[2], lower.tail = FALSE)
p.vals

## R^2 value
# This is how most people think of it: proportion of explained deviance
R.squared1 <- 1 - sum(resid^2)/sum((Y - mean(Y))^2)
R.squared1

# Can also be calculated this way
R.squared2<- cor(Y, y.hat)^2
R.squared2

# Using lm() to fit a linear regression model to the data.  
?lm
# Note that lm expects the first parameter to be a formula.
# In this example, the formula is y~x1+x2.  This indicates that y is the dependent variable
# and x1 and x2 are the independent variables (covariates).  
# By default an intercept term is used.  
# To eliminate the intercept term use y~-1+x1+x2.

lm.results <- lm(y~x1+x2, data=example.data)

# Note that “lm.results” is the name of the object that stores all the information 
# from the lm function.
lm.results

# use str() to see what names the object has
lm.results$coefficients

# for more info, use summary
summary(lm.results)

# to get predicted values from the regression
predict(lm.results)[1:10] # you can also predict from new data with this function

# fitted values
lm.results$fitted.values[1:10]

# residuals
lm.results$residuals[1:10]


############################
## Other useful functions ##
############################

### tapply
?tapply

## Useful when calculating summaries of data split by group

## assign the 100 observations to a random group
group <- as.factor(rbinom(100,5,0.6))

new.data <- as.data.frame(cbind(example.data, group))

## say you want the mean of x1 split by group:

tapply(new.data$x1, INDEX = new.data$group, FUN = mean)

# or the SD

tapply(new.data$x2, new.data$group, sd)

### paste concatonates things together, variables or strings

cor = cor(example.data$x1, example.data$y)

paste("the correlation between X1 and X2 is ", round(cor, 3), 
      sep = "")

# collapse can be useful too

paste("x",1:5, sep="", collapse=" + ")

# in particular:

model <- lm(as.formula(paste("y ~ ",paste("x",1:2,sep="", collapse = " + "))),
            data = example.data)

### assign: helps with dynamic variable names
?assign

for (i in 1:5) {
  assign(paste("z", i, sep = ""), i^2)
}

z3

##################
# your R session #
##################
# save the workspace to the file .RData in the cwd
save.image()

# save specific objects to a file
# if you don't specify the path, the cwd is assumed
save(object list,file="myfile.RData") 

# load a workspace into the current session
# if you don't specify the path, the cwd is assumed
load("myfile.RData") 


##############
# quitting R #
##############
q()

# when you quit R, you have the option to save your workspace. if you do, 
# it will be saved in your working directory, a .RData extension.
# you can open R next time by clicking on this file, or by loading the workspace 
# once R is open