
Introduction to R

Setia Pramana
May 23, 2014

What is R?

R is an open-source environment for statistical computing and visualisation, based
on the S language developed at Bell Laboratories in the 1980s. R first appeared in
1996, created by Ross Ihaka and Robert Gentleman of the University of Auckland,
New Zealand.
It is free and open source, maintained and developed by a community of developers,
and runs on Windows, Unix, and Mac OS.
R is applicable to the most complex and sophisticated problems, as well as routine
analysis, without any restrictions on access or use.

R Installation

To get R, simply download and run the R executable installer from http://cran.r-project.org/bin/windows/base/.

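Once R is installed, add-on packages can be installed from CRAN and loaded from
within R itself; a minimal sketch (the package chosen here is just an example,
used later in this document for reading SPSS and Stata files):

> install.packages("foreign")   # download and install from CRAN
> library(foreign)              # load the installed package
> R.version.string              # check which version of R is running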

Data Structure and Simple Manipulations


> ### Short intro to R ###
> # scalar data objects
> x <- 5
> y <- 2
> z <- x + y
> z
[1] 7

> ## vector ##
> vc1 <- c(2,5,5,3,3,6,2,3,5,6)
> # we can also create an object using the assign function #
> assign("vc2", seq(from=1, to=100, by=10))
> vc1
 [1] 2 5 5 3 3 6 2 3 5 6
> vc2
 [1]  1 11 21 31 41 51 61 71 81 91
> length(vc2)
[1] 10
> ## vector multiplication ##
> vc1*vc2
 [1]   2  55 105  93 123 306 122 213 405 546
> vc1 %*% vc2
     [,1]
[1,] 1970
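A note on the two operators above: * multiplies elementwise, while %*% is matrix
multiplication, so on two vectors it returns the inner product as a 1x1 matrix.
A small check (a sketch; sum() and crossprod() are base R functions):

> sum(vc1 * vc2)       # the same inner product, returned as a plain scalar
[1] 1970
> crossprod(vc1, vc2)  # equivalent to t(vc1) %*% vc2
     [,1]
[1,] 1970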

> ## Factor type for categorical data ##
> type <- rep(c("High","Medium","Low"), times=10)
> type <- factor(type)
> type
 [1] High   Medium Low    High   Medium Low    High   Medium Low    High
[11] Medium Low    High   Medium Low    High   Medium Low    High   Medium
[21] Low    High   Medium Low    High   Medium Low    High   Medium Low
Levels: High Low Medium
> grade <- rep(c("Grade1","Grade2","Grade3","Grade4"), each=5)
> grade <- factor(grade)
> grade
 [1] Grade1 Grade1 Grade1 Grade1 Grade1 Grade2 Grade2 Grade2 Grade2 Grade2
[11] Grade3 Grade3 Grade3 Grade3 Grade3 Grade4 Grade4 Grade4 Grade4 Grade4
Levels: Grade1 Grade2 Grade3 Grade4
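Two quick checks worth running on the factors just created (a sketch): levels()
shows the distinct categories (sorted alphabetically by default), and table()
counts how often each occurs.

> levels(type)
[1] "High"   "Low"    "Medium"
> table(grade)
grade
Grade1 Grade2 Grade3 Grade4
     5      5      5      5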
> ## Matrix ##
> mat <- matrix(c(2,3,1,5,4,5,6,7,2,3,1,5,4,5,6,7), nrow=4, ncol=4)
> mat
     [,1] [,2] [,3] [,4]
[1,]    2    4    2    4
[2,]    3    5    3    5
[3,]    1    6    1    6
[4,]    5    7    5    7

> mat2 <- matrix(c(vc2,vc1), nrow=4, ncol=5)
> mat2
     [,1] [,2] [,3] [,4] [,5]
[1,]    1   41   81    5    2
[2,]   11   51   91    3    3
[3,]   21   61    2    3    5
[4,]   31   71    5    6    6
> dim(mat2)
[1] 4 5
> ## column binding ##
> cbind(mat2, mat)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,]    1   41   81    5    2    2    4    2    4
[2,]   11   51   91    3    3    3    5    3    5
[3,]   21   61    2    3    5    1    6    1    6
[4,]   31   71    5    6    6    5    7    5    7

> ## row binding ##
> rbind(mat2, vc1[1:5])
     [,1] [,2] [,3] [,4] [,5]
[1,]    1   41   81    5    2
[2,]   11   51   91    3    3
[3,]   21   61    2    3    5
[4,]   31   71    5    6    6
[5,]    2    5    5    3    3

> ## Transpose ##
> t(mat2)
     [,1] [,2] [,3] [,4]
[1,]    1   11   21   31
[2,]   41   51   61   71
[3,]   81   91    2    5
[4,]    5    3    3    6
[5,]    2    3    5    6

> # diagonal matrix #
> diag(c(1,4,7,9))
     [,1] [,2] [,3] [,4]
[1,]    1    0    0    0
[2,]    0    4    0    0
[3,]    0    0    7    0
[4,]    0    0    0    9
> diag(mat)
[1] 2 5 1 7

> ## Matrix multiplication ##
> m1 <- matrix(c(6,2,4,5), 2, 2)
> m2 <- matrix(c(2,4,1,2), 2, 2)
> m1
     [,1] [,2]
[1,]    6    4
[2,]    2    5
> m2
     [,1] [,2]
[1,]    2    1
[2,]    4    2
> m1*m2
     [,1] [,2]
[1,]   12    4
[2,]    8   10
> m1%*%m2
     [,1] [,2]
[1,]   28   14
[2,]   24   12
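Again, * is elementwise while %*% is the true matrix product. As a sanity check
(a sketch), multiplying m1 by its inverse should recover the identity matrix:

> round(m1 %*% solve(m1))
     [,1] [,2]
[1,]    1    0
[2,]    0    1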

> ## List ##
> myList <- list(vc1, vc2, 5, 6, "seven", mat, mat2)
> myList
[[1]]
 [1] 2 5 5 3 3 6 2 3 5 6

[[2]]
 [1]  1 11 21 31 41 51 61 71 81 91

[[3]]
[1] 5

[[4]]
[1] 6

[[5]]
[1] "seven"

[[6]]
     [,1] [,2] [,3] [,4]
[1,]    2    4    2    4
[2,]    3    5    3    5
[3,]    1    6    1    6
[4,]    5    7    5    7

[[7]]
     [,1] [,2] [,3] [,4] [,5]
[1,]    1   41   81    5    2
[2,]   11   51   91    3    3
[3,]   21   61    2    3    5
[4,]   31   71    5    6    6

> class(myList)
[1] "list"
> ## Data Frame ##
> Data1 <- data.frame(X=c(vc1, vc2), grade, sex=rep(c("male","female"), each=10))
> head(Data1)
  X  grade  sex
1 2 Grade1 male
2 5 Grade1 male
3 5 Grade1 male
4 3 Grade1 male
5 3 Grade1 male
6 6 Grade2 male
> tail(Data1)
    X  grade    sex
15 41 Grade3 female
16 51 Grade4 female
17 61 Grade4 female
18 71 Grade4 female
19 81 Grade4 female
20 91 Grade4 female
> class(Data1)
[1] "data.frame"
> dim(Data1)
[1] 20  3

> Data1[Data1$sex=="male",]
   X  grade  sex
1  2 Grade1 male
2  5 Grade1 male
3  5 Grade1 male
4  3 Grade1 male
5  3 Grade1 male
6  6 Grade2 male
7  2 Grade2 male
8  3 Grade2 male
9  5 Grade2 male
10 6 Grade2 male
> Data1[Data1$X > 80,]
    X  grade    sex
19 81 Grade4 female
20 91 Grade4 female
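The same row filters can be written with subset(), which is often more readable
(a sketch; subset() is base R):

> subset(Data1, sex == "male" & X > 4)
   X  grade  sex
2  5 Grade1 male
3  5 Grade1 male
6  6 Grade2 male
9  5 Grade2 male
10 6 Grade2 male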
Try this code:

## data extraction ##
vc2[3:10]
vc2 > 10
vc3 <- vc2[vc2>10]
vc3
mat2[2:3, 3:5]
mat2[2:3, 3:5]*3
mat2[,1]
mat2[,-1]
Data1$sex
Data1[1:4, -1]
myList[[1]]
myList[1:2]
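One subtlety in the last two lines: on a list, single brackets return a sub-list
while double brackets return the element itself. A quick check (a sketch):

> class(myList[1])     # a list of length 1
[1] "list"
> class(myList[[1]])   # the numeric vector stored in slot 1
[1] "numeric"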

Summary Statistics

> summary(vc2)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
    1.0    23.5    46.0    46.0    68.5    91.0

> summary(Data1)
       X            grade        sex
 Min.   : 1.0   Grade1:5   female:10
 1st Qu.: 3.0   Grade2:5   male  :10
 Median : 6.0   Grade3:5
 Mean   :25.0   Grade4:5
 3rd Qu.:43.5
 Max.   :91.0

> mean(vc2)
[1] 46
> var(vc1)
[1] 2.444444
> length(vc2)
[1] 10
> tapply(Data1$X, Data1$grade, mean)
Grade1 Grade2 Grade3 Grade4
   3.6    4.4   21.0   71.0
> table(Data1$sex)

female   male
    10     10

> table(Data1$sex, Data1$grade)

         Grade1 Grade2 Grade3 Grade4
  female      0      0      5      5
  male        5      5      0      0

> ## apply: summaries over rows or columns ##
> apply(mat2, 1, mean)
[1] 26.0 31.8 18.4 23.8

> apply(mat2, 2, sd)
[1] 12.909944 12.909944 47.821718  1.500000  1.825742
> apply(mat2, 2, function(x) length(x[x > 10]))
[1] 3 4 2 0 0
> apply(mat2, 2, function(x) mean(x[x > 10]))
[1]  21  56  86 NaN NaN
> rowMeans(mat2)
[1] 26.0 31.8 18.4 23.8
> colMeans(mat2)
[1] 16.00 56.00 44.75  4.25  4.00
> sapply(1:10, function(x) x^2)
 [1]   1   4   9  16  25  36  49  64  81 100
> lapply(1:5, function(x) x^4)
[[1]]
[1] 1

[[2]]
[1] 16

[[3]]
[1] 81

[[4]]
[1] 256

[[5]]
[1] 625
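lapply() always returns a list; when a flat vector is wanted, sapply() simplifies
automatically, or the list can be flattened afterwards (a sketch):

> unlist(lapply(1:5, function(x) x^4))
[1]   1  16  81 256 625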
> ## Inverse of a matrix ##
> solve(mat2[,-5])
             [,1]        [,2]        [,3]         [,4]
[1,] -0.070618503  0.06081601 -0.08727651  0.072079002
[2,]  0.015109148 -0.01279106  0.05775468 -0.035072765
[3,] -0.006237006  0.01663202 -0.01455301  0.004158004
[4,]  0.191268191 -0.17671518 -0.22037422  0.205821206
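To verify the inverse (a sketch), the product with the original matrix should be
the identity, up to rounding error:

> round(solve(mat2[,-5]) %*% mat2[,-5])
     [,1] [,2] [,3] [,4]
[1,]    1    0    0    0
[2,]    0    1    0    0
[3,]    0    0    1    0
[4,]    0    0    0    1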

> ## Delete all objects ##
> #rm(list=ls())

4.1 Help and Documentation

> help(rnorm)
> ?cor
> example(cor)
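When the function name is not known in advance, the help system can also be
searched by keyword (a sketch; both functions are base R):

> help.search("normality test")   # search all installed help pages
> apropos("test")                 # list loaded objects whose names contain "test"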

5 Read Data

5.1 Input Data Sets

> data1 <- data.frame(NULL)
> edit(data1)
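Note that edit() opens a spreadsheet-like editor but returns the edited copy
rather than changing data1 in place, so assign the result to keep the changes
(a sketch):

> data1 <- edit(data1)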

5.2 Reading Built-in Data Sets

> ## list the data sets available in R ##
> # data()
> ### Working with available (built-in) data sets ###
> # load the chickwts data (available from base R)
> data(chickwts)
> ## view the first few rows of the data
> head(chickwts)
  weight      feed
1    179 horsebean
2    160 horsebean
3    136 horsebean
4    227 horsebean
5    217 horsebean
6    168 horsebean

> summary(chickwts)
     weight             feed
 Min.   :108.0   casein   :12
 1st Qu.:204.5   horsebean:10
 Median :258.0   linseed  :12
 Mean   :261.3   meatmeal :11
 3rd Qu.:323.5   soybean  :14
 Max.   :423.0   sunflower:12

> ## Set working directory ##
> setwd("C:/Users/Administrator/Documents/Intro_to_R/")
> write.csv(chickwts, file="chickenData.csv")
> write.table(chickwts, file="chickenData.txt")
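By default write.csv() also saves the row names, which come back as an extra X
column when the file is re-read (visible in the next section). To avoid that
(a sketch):

> write.csv(chickwts, file="chickenData.csv", row.names=FALSE)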

5.3 Reading Data from Files

> ## CSV file ##
> chik <- read.csv("chickenData.csv")
> head(chik)
  X weight      feed
1 1    179 horsebean
2 2    160 horsebean
3 3    136 horsebean
4 4    227 horsebean
5 5    217 horsebean
6 6    168 horsebean

> ## txt file ##
> chik2 <- read.table("chickenData.txt")
> head(chik2)
  weight      feed
1    179 horsebean
2    160 horsebean
3    136 horsebean
4    227 horsebean
5    217 horsebean
6    168 horsebean

> require(foreign)
> # SPSS files
> dat.spss <- read.spss("hsb2.sav", to.data.frame=TRUE)
> head(dat.spss)
   ID FEMALE  RACE    SES SCHTYP     PROG READ WRITE MATH SCIENCE SOCST
1  70   male white    low public  general   57    52   41      47    57
2 121 female white middle public vocation   68    59   53      63    61
3  86   male white   high public  general   44    33   54      58    31
4 141   male white   high public vocation   63    44   47      53    56
5 172   male white middle public academic   47    52   57      53    61
6 113   male white middle public academic   44    52   51      63    61

> # Stata files
> dat.dta <- read.dta("hsb2.dta")
> head(dat.dta)
   id female  race    ses schtyp     prog read write math science socst
1  70   male white    low public  general   57    52   41      47    57
2 121 female white middle public vocation   68    59   53      63    61
3  86   male white   high public  general   44    33   54      58    31
4 141   male white   high public vocation   63    44   47      53    56
5 172   male white middle public academic   47    52   57      53    61
6 113   male white middle public academic   44    52   51      63    61

> ## get data from the Internet ##
> bindata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
> head(bindata)
  admit gre  gpa rank
1     0 380 3.61    3
2     1 660 3.67    3
3     1 800 4.00    1
4     1 640 3.19    4
5     0 520 2.93    4
6     1 760 3.00    2

Graphics

> ## Histogram of generated normally distributed data ##
> hist(rnorm(100,0,1))

[Figure: "Histogram of rnorm(100, 0, 1)" -- frequencies of 100 standard normal draws]

> ## Scatter plot ##
> n <- 1000
> x1 <- matrix(rnorm(n, mean=0, sd=2), ncol=2)
> x2 <- matrix(rnorm(n, mean=4, sd=1.5), ncol=2)
> dtx <- rbind(x1, x2)
> plot(dtx)

[Figure: scatter plot of dtx[,2] against dtx[,1]]

> # Scatter plot with color showing local density
> plot(dtx, col=densCols(dtx), pch=20)

[Figure: density-colored scatter plot of dtx]

> ## Smooth scatter plot ##
> smoothScatter(dtx)

[Figure: smoothed-density scatter plot of dtx]

> ## a different color scheme:
> Lab.palette <- colorRampPalette(c("blue", "orange", "red"), space="Lab")
> smoothScatter(dtx, colramp=Lab.palette)

[Figure: smoothed scatter plot of dtx with the blue-orange-red palette]

> z <- ts(matrix(rt(200*3, df=3), 200, 3),
+         start=c(1961, 1), frequency=4)
> #View(z)
> head(z)
        Series 1    Series 2   Series 3
[1,] -0.06231692 -0.39472095 -1.0049082
[2,] -0.23154172 -1.10949072 -1.2439971
[3,] -0.10867245 -1.51186314  3.7170121
[4,]  0.63891265  0.02689035  2.8298216
[5,]  1.40172512  0.64631727 -0.4636918
[6,]  1.08954925  0.11188457 -0.1046597
> plot(z, yax.flip = TRUE, main="Plot Time Series")

[Figure: "Plot Time Series" -- Series 1, 2, and 3 against Time, 1961 onwards]

> plot(density(chik2$weight))

[Figure: "density.default(x = chik2$weight)" -- kernel density of weight; N = 71, Bandwidth = 29.96]

> stem(chik2$weight)

  The decimal point is 2 digit(s) to the right of the |

  1 | 124444
  1 | 5566777889
  2 | 00112223333444
  2 | 5556666667778
  3 | 0001222222333334444
  3 | 5678899
  4 | 02

> plot(chik2$feed, chik2$weight)

[Figure: box plots of weight for each feed level]

> barplot(table(chik2$feed), xlab="feed", ylab="Frequency")

[Figure: bar plot of feed frequencies]

7 Simple Statistical Analysis

7.1 Normality Test

> chik2 <- read.table("chickenData.txt")
> shapiro.test(chik2$weight)

        Shapiro-Wilk normality test

data:  chik2$weight
W = 0.9767, p-value = 0.2101
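With p = 0.21 there is no evidence against normality here. A visual companion to
the test is a normal QQ plot (a sketch; both functions are base R):

> qqnorm(chik2$weight)
> qqline(chik2$weight)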

7.2 t-test

> ## One-sample t-test ##
> t.test(chik2$weight, alternative='two.sided', mu=150)

        One Sample t-test

data:  chik2$weight
t = 12.0132, df = 70, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 150
95 percent confidence interval:
 242.8301 279.7896
sample estimates:
mean of x
 261.3099

> ## Two-sample t-test ##
> chik3 <- chik2[chik2$feed %in% c("casein","sunflower"),]
> t.test(weight~feed, data=chik3)

        Welch Two Sample t-test

data:  weight by feed
t = -0.2285, df = 20.502, p-value = 0.8215
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -53.94204  43.27538
sample estimates:
   mean in group casein mean in group sunflower
               323.5833                328.9167

> plot(weight~feed, data=chik3)
> ## We can also do pairwise t-tests ##
> pairwise.t.test(chik2$weight, chik2$feed, p.adj="bonferroni", paired=F)
        Pairwise comparisons using t tests with pooled SD

data:  chik2$weight and chik2$feed

          casein  horsebean linseed meatmeal soybean
horsebean 3.1e-08 -         -       -        -
linseed   0.00022 0.22833   -       -        -
meatmeal  0.68350 0.00011   0.20218 -        -
soybean   0.00998 0.00487   1.00000 1.00000  -
sunflower 1.00000 1.2e-08   9.3e-05 0.39653  0.00447

P value adjustment method: bonferroni

[Figure: box plots of weight by feed for the chik3 subset (casein and sunflower)]
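The Welch test above does not assume equal variances in the two groups. If that
assumption is reasonable, the classical pooled-variance test is a one-argument
change (a sketch):

> t.test(weight~feed, data=chik3, var.equal=TRUE)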

7.3 ANOVA and Regression

> ### Working with data ###
> # Check the available data #
> # load the chickwts data (available from base R)
> data(chickwts)
> head(chickwts)

  weight      feed
1    179 horsebean
2    160 horsebean
3    136 horsebean
4    227 horsebean
5    217 horsebean
6    168 horsebean
> summary(chickwts)
     weight             feed
 Min.   :108.0   casein   :12
 1st Qu.:204.5   horsebean:10
 Median :258.0   linseed  :12
 Mean   :261.3   meatmeal :11
 3rd Qu.:323.5   soybean  :14
 Max.   :423.0   sunflower:12

> ## Box plot ##
> plot(chickwts$feed, chickwts$weight, ylab="Berat", xlab="Pakan", ylim=c(1,1000),
+      main="BoxPlot Chicken Weights")
> ## one-way ANOVA ##
> resanova <- aov(weight~feed, data=chickwts)
> #str(resanova)
> summary(resanova)
            Df Sum Sq Mean Sq F value   Pr(>F)
feed         5 231129   46226   15.37 5.94e-10 ***
Residuals   65 195556    3009
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> TukeyHSD(resanova)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = weight ~ feed, data = chickwts)

$feed
                           diff         lwr       upr     p adj
horsebean-casein    -163.383333 -232.346876 -94.41979 0.0000000
linseed-casein      -104.833333 -170.587491 -39.07918 0.0002100
meatmeal-casein      -46.674242 -113.906207  20.55772 0.3324584
soybean-casein       -77.154762 -140.517054 -13.79247 0.0083653
sunflower-casein       5.333333  -60.420825  71.08749 0.9998902
linseed-horsebean     58.550000  -10.413543 127.51354 0.1413329
meatmeal-horsebean   116.709091   46.335105 187.08308 0.0001062
soybean-horsebean     86.228571   19.541684 152.91546 0.0042167
sunflower-horsebean  168.716667   99.753124 237.68021 0.0000000
meatmeal-linseed      58.159091   -9.072873 125.39106 0.1276965
soybean-linseed       27.678571  -35.683721  91.04086 0.7932853
sunflower-linseed    110.166667   44.412509 175.92082 0.0000884
soybean-meatmeal     -30.480519  -95.375109  34.41407 0.7391356
sunflower-meatmeal    52.007576  -15.224388 119.23954 0.2206962
sunflower-soybean     82.488095   19.125803 145.85039 0.0038845
> pairwise.t.test(chickwts$weight, chickwts$feed, p.adj="bonferroni", paired=F)

        Pairwise comparisons using t tests with pooled SD

data:  chickwts$weight and chickwts$feed

          casein  horsebean linseed meatmeal soybean
horsebean 3.1e-08 -         -       -        -
linseed   0.00022 0.22833   -       -        -
meatmeal  0.68350 0.00011   0.20218 -        -
soybean   0.00998 0.00487   1.00000 1.00000  -
sunflower 1.00000 1.2e-08   9.3e-05 0.39653  0.00447

P value adjustment method: bonferroni

[Figure: "BoxPlot Chicken Weights" -- Berat (weight) against Pakan (feed)]
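Before relying on the F test, one might also check the usual ANOVA assumptions
graphically; plot() on the fitted aov object gives the standard residual
diagnostics (a sketch):

> par(mfrow=c(2,2))
> plot(resanova)    # residuals vs fitted, normal QQ, scale-location, leverage
> par(mfrow=c(1,1))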

> ## Two-way ANOVA
> hsb2 <- read.table("http://www.ats.ucla.edu/stat/data/hsb2.csv", sep=",", header=T)
> attach(hsb2)
> tapply(write, ses, mean)
       1        2        3
50.61702 51.92632 55.91379
> tapply(write, ses, sd)
       1        2        3
9.490391 9.106044 9.442874
> anova2 <- aov(write ~ ses + female)
> summary(anova2)
             Df Sum Sq Mean Sq F value   Pr(>F)
ses           1    770   769.8   9.683  0.00214 **
female        1   1449  1448.8  18.225 3.05e-05 ***
Residuals   197  15660    79.5
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
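The model above is additive. To also test whether the effect of ses differs
between the sexes, an interaction term can be added with * (a sketch):

> summary(aov(write ~ ses * female))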

7.4 Correlation and Regression Analysis

> data(state)
> head(state.x77)   # clearly not a data frame!
           Population Income Illiteracy Life Exp Murder HS Grad Frost   Area
Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708
Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432
Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417
Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945
California      21198   5114        1.1    71.71   10.3    62.6    20 156361
Colorado         2541   4884        0.7    72.06    6.8    63.9   166 103766

> # Correlation
> cor(state.x77)
             Population     Income  Illiteracy    Life Exp     Murder
Population  1.00000000  0.2082276  0.10762237 -0.06805195  0.3436428
Income      0.20822756  1.0000000 -0.43707519  0.34025534 -0.2300776
Illiteracy  0.10762237 -0.4370752  1.00000000 -0.58847793  0.7029752
Life Exp   -0.06805195  0.3402553 -0.58847793  1.00000000 -0.7808458
Murder      0.34364275 -0.2300776  0.70297520 -0.78084575  1.0000000
HS Grad    -0.09848975  0.6199323 -0.65718861  0.58221620 -0.4879710
Frost      -0.33215245  0.2262822 -0.67194697  0.26206801 -0.5388834
Area        0.02254384  0.3633154  0.07726113 -0.10733194  0.2283902
                HS Grad      Frost        Area
Population  -0.09848975 -0.3321525  0.02254384
Income       0.61993232  0.2262822  0.36331544
Illiteracy  -0.65718861 -0.6719470  0.07726113
Life Exp     0.58221620  0.2620680 -0.10733194
Murder      -0.48797102 -0.5388834  0.22839021
HS Grad      1.00000000  0.3667797  0.33354187
Frost        0.36677970  1.0000000  0.05922910
Area         0.33354187  0.0592291  1.00000000
> pairs(state.x77[,2:6])

[Figure: pairs plot of Income, Illiteracy, Life Exp, Murder, and HS Grad]
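A single correlation can also be tested for significance with cor.test() (a
sketch; base R):

> cor.test(state.x77[,"Illiteracy"], state.x77[,"Murder"])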

> colnames(state.x77) <- c("Population", "Income", "Illiteracy", "Life.Exp",
+                          "Murder", "HS.Grad", "Frost", "Area")
> ## Regression ##
> ## try this: ##
> #model1 = lm(Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + Frost + Area)
> ## make a data frame object
> st = as.data.frame(state.x77)
> ## Fit a linear regression model
> model1 = lm(Life.Exp ~ Population + Income + Illiteracy + Murder +
+             HS.Grad + Frost + Area, data=st)
> summary(model1)

Call:
lm(formula = Life.Exp ~ Population + Income + Illiteracy + Murder +
    HS.Grad + Frost + Area, data = st)

Residuals:
     Min       1Q   Median       3Q      Max
-1.48895 -0.51232 -0.02747  0.57002  1.49447

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)  7.094e+01  1.748e+00  40.586  < 2e-16 ***
Population   5.180e-05  2.919e-05   1.775   0.0832 .
Income      -2.180e-05  2.444e-04  -0.089   0.9293
Illiteracy   3.382e-02  3.663e-01   0.092   0.9269
Murder      -3.011e-01  4.662e-02  -6.459 8.68e-08 ***
HS.Grad      4.893e-02  2.332e-02   2.098   0.0420 *
Frost       -5.735e-03  3.143e-03  -1.825   0.0752 .
Area        -7.383e-08  1.668e-06  -0.044   0.9649
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7448 on 42 degrees of freedom
Multiple R-squared: 0.7362,    Adjusted R-squared: 0.6922
F-statistic: 16.74 on 7 and 42 DF,  p-value: 2.534e-10
> ## remove the non-significant variables
> model2 = update(model1, .~. -Population -Illiteracy -Income -Area)
> summary(model2)

Call:
lm(formula = Life.Exp ~ Murder + HS.Grad + Frost, data = st)

Residuals:
    Min      1Q  Median      3Q     Max
-1.5015 -0.5391  0.1014  0.5921  1.2268

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 71.036379   0.983262  72.246  < 2e-16 ***
Murder      -0.283065   0.036731  -7.706 8.04e-10 ***
HS.Grad      0.049949   0.015201   3.286  0.00195 **
Frost       -0.006912   0.002447  -2.824  0.00699 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7427 on 46 degrees of freedom
Multiple R-squared: 0.7127,    Adjusted R-squared: 0.6939
F-statistic: 38.03 on 3 and 46 DF,  p-value: 1.634e-12
> ## Prediction
> predict(model2, list(Murder=10, HS.Grad=50, Frost=90))
       1
70.08111
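predict() can also return an uncertainty interval around the point prediction
(a sketch; interval is a standard argument of predict.lm):

> predict(model2, list(Murder=10, HS.Grad=50, Frost=90), interval="confidence")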

> require(car)
> outlierTest(model2)   # Bonferroni p-value for the most extreme observation

No Studentized residuals with Bonferonni p < 0.05
Largest |rstudent|:
      rstudent unadjusted p-value Bonferonni p
Maine -2.17235           0.035136           NA

> qqPlot(model2, main="QQ Plot")   # QQ plot for studentized residuals

[Figure: "QQ Plot" -- studentized residuals of model2 against t quantiles]

> leveragePlots(model2)   # leverage plots

[Figure: "Leverage Plots" -- Life.Exp | others against Murder, HS.Grad, and Frost | others]

> # Influential observations
> # Cook's D plot
> # identify D values > 4/(n-k-1)
> cutoff <- 4/((nrow(st)-length(model2$coefficients)-2))
> plot(model2, which=4, cook.levels=cutoff)

[Figure: Cook's distance by observation number for lm(Life.Exp ~ Murder + HS.Grad + Frost);
the largest values are Hawaii, Washington, and Nevada]

> # Influence plot
> influencePlot(model2, id.method="identify", main="Influence Plot",
+               sub="Circle size is proportional to Cook's distance")

[Figure: "Influence Plot" -- studentized residuals against hat-values; circle size
proportional to Cook's distance]

> # Evaluate homoscedasticity
> # non-constant error variance test
> ncvTest(model2)
Non-constant Variance Score Test
Variance formula: ~ fitted.values
Chisquare = 0.0297813    Df = 1     p = 0.8629874

> # plot studentized residuals vs. fitted values
> spreadLevelPlot(model2)

Suggested power transformation:  -1.68051

> # Test for autocorrelated errors
> durbinWatsonTest(model2)
 lag Autocorrelation D-W Statistic p-value
   1      0.08963198      1.793994   0.486
 Alternative hypothesis: rho != 0

> # Evaluate collinearity
> vif(model2)   # variance inflation factors
  Murder  HS.Grad    Frost
1.633405 1.339236 1.437903
> sqrt(vif(model2)) > 2   # problem?
 Murder HS.Grad   Frost
  FALSE   FALSE   FALSE

> # Evaluate nonlinearity
> # component + residual plot
> crPlots(model2)
> # Ceres plots
> #ceresPlots(model2)

7.5 Logistic Regression

> ## Logistic Regression ##
> ## load data ##
> bindata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
> ## view the first few rows of the data
> head(bindata)
  admit gre  gpa rank
1     0 380 3.61    3
2     1 660 3.67    3
3     1 800 4.00    1
4     1 640 3.19    4
5     0 520 2.93    4
6     1 760 3.00    2
> # obtain basic statistics
> summary(bindata)
     admit             gre             gpa             rank
 Min.   :0.0000   Min.   :220.0   Min.   :2.260   Min.   :1.000
 1st Qu.:0.0000   1st Qu.:520.0   1st Qu.:3.130   1st Qu.:2.000
 Median :0.0000   Median :580.0   Median :3.395   Median :2.000
 Mean   :0.3175   Mean   :587.7   Mean   :3.390   Mean   :2.485
 3rd Qu.:1.0000   3rd Qu.:660.0   3rd Qu.:3.670   3rd Qu.:3.000
 Max.   :1.0000   Max.   :800.0   Max.   :4.000   Max.   :4.000
> sapply(bindata, sd)
      admit         gre         gpa        rank
  0.4660867 115.5165364   0.3805668   0.9444602

> bindata$rank <- factor(bindata$rank)
> ## fit the logistic model
> modlogit <- glm(admit ~ gre + gpa + rank, data=bindata, family="binomial")
> ## result
> summary(modlogit)

Call:
glm(formula = admit ~ gre + gpa + rank, family = "binomial",
    data = bindata)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-1.6268  -0.8662  -0.6388   1.1490   2.0790

Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.989979   1.139951  -3.500 0.000465 ***
gre          0.002264   0.001094   2.070 0.038465 *
gpa          0.804038   0.331819   2.423 0.015388 *
rank2       -0.675443   0.316490  -2.134 0.032829 *
rank3       -1.340204   0.345306  -3.881 0.000104 ***
rank4       -1.551464   0.417832  -3.713 0.000205 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 499.98  on 399  degrees of freedom
Residual deviance: 458.52  on 394  degrees of freedom
AIC: 470.52

Number of Fisher Scoring iterations: 4
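The coefficients are on the log-odds scale; exponentiating them gives odds
ratios, which are usually easier to interpret (a sketch; confint.default()
returns Wald-type intervals):

> exp(coef(modlogit))                                       # odds ratios
> exp(cbind(OR=coef(modlogit), confint.default(modlogit)))  # with 95% CIs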

7.6 If-statement

> w = 3
> if( w < 5 ) {
+   d = 2
+ } else {
+   d = 10
+ }
> d
[1] 2
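For vectors, the same branching is done elementwise with ifelse() (a sketch; the
output below follows directly from the rule):

> ww <- c(3, 7, 4, 9)
> ifelse(ww < 5, 2, 10)
[1]  2 10  2 10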

7.7 For-loop

> h <- seq(from=1, to=10)
> s <- c()
> for(i in 1:10)
+ {
+   s[i] = h[i] * 10
+ }
> s
 [1]  10  20  30  40  50  60  70  80  90 100
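In R the loop is rarely necessary for this kind of computation: arithmetic
operators are vectorised, so the same result comes from a single expression
(a sketch):

> h * 10
 [1]  10  20  30  40  50  60  70  80  90 100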
