
Introduction to R

Setia Pramana
May 23, 2014

What is R?

R is an open-source environment for statistical computing and visualisation, based
on the S language developed at Bell Laboratories in the 1980s. R first appeared in
1996, created by Ross Ihaka and Robert Gentleman of the University of Auckland,
New Zealand.
It is free and open source, maintained and developed by a community of developers,
and runs on Windows, Unix, and Mac OS.
R is applicable to the most complex and sophisticated problems, as well as routine
analysis, without any restrictions on access or use.

R Installation

To get R, simply download and run the R executable installer from http://cran.r-project.org/bin/windows/base/.

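Once R is installed, add-on packages can be installed from CRAN and loaded from
within R itself; a minimal sketch (the package chosen here is just an example,
used later in this document for reading SPSS and Stata files):

> install.packages("foreign")   # download and install from CRAN
> library(foreign)              # load the installed package
> R.version.string              # check which version of R is running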

Data Structure and Simple Manipulations


> ### Short intro to R ###
> # scalar data objects
> x <- 5
> y <- 2
> z <- x + y
> z
[1] 7

> ## vector ##
> vc1 <- c(2,5,5,3,3,6,2,3,5,6)
> # we can also create an object using the assign function #
> assign("vc2", seq(from=1, to=100, by=10))
> vc1
 [1] 2 5 5 3 3 6 2 3 5 6
> vc2
 [1]  1 11 21 31 41 51 61 71 81 91
> length(vc2)
[1] 10
> ## vector multiplication ##
> vc1*vc2
 [1]   2  55 105  93 123 306 122 213 405 546
> vc1 %*% vc2
     [,1]
[1,] 1970
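A note on the two operators above: * multiplies elementwise, while %*% is matrix
multiplication, so on two vectors it returns the inner product as a 1x1 matrix.
A small check (a sketch; sum() and crossprod() are base R functions):

> sum(vc1 * vc2)       # the same inner product, returned as a plain scalar
[1] 1970
> crossprod(vc1, vc2)  # equivalent to t(vc1) %*% vc2
     [,1]
[1,] 1970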

> ## Factor type for categorical data ##
> type <- rep(c("High","Medium","Low"), times=10)
> type <- factor(type)
> type
 [1] High   Medium Low    High   Medium Low    High   Medium Low    High
[11] Medium Low    High   Medium Low    High   Medium Low    High   Medium
[21] Low    High   Medium Low    High   Medium Low    High   Medium Low
Levels: High Low Medium
> grade <- rep(c("Grade1","Grade2","Grade3","Grade4"), each=5)
> grade <- factor(grade)
> grade
 [1] Grade1 Grade1 Grade1 Grade1 Grade1 Grade2 Grade2 Grade2 Grade2 Grade2
[11] Grade3 Grade3 Grade3 Grade3 Grade3 Grade4 Grade4 Grade4 Grade4 Grade4
Levels: Grade1 Grade2 Grade3 Grade4
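Two quick checks worth running on the factors just created (a sketch): levels()
shows the distinct categories (sorted alphabetically by default), and table()
counts how often each occurs.

> levels(type)
[1] "High"   "Low"    "Medium"
> table(grade)
grade
Grade1 Grade2 Grade3 Grade4
     5      5      5      5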
> ## Matrix ##
> mat <- matrix(c(2,3,1,5,4,5,6,7,2,3,1,5,4,5,6,7), nrow=4, ncol=4)
> mat
     [,1] [,2] [,3] [,4]
[1,]    2    4    2    4
[2,]    3    5    3    5
[3,]    1    6    1    6
[4,]    5    7    5    7

> mat2 <- matrix(c(vc2,vc1), nrow=4, ncol=5)
> mat2
     [,1] [,2] [,3] [,4] [,5]
[1,]    1   41   81    5    2
[2,]   11   51   91    3    3
[3,]   21   61    2    3    5
[4,]   31   71    5    6    6
> dim(mat2)
[1] 4 5
> ## column binding ##
> cbind(mat2, mat)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,]    1   41   81    5    2    2    4    2    4
[2,]   11   51   91    3    3    3    5    3    5
[3,]   21   61    2    3    5    1    6    1    6
[4,]   31   71    5    6    6    5    7    5    7

> ## row binding ##
> rbind(mat2, vc1[1:5])
     [,1] [,2] [,3] [,4] [,5]
[1,]    1   41   81    5    2
[2,]   11   51   91    3    3
[3,]   21   61    2    3    5
[4,]   31   71    5    6    6
[5,]    2    5    5    3    3

> ## Transpose ##
> t(mat2)
     [,1] [,2] [,3] [,4]
[1,]    1   11   21   31
[2,]   41   51   61   71
[3,]   81   91    2    5
[4,]    5    3    3    6
[5,]    2    3    5    6

> # diagonal matrix #
> diag(c(1,4,7,9))
     [,1] [,2] [,3] [,4]
[1,]    1    0    0    0
[2,]    0    4    0    0
[3,]    0    0    7    0
[4,]    0    0    0    9
> diag(mat)
[1] 2 5 1 7

> ## Matrix multiplication ##
> m1 <- matrix(c(6,2,4,5), 2, 2)
> m2 <- matrix(c(2,4,1,2), 2, 2)
> m1
     [,1] [,2]
[1,]    6    4
[2,]    2    5
> m2
     [,1] [,2]
[1,]    2    1
[2,]    4    2
> m1*m2
     [,1] [,2]
[1,]   12    4
[2,]    8   10
> m1%*%m2
     [,1] [,2]
[1,]   28   14
[2,]   24   12
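Again, * is elementwise while %*% is the true matrix product. As a sanity check
(a sketch), multiplying m1 by its inverse should recover the identity matrix:

> round(m1 %*% solve(m1))
     [,1] [,2]
[1,]    1    0
[2,]    0    1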

> ## List ##
> myList <- list(vc1, vc2, 5, 6, "seven", mat, mat2)
> myList
[[1]]
 [1] 2 5 5 3 3 6 2 3 5 6

[[2]]
 [1]  1 11 21 31 41 51 61 71 81 91

[[3]]
[1] 5

[[4]]
[1] 6

[[5]]
[1] "seven"

[[6]]
     [,1] [,2] [,3] [,4]
[1,]    2    4    2    4
[2,]    3    5    3    5
[3,]    1    6    1    6
[4,]    5    7    5    7

[[7]]
     [,1] [,2] [,3] [,4] [,5]
[1,]    1   41   81    5    2
[2,]   11   51   91    3    3
[3,]   21   61    2    3    5
[4,]   31   71    5    6    6

> class(myList)
[1] "list"
> ## Data Frame ##
> Data1 <- data.frame(X=c(vc1, vc2), grade, sex=rep(c("male","female"), each=10))
> head(Data1)
  X  grade  sex
1 2 Grade1 male
2 5 Grade1 male
3 5 Grade1 male
4 3 Grade1 male
5 3 Grade1 male
6 6 Grade2 male
> tail(Data1)
    X  grade    sex
15 41 Grade3 female
16 51 Grade4 female
17 61 Grade4 female
18 71 Grade4 female
19 81 Grade4 female
20 91 Grade4 female
> class(Data1)
[1] "data.frame"
> dim(Data1)
[1] 20  3

> Data1[Data1$sex=="male",]
   X  grade  sex
1  2 Grade1 male
2  5 Grade1 male
3  5 Grade1 male
4  3 Grade1 male
5  3 Grade1 male
6  6 Grade2 male
7  2 Grade2 male
8  3 Grade2 male
9  5 Grade2 male
10 6 Grade2 male
> Data1[Data1$X > 80,]
    X  grade    sex
19 81 Grade4 female
20 91 Grade4 female
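The same row filters can be written with subset(), which is often more readable
(a sketch; subset() is base R):

> subset(Data1, sex == "male" & X > 4)
   X  grade  sex
2  5 Grade1 male
3  5 Grade1 male
6  6 Grade2 male
9  5 Grade2 male
10 6 Grade2 male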
Try this code:

## data extraction ##
vc2[3:10]
vc2 > 10
vc3 <- vc2[vc2>10]
vc3
mat2[2:3, 3:5]
mat2[2:3, 3:5]*3
mat2[,1]
mat2[,-1]
Data1$sex
Data1[1:4, -1]
myList[[1]]
myList[1:2]
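One subtlety in the last two lines: on a list, single brackets return a sub-list
while double brackets return the element itself. A quick check (a sketch):

> class(myList[1])     # a list of length 1
[1] "list"
> class(myList[[1]])   # the numeric vector stored in slot 1
[1] "numeric"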

Summary Statistics

> summary(vc2)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
    1.0    23.5    46.0    46.0    68.5    91.0

> summary(Data1)
       X            grade        sex
 Min.   : 1.0   Grade1:5   female:10
 1st Qu.: 3.0   Grade2:5   male  :10
 Median : 6.0   Grade3:5
 Mean   :25.0   Grade4:5
 3rd Qu.:43.5
 Max.   :91.0

> mean(vc2)
[1] 46
> var(vc1)
[1] 2.444444
> length(vc2)
[1] 10
> tapply(Data1$X, Data1$grade, mean)
Grade1 Grade2 Grade3 Grade4
   3.6    4.4   21.0   71.0
> table(Data1$sex)

female   male
    10     10

> table(Data1$sex, Data1$grade)

         Grade1 Grade2 Grade3 Grade4
  female      0      0      5      5
  male        5      5      0      0

> ## apply: summaries over rows or columns ##
> apply(mat2, 1, mean)
[1] 26.0 31.8 18.4 23.8

> apply(mat2, 2, sd)
[1] 12.909944 12.909944 47.821718  1.500000  1.825742
> apply(mat2, 2, function(x) length(x[x > 10]))
[1] 3 4 2 0 0
> apply(mat2, 2, function(x) mean(x[x > 10]))
[1]  21  56  86 NaN NaN
> rowMeans(mat2)
[1] 26.0 31.8 18.4 23.8
> colMeans(mat2)
[1] 16.00 56.00 44.75  4.25  4.00
> sapply(1:10, function(x) x^2)
 [1]   1   4   9  16  25  36  49  64  81 100
> lapply(1:5, function(x) x^4)
[[1]]
[1] 1

[[2]]
[1] 16

[[3]]
[1] 81

[[4]]
[1] 256

[[5]]
[1] 625
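lapply() always returns a list; when a flat vector is wanted, sapply() simplifies
automatically, or the list can be flattened afterwards (a sketch):

> unlist(lapply(1:5, function(x) x^4))
[1]   1  16  81 256 625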
> ## Inverse of a matrix ##
> solve(mat2[,-5])
             [,1]        [,2]        [,3]         [,4]
[1,] -0.070618503  0.06081601 -0.08727651  0.072079002
[2,]  0.015109148 -0.01279106  0.05775468 -0.035072765
[3,] -0.006237006  0.01663202 -0.01455301  0.004158004
[4,]  0.191268191 -0.17671518 -0.22037422  0.205821206
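To verify the inverse (a sketch), the product with the original matrix should be
the identity, up to rounding error:

> round(solve(mat2[,-5]) %*% mat2[,-5])
     [,1] [,2] [,3] [,4]
[1,]    1    0    0    0
[2,]    0    1    0    0
[3,]    0    0    1    0
[4,]    0    0    0    1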

> ## Delete all objects ##
> #rm(list=ls())

4.1 Help and Documentation

> help(rnorm)
> ?cor
> example(cor)
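When the function name is not known in advance, the help system can also be
searched by keyword (a sketch; both functions are base R):

> help.search("normality test")   # search all installed help pages
> apropos("test")                 # list loaded objects whose names contain "test"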

5 Read Data

5.1 Input Data Sets

> data1 <- data.frame(NULL)
> edit(data1)
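Note that edit() opens a spreadsheet-like editor but returns the edited copy
rather than changing data1 in place, so assign the result to keep the changes
(a sketch):

> data1 <- edit(data1)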

5.2 Reading Built-in Data Sets

> ## list the data sets available in R ##
> # data()
> ### Working with available (built-in) data sets ###
> # load the chickwts data (available from base R)
> data(chickwts)
> ## view the first few rows of the data
> head(chickwts)
  weight      feed
1    179 horsebean
2    160 horsebean
3    136 horsebean
4    227 horsebean
5    217 horsebean
6    168 horsebean

> summary(chickwts)
     weight             feed
 Min.   :108.0   casein   :12
 1st Qu.:204.5   horsebean:10
 Median :258.0   linseed  :12
 Mean   :261.3   meatmeal :11
 3rd Qu.:323.5   soybean  :14
 Max.   :423.0   sunflower:12

> ## Set working directory ##
> setwd("C:/Users/Administrator/Documents/Intro_to_R/")
> write.csv(chickwts, file="chickenData.csv")
> write.table(chickwts, file="chickenData.txt")
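By default write.csv() also saves the row names, which come back as an extra X
column when the file is re-read (visible in the next section). To avoid that
(a sketch):

> write.csv(chickwts, file="chickenData.csv", row.names=FALSE)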

5.3 Reading Data from Files

> ## CSV file ##
> chik <- read.csv("chickenData.csv")
> head(chik)
  X weight      feed
1 1    179 horsebean
2 2    160 horsebean
3 3    136 horsebean
4 4    227 horsebean
5 5    217 horsebean
6 6    168 horsebean

> ## txt file ##
> chik2 <- read.table("chickenData.txt")
> head(chik2)
  weight      feed
1    179 horsebean
2    160 horsebean
3    136 horsebean
4    227 horsebean
5    217 horsebean
6    168 horsebean

> require(foreign)
> # SPSS files
> dat.spss <- read.spss("hsb2.sav", to.data.frame=TRUE)
> head(dat.spss)
   ID FEMALE  RACE    SES SCHTYP     PROG READ WRITE MATH SCIENCE SOCST
1  70   male white    low public  general   57    52   41      47    57
2 121 female white middle public vocation   68    59   53      63    61
3  86   male white   high public  general   44    33   54      58    31
4 141   male white   high public vocation   63    44   47      53    56
5 172   male white middle public academic   47    52   57      53    61
6 113   male white middle public academic   44    52   51      63    61

> # Stata files
> dat.dta <- read.dta("hsb2.dta")
> head(dat.dta)
   id female  race    ses schtyp     prog read write math science socst
1  70   male white    low public  general   57    52   41      47    57
2 121 female white middle public vocation   68    59   53      63    61
3  86   male white   high public  general   44    33   54      58    31
4 141   male white   high public vocation   63    44   47      53    56
5 172   male white middle public academic   47    52   57      53    61
6 113   male white middle public academic   44    52   51      63    61

> ## get data from the Internet ##
> bindata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
> head(bindata)
  admit gre  gpa rank
1     0 380 3.61    3
2     1 660 3.67    3
3     1 800 4.00    1
4     1 640 3.19    4
5     0 520 2.93    4
6     1 760 3.00    2

Graphics

> ## Histogram of generated normally distributed data ##
> hist(rnorm(100,0,1))

[Figure: "Histogram of rnorm(100, 0, 1)" -- frequencies of 100 standard normal draws]

> ## Scatter plot ##
> n <- 1000
> x1 <- matrix(rnorm(n, mean=0, sd=2), ncol=2)
> x2 <- matrix(rnorm(n, mean=4, sd=1.5), ncol=2)
> dtx <- rbind(x1, x2)
> plot(dtx)

[Figure: scatter plot of dtx[,2] against dtx[,1]]

> # Scatter plot with color showing local density
> plot(dtx, col=densCols(dtx), pch=20)

[Figure: density-colored scatter plot of dtx]

> ## Smooth scatter plot ##
> smoothScatter(dtx)

[Figure: smoothed-density scatter plot of dtx]

> ## a different color scheme:
> Lab.palette <- colorRampPalette(c("blue", "orange", "red"), space="Lab")
> smoothScatter(dtx, colramp=Lab.palette)

[Figure: smoothed scatter plot of dtx with the blue-orange-red palette]

> z <- ts(matrix(rt(200*3, df=3), 200, 3),
+         start=c(1961, 1), frequency=4)
> #View(z)
> head(z)
        Series 1    Series 2   Series 3
[1,] -0.06231692 -0.39472095 -1.0049082
[2,] -0.23154172 -1.10949072 -1.2439971
[3,] -0.10867245 -1.51186314  3.7170121
[4,]  0.63891265  0.02689035  2.8298216
[5,]  1.40172512  0.64631727 -0.4636918
[6,]  1.08954925  0.11188457 -0.1046597
> plot(z, yax.flip = TRUE, main="Plot Time Series")

[Figure: "Plot Time Series" -- Series 1, 2, and 3 against Time, 1961 onwards]

> plot(density(chik2$weight))

[Figure: "density.default(x = chik2$weight)" -- kernel density of weight; N = 71, Bandwidth = 29.96]

> stem(chik2$weight)

  The decimal point is 2 digit(s) to the right of the |

  1 | 124444
  1 | 5566777889
  2 | 00112223333444
  2 | 5556666667778
  3 | 0001222222333334444
  3 | 5678899
  4 | 02

> plot(chik2$feed, chik2$weight)

[Figure: box plots of weight for each feed level]

> barplot(table(chik2$feed), xlab="feed", ylab="Frequency")

[Figure: bar plot of feed frequencies]

7 Simple Statistical Analysis

7.1 Normality Test

> chik2 <- read.table("chickenData.txt")
> shapiro.test(chik2$weight)

        Shapiro-Wilk normality test

data:  chik2$weight
W = 0.9767, p-value = 0.2101
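With p = 0.21 there is no evidence against normality here. A visual companion to
the test is a normal QQ plot (a sketch; both functions are base R):

> qqnorm(chik2$weight)
> qqline(chik2$weight)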

7.2 t-test

> ## One-sample t-test ##
> t.test(chik2$weight, alternative='two.sided', mu=150)

        One Sample t-test

data:  chik2$weight
t = 12.0132, df = 70, p-value < 2.2e-16
alternative hypothesis: true mean is not equal to 150
95 percent confidence interval:
 242.8301 279.7896
sample estimates:
mean of x
 261.3099

> ## Two-sample t-test ##
> chik3 <- chik2[chik2$feed %in% c("casein","sunflower"),]
> t.test(weight~feed, data=chik3)

        Welch Two Sample t-test

data:  weight by feed
t = -0.2285, df = 20.502, p-value = 0.8215
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -53.94204  43.27538
sample estimates:
   mean in group casein mean in group sunflower
               323.5833                328.9167

> plot(weight~feed, data=chik3)
> ## We can also do pairwise t-tests ##
> pairwise.t.test(chik2$weight, chik2$feed, p.adj="bonferroni", paired=F)
        Pairwise comparisons using t tests with pooled SD

data:  chik2$weight and chik2$feed

          casein  horsebean linseed meatmeal soybean
horsebean 3.1e-08 -         -       -        -
linseed   0.00022 0.22833   -       -        -
meatmeal  0.68350 0.00011   0.20218 -        -
soybean   0.00998 0.00487   1.00000 1.00000  -
sunflower 1.00000 1.2e-08   9.3e-05 0.39653  0.00447

P value adjustment method: bonferroni

[Figure: box plots of weight by feed for the chik3 subset (casein and sunflower)]
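The Welch test above does not assume equal variances in the two groups. If that
assumption is reasonable, the classical pooled-variance test is a one-argument
change (a sketch):

> t.test(weight~feed, data=chik3, var.equal=TRUE)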

7.3 ANOVA and Regression

> ### Working with data ###
> # Check the available data #
> # load the chickwts data (available from base R)
> data(chickwts)
> head(chickwts)

  weight      feed
1    179 horsebean
2    160 horsebean
3    136 horsebean
4    227 horsebean
5    217 horsebean
6    168 horsebean
> summary(chickwts)
     weight             feed
 Min.   :108.0   casein   :12
 1st Qu.:204.5   horsebean:10
 Median :258.0   linseed  :12
 Mean   :261.3   meatmeal :11
 3rd Qu.:323.5   soybean  :14
 Max.   :423.0   sunflower:12

> ## Box plot ##
> plot(chickwts$feed, chickwts$weight, ylab="Berat", xlab="Pakan", ylim=c(1,1000),
+      main="BoxPlot Chicken Weights")
> ## one-way ANOVA ##
> resanova <- aov(weight~feed, data=chickwts)
> #str(resanova)
> summary(resanova)
            Df Sum Sq Mean Sq F value   Pr(>F)
feed         5 231129   46226   15.37 5.94e-10 ***
Residuals   65 195556    3009
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> TukeyHSD(resanova)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = weight ~ feed, data = chickwts)

$feed
                           diff         lwr       upr     p adj
horsebean-casein    -163.383333 -232.346876 -94.41979 0.0000000
linseed-casein      -104.833333 -170.587491 -39.07918 0.0002100
meatmeal-casein      -46.674242 -113.906207  20.55772 0.3324584
soybean-casein       -77.154762 -140.517054 -13.79247 0.0083653
sunflower-casein       5.333333  -60.420825  71.08749 0.9998902
linseed-horsebean     58.550000  -10.413543 127.51354 0.1413329
meatmeal-horsebean   116.709091   46.335105 187.08308 0.0001062
soybean-horsebean     86.228571   19.541684 152.91546 0.0042167
sunflower-horsebean  168.716667   99.753124 237.68021 0.0000000
meatmeal-linseed      58.159091   -9.072873 125.39106 0.1276965
soybean-linseed       27.678571  -35.683721  91.04086 0.7932853
sunflower-linseed    110.166667   44.412509 175.92082 0.0000884
soybean-meatmeal     -30.480519  -95.375109  34.41407 0.7391356
sunflower-meatmeal    52.007576  -15.224388 119.23954 0.2206962
sunflower-soybean     82.488095   19.125803 145.85039 0.0038845
> pairwise.t.test(chickwts$weight, chickwts$feed, p.adj="bonferroni", paired=F)

        Pairwise comparisons using t tests with pooled SD

data:  chickwts$weight and chickwts$feed

          casein  horsebean linseed meatmeal soybean
horsebean 3.1e-08 -         -       -        -
linseed   0.00022 0.22833   -       -        -
meatmeal  0.68350 0.00011   0.20218 -        -
soybean   0.00998 0.00487   1.00000 1.00000  -
sunflower 1.00000 1.2e-08   9.3e-05 0.39653  0.00447

P value adjustment method: bonferroni

[Figure: "BoxPlot Chicken Weights" -- Berat (weight) against Pakan (feed)]
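Before relying on the F test, one might also check the usual ANOVA assumptions
graphically; plot() on the fitted aov object gives the standard residual
diagnostics (a sketch):

> par(mfrow=c(2,2))
> plot(resanova)    # residuals vs fitted, normal QQ, scale-location, leverage
> par(mfrow=c(1,1))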

> ## Two-way ANOVA
> hsb2 <- read.table("http://www.ats.ucla.edu/stat/data/hsb2.csv", sep=",", header=T)
> attach(hsb2)
> tapply(write, ses, mean)
       1        2        3
50.61702 51.92632 55.91379
> tapply(write, ses, sd)
       1        2        3
9.490391 9.106044 9.442874
> anova2 <- aov(write ~ ses + female)
> summary(anova2)
             Df Sum Sq Mean Sq F value   Pr(>F)
ses           1    770   769.8   9.683  0.00214 **
female        1   1449  1448.8  18.225 3.05e-05 ***
Residuals   197  15660    79.5
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
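The model above is additive. To also test whether the effect of ses differs
between the sexes, an interaction term can be added with * (a sketch):

> summary(aov(write ~ ses * female))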

7.4 Correlation and Regression Analysis

> data(state)
> head(state.x77)   # clearly not a data frame!
           Population Income Illiteracy Life Exp Murder HS Grad Frost   Area
Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708
Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432
Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417
Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945
California      21198   5114        1.1    71.71   10.3    62.6    20 156361
Colorado         2541   4884        0.7    72.06    6.8    63.9   166 103766

> # Correlation
> cor(state.x77)
             Population     Income  Illiteracy    Life Exp     Murder
Population  1.00000000  0.2082276  0.10762237 -0.06805195  0.3436428
Income      0.20822756  1.0000000 -0.43707519  0.34025534 -0.2300776
Illiteracy  0.10762237 -0.4370752  1.00000000 -0.58847793  0.7029752
Life Exp   -0.06805195  0.3402553 -0.58847793  1.00000000 -0.7808458
Murder      0.34364275 -0.2300776  0.70297520 -0.78084575  1.0000000
HS Grad    -0.09848975  0.6199323 -0.65718861  0.58221620 -0.4879710
Frost      -0.33215245  0.2262822 -0.67194697  0.26206801 -0.5388834
Area        0.02254384  0.3633154  0.07726113 -0.10733194  0.2283902
                HS Grad      Frost        Area
Population  -0.09848975 -0.3321525  0.02254384
Income       0.61993232  0.2262822  0.36331544
Illiteracy  -0.65718861 -0.6719470  0.07726113
Life Exp     0.58221620  0.2620680 -0.10733194
Murder      -0.48797102 -0.5388834  0.22839021
HS Grad      1.00000000  0.3667797  0.33354187
Frost        0.36677970  1.0000000  0.05922910
Area         0.33354187  0.0592291  1.00000000
> pairs(state.x77[,2:6])

[Figure: pairs plot of Income, Illiteracy, Life Exp, Murder, and HS Grad]
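A single correlation can also be tested for significance with cor.test() (a
sketch; base R):

> cor.test(state.x77[,"Illiteracy"], state.x77[,"Murder"])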

> colnames(state.x77) <- c("Population", "Income", "Illiteracy", "Life.Exp",
+                          "Murder", "HS.Grad", "Frost", "Area")
> ## Regression ##
> ## try this: ##
> #model1 = lm(Life.Exp ~ Population + Income + Illiteracy + Murder + HS.Grad + Frost + Area)
> ## make a data frame object
> st = as.data.frame(state.x77)
> ## Fit a linear regression model
> model1 = lm(Life.Exp ~ Population + Income + Illiteracy + Murder +
+             HS.Grad + Frost + Area, data=st)
> summary(model1)

Call:
lm(formula = Life.Exp ~ Population + Income + Illiteracy + Murder +
    HS.Grad + Frost + Area, data = st)

Residuals:
     Min       1Q   Median       3Q      Max
-1.48895 -0.51232 -0.02747  0.57002  1.49447

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)  7.094e+01  1.748e+00  40.586  < 2e-16 ***
Population   5.180e-05  2.919e-05   1.775   0.0832 .
Income      -2.180e-05  2.444e-04  -0.089   0.9293
Illiteracy   3.382e-02  3.663e-01   0.092   0.9269
Murder      -3.011e-01  4.662e-02  -6.459 8.68e-08 ***
HS.Grad      4.893e-02  2.332e-02   2.098   0.0420 *
Frost       -5.735e-03  3.143e-03  -1.825   0.0752 .
Area        -7.383e-08  1.668e-06  -0.044   0.9649
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7448 on 42 degrees of freedom
Multiple R-squared: 0.7362,    Adjusted R-squared: 0.6922
F-statistic: 16.74 on 7 and 42 DF,  p-value: 2.534e-10
> ## remove the non-significant variables
> model2 = update(model1, .~. -Population -Illiteracy -Income -Area)
> summary(model2)

Call:
lm(formula = Life.Exp ~ Murder + HS.Grad + Frost, data = st)

Residuals:
    Min      1Q  Median      3Q     Max
-1.5015 -0.5391  0.1014  0.5921  1.2268

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) 71.036379   0.983262  72.246  < 2e-16 ***
Murder      -0.283065   0.036731  -7.706 8.04e-10 ***
HS.Grad      0.049949   0.015201   3.286  0.00195 **
Frost       -0.006912   0.002447  -2.824  0.00699 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7427 on 46 degrees of freedom
Multiple R-squared: 0.7127,    Adjusted R-squared: 0.6939
F-statistic: 38.03 on 3 and 46 DF,  p-value: 1.634e-12
> ## Prediction
> predict(model2, list(Murder=10, HS.Grad=50, Frost=90))
       1
70.08111
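predict() can also return an uncertainty interval around the point prediction
(a sketch; interval is a standard argument of predict.lm):

> predict(model2, list(Murder=10, HS.Grad=50, Frost=90), interval="confidence")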

> require(car)
> outlierTest(model2)   # Bonferroni p-value for the most extreme observation

No Studentized residuals with Bonferonni p < 0.05
Largest |rstudent|:
      rstudent unadjusted p-value Bonferonni p
Maine -2.17235           0.035136           NA

> qqPlot(model2, main="QQ Plot")   # QQ plot for studentized residuals

[Figure: "QQ Plot" -- studentized residuals of model2 against t quantiles]

> leveragePlots(model2)   # leverage plots

[Figure: "Leverage Plots" -- Life.Exp | others against Murder, HS.Grad, and Frost | others]

> # Influential observations
> # Cook's D plot
> # identify D values > 4/(n-k-1)
> cutoff <- 4/((nrow(st)-length(model2$coefficients)-2))
> plot(model2, which=4, cook.levels=cutoff)

[Figure: Cook's distance by observation number for lm(Life.Exp ~ Murder + HS.Grad + Frost);
the largest values are Hawaii, Washington, and Nevada]

> # Influence plot
> influencePlot(model2, id.method="identify", main="Influence Plot",
+               sub="Circle size is proportional to Cook's distance")

[Figure: "Influence Plot" -- studentized residuals against hat-values; circle size
proportional to Cook's distance]

> # Evaluate homoscedasticity
> # non-constant error variance test
> ncvTest(model2)
Non-constant Variance Score Test
Variance formula: ~ fitted.values
Chisquare = 0.0297813    Df = 1     p = 0.8629874

> # plot studentized residuals vs. fitted values
> spreadLevelPlot(model2)

Suggested power transformation:  -1.68051

> # Test for autocorrelated errors
> durbinWatsonTest(model2)
 lag Autocorrelation D-W Statistic p-value
   1      0.08963198      1.793994   0.486
 Alternative hypothesis: rho != 0

> # Evaluate collinearity
> vif(model2)   # variance inflation factors
  Murder  HS.Grad    Frost
1.633405 1.339236 1.437903
> sqrt(vif(model2)) > 2   # problem?
 Murder HS.Grad   Frost
  FALSE   FALSE   FALSE

> # Evaluate nonlinearity
> # component + residual plot
> crPlots(model2)
> # Ceres plots
> #ceresPlots(model2)

7.5 Logistic Regression

> ## Logistic Regression ##
> ## load data ##
> bindata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
> ## view the first few rows of the data
> head(bindata)
  admit gre  gpa rank
1     0 380 3.61    3
2     1 660 3.67    3
3     1 800 4.00    1
4     1 640 3.19    4
5     0 520 2.93    4
6     1 760 3.00    2
> # obtain basic statistics
> summary(bindata)
     admit             gre             gpa             rank
 Min.   :0.0000   Min.   :220.0   Min.   :2.260   Min.   :1.000
 1st Qu.:0.0000   1st Qu.:520.0   1st Qu.:3.130   1st Qu.:2.000
 Median :0.0000   Median :580.0   Median :3.395   Median :2.000
 Mean   :0.3175   Mean   :587.7   Mean   :3.390   Mean   :2.485
 3rd Qu.:1.0000   3rd Qu.:660.0   3rd Qu.:3.670   3rd Qu.:3.000
 Max.   :1.0000   Max.   :800.0   Max.   :4.000   Max.   :4.000
> sapply(bindata, sd)
      admit         gre         gpa        rank
  0.4660867 115.5165364   0.3805668   0.9444602

> bindata$rank <- factor(bindata$rank)
> ## fit the logistic model
> modlogit <- glm(admit ~ gre + gpa + rank, data=bindata, family="binomial")
> ## result
> summary(modlogit)

Call:
glm(formula = admit ~ gre + gpa + rank, family = "binomial",
    data = bindata)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-1.6268  -0.8662  -0.6388   1.1490   2.0790

Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.989979   1.139951  -3.500 0.000465 ***
gre          0.002264   0.001094   2.070 0.038465 *
gpa          0.804038   0.331819   2.423 0.015388 *
rank2       -0.675443   0.316490  -2.134 0.032829 *
rank3       -1.340204   0.345306  -3.881 0.000104 ***
rank4       -1.551464   0.417832  -3.713 0.000205 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 499.98  on 399  degrees of freedom
Residual deviance: 458.52  on 394  degrees of freedom
AIC: 470.52

Number of Fisher Scoring iterations: 4
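The coefficients are on the log-odds scale; exponentiating them gives odds
ratios, which are usually easier to interpret (a sketch; confint.default()
returns Wald-type intervals):

> exp(coef(modlogit))                                       # odds ratios
> exp(cbind(OR=coef(modlogit), confint.default(modlogit)))  # with 95% CIs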

7.6 If-statement

> w = 3
> if( w < 5 ) {
+   d = 2
+ } else {
+   d = 10
+ }
> d
[1] 2
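For vectors, the same branching is done elementwise with ifelse() (a sketch; the
output below follows directly from the rule):

> ww <- c(3, 7, 4, 9)
> ifelse(ww < 5, 2, 10)
[1]  2 10  2 10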

7.7 For-loop

> h <- seq(from=1, to=10)
> s <- c()
> for(i in 1:10)
+ {
+   s[i] = h[i] * 10
+ }
> s
 [1]  10  20  30  40  50  60  70  80  90 100
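In R the loop is rarely necessary for this kind of computation: arithmetic
operators are vectorised, so the same result comes from a single expression
(a sketch):

> h * 10
 [1]  10  20  30  40  50  60  70  80  90 100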
