Académique Documents
Professionnel Documents
Culture Documents
In [4]: # QUESTION 1
# What type of plot would you use to display the relationship between runs and one of the other
# numerical variables?
# Plot this relationship using the variable at bats as the predictor. Does the relationship look
# linear?
# If you knew a team’s at bats, would you be comfortable using a linear model to predict the number
# of runs?
In [5]: # A scatter plot, because it presents two numerical variables simultaneously and permits the
# relationship between the variables to be examined with ease.
# We look for a linear relationship between runs scored in a season and a number of other player
# statistics.
# If the relationship looks linear, we can quantify the strength of the relationship with the
# correlation coefficient.
In [6]: dataframe1=mlb11[['runs','at_bats']]
dataframe1
Out[6]:
runs at_bats
0 855 5659
1 875 5710
2 787 5563
3 730 5672
4 762 5532
5 718 5600
6 867 5518
7 721 5447
8 735 5544
9 615 5598
10 708 5585
11 644 5436
12 654 5549
13 735 5612
14 667 5513
15 713 5579
16 654 5502
17 704 5509
18 731 5421
19 743 5559
20 619 5487
21 625 5508
22 610 5421
23 645 5452
24 707 5436
25 641 5528
26 624 5441
27 570 5486
28 593 5417
29 556 5421
In [7]: dataframe1.columns
In [9]: # Since the relationship is linear, we can quantify the strength of the relationship with the
# correlation coefficient.
dataframe1.corr(method='pearson', min_periods=1)
Out[9]:
runs at_bats
# Linear Model
In [52]: p = np.poly1d(linear_model)
# Degree-10 polynomial fit of at_bats on runs, drawn next to the straight-line fit `p`
# (built on the cell's first line) for contrast.
# NOTE(review): ten degrees on 30 points is ill-conditioned and overfits; illustration only.
p30 = np.poly1d(np.polyfit(runs, at_bats, 10))
# Evaluate both fits over the observed range of runs rather than an arbitrary 60-1000 span.
xp = np.linspace(np.min(runs), np.max(runs), 100)
# Plot the variable that was actually fitted (at_bats); `wins` was a copy-paste leftover.
plot3= plt.plot(runs, at_bats, '.', xp, p(xp), '-', xp, p30(xp), '--')
# Derive the y-range from the data so every team is visible (60-110 was the wins scale).
y_lo, y_hi = np.min(at_bats), np.max(at_bats)
y_pad = 0.05 * (y_hi - y_lo)
plt.ylim(y_lo - y_pad, y_hi + y_pad)
plt.xlabel('runs')
plt.ylabel('at_bats')
plt.show()
In [ ]: # If a team’s at bats were known, using a linear model to predict its number of runs would be
# suitable.
In [10]: # QUESTION 2
# Choose another traditional variable from mlb11.csv that you think might be a good predictor of
# runs.
# Produce a scatterplot of the two variables and fit a linear model.
# At a glance, does there seem to be a linear relationship?
# Win totals as a 30-element array.
# NOTE(review): presumably in the same team order as the other per-team arrays — confirm against mlb11.
wins = np.array([
    96, 90, 95, 71, 90, 77, 97, 96, 73, 56,
    69, 82, 71, 79, 86, 102, 79, 80, 94, 81,
    63, 72, 72, 74, 91, 89, 80, 86, 71, 67,
])
# Linear Model
# Keep only the two columns being compared, draw their scatter plot, then show the
# Pearson correlation matrix as the cell's output.
dataframe2 = mlb11.loc[:, ['runs', 'wins']]
plot2 = dataframe2.plot.scatter(
    x='runs',
    y='wins',
    c='orange',
)
dataframe2.corr(min_periods=1, method='pearson')
Out[13]:
runs wins
In [14]: p = np.poly1d(linear_model)
# Degree-10 polynomial fit of wins on runs, drawn next to the straight-line fit `p`
# (built on the cell's first line) for contrast.
# NOTE(review): ten degrees on 30 points is ill-conditioned and overfits; illustration only.
p30 = np.poly1d(np.polyfit(runs, wins, 10))
# Evaluate both fits over the observed range of runs rather than an arbitrary 60-1000 span.
xp = np.linspace(np.min(runs), np.max(runs), 100)
plot3= plt.plot(runs, wins, '.', xp, p(xp), '-', xp, p30(xp), '--')
# Derive the y-range from the data: the fixed 60-110 window hid the team with 56 wins.
y_lo, y_hi = np.min(wins), np.max(wins)
y_pad = 0.05 * (y_hi - y_lo)
plt.ylim(y_lo - y_pad, y_hi + y_pad)
plt.xlabel('runs')
plt.ylabel('wins')
plt.show()
In [15]: # Yes, the relationship between X= Runs and Y= Wins seems to be LINEAR.
In [16]: # QUESTION 3
# Now that you can summarize the linear relationship between two variables, investigate the
# relationships between runs and each of the other five traditional variables.
# Which variable best predicts runs?
# Support your conclusion using the graphical and numerical methods we’ve discussed.
# Hit totals as a 30-element array.
# NOTE(review): presumably in the same team order as the other per-team arrays — confirm against mlb11.
hits = np.array([
    1599, 1600, 1540, 1560, 1513, 1477, 1452, 1422, 1429, 1442,
    1434, 1395, 1423, 1438, 1394, 1409, 1387, 1380, 1357, 1384,
    1357, 1358, 1325, 1330, 1324, 1345, 1319, 1327, 1284, 1263,
])
# Linear Model
dataframe3.corr(method='pearson', min_periods=1)
Out[19]:
runs hits
In [20]: p = np.poly1d(linear_model)
# Degree-10 polynomial fit of hits on runs, drawn next to the straight-line fit `p`
# (built on the cell's first line) for contrast.
# NOTE(review): ten degrees on 30 points is ill-conditioned and overfits; illustration only.
p30 = np.poly1d(np.polyfit(runs, hits, 10))
# Evaluate both fits over the observed range of runs; 100 samples keep the curve smooth
# (the previous 10 samples drew the polynomial as a coarse polyline).
xp = np.linspace(np.min(runs), np.max(runs), 100)
plot4= plt.plot(runs, hits, '.', xp, p(xp), '-', xp, p30(xp), '--')
# Derive the y-range from the data: the fixed 1100-1500 window cut off every team above 1500 hits.
y_lo, y_hi = np.min(hits), np.max(hits)
y_pad = 0.05 * (y_hi - y_lo)
plt.ylim(y_lo - y_pad, y_hi + y_pad)
plt.xlabel('runs')
plt.ylabel('hits')
plt.show()
# Batting averages as a 30-element array.
# The 27th value (0.242) had been split across two lines as "0." / "242", which made the
# literal a syntax error; it is rejoined here.
# NOTE(review): presumably in the same team order as the other per-team arrays — confirm against mlb11.
batting_average = np.array([
    0.283, 0.280, 0.277, 0.275, 0.273, 0.264, 0.263, 0.261, 0.258, 0.258,
    0.257, 0.257, 0.256, 0.256, 0.253, 0.253, 0.252, 0.25, 0.25, 0.249,
    0.247, 0.247, 0.244, 0.244, 0.244, 0.243, 0.242, 0.242, 0.237, 0.233,
])
# Runs as a 30-element array; same values, in the same order, as the `runs` column
# displayed in Out[6].
runs = np.array([
    855, 875, 787, 730, 762, 718, 867, 721, 735, 615,
    708, 644, 654, 735, 667, 713, 654, 704, 731, 743,
    619, 625, 610, 645, 707, 641, 624, 570, 593, 556,
])
# Linear Model
# Strikeout totals as a 30-element array.
# NOTE(review): presumably in the same team order as the other per-team arrays — confirm against mlb11.
strikeouts = np.array([
    930, 1108, 1143, 1006, 978, 1085, 1138, 1083, 1201, 1164,
    1120, 1087, 1202, 1250, 1086, 1024, 989, 1269, 1249, 1184,
    1048, 1244, 1308, 1094, 1193, 1260, 1323, 1122, 1320, 1280,
])
# Linear Model
dataframe4.corr(method='pearson', min_periods=1)
Out[25]:
runs strikeouts
In [26]: p = np.poly1d(linear_model)
# Degree-10 polynomial fit of strikeouts on runs, drawn next to the straight-line fit `p`
# (built on the cell's first line) for contrast.
# NOTE(review): ten degrees on 30 points is ill-conditioned and overfits; illustration only.
p30 = np.poly1d(np.polyfit(runs, strikeouts, 10))
# Evaluate both fits over the observed range of runs; 100 samples keep the curve smooth.
xp = np.linspace(np.min(runs), np.max(runs), 100)
# Plot the variable that was actually fitted (strikeouts); `hits` was a copy-paste leftover.
plot4= plt.plot(runs, strikeouts, '.', xp, p(xp), '-', xp, p30(xp), '--')
# Derive the y-range from the data: the 1100-1500 window was the hits scale and clipped
# every strikeout total below 1100.
y_lo, y_hi = np.min(strikeouts), np.max(strikeouts)
y_pad = 0.05 * (y_hi - y_lo)
plt.ylim(y_lo - y_pad, y_hi + y_pad)
plt.xlabel('runs')
plt.ylabel('strikeouts')
plt.show()
In [ ]: # No, the relationship between X= Runs and Y= Strikeouts does not seem to be LINEAR.
# Stolen-base totals as a 30-element array.
# The 22nd value (95) had been split across two lines as "9" / "5", which made the
# literal a syntax error; it is rejoined here.
# NOTE(review): presumably in the same team order as the other per-team arrays — confirm against mlb11.
stolen_bases = np.array([
    143, 102, 49, 153, 57, 130, 147, 94, 118, 118,
    81, 126, 69, 97, 135, 96, 81, 89, 133, 131,
    92, 95, 108, 117, 155, 77, 106, 85, 170, 125,
])
# Linear Model
dataframe5.corr(method='pearson', min_periods=1)
Out[29]:
runs stolen_bases
In [30]: p = np.poly1d(linear_model)
# Degree-10 polynomial fit of stolen_bases on runs, drawn next to the straight-line fit `p`
# (built on the cell's first line) for contrast.
# NOTE(review): ten degrees on 30 points is ill-conditioned and overfits; illustration only.
p30 = np.poly1d(np.polyfit(runs, stolen_bases, 10))
# Evaluate both fits over the observed range of runs; 100 samples keep the curve smooth.
xp = np.linspace(np.min(runs), np.max(runs), 100)
# Plot the variable that was actually fitted (stolen_bases); `hits` was a copy-paste leftover.
plot5= plt.plot(runs, stolen_bases, '.', xp, p(xp), '-', xp, p30(xp), '--')
# Derive the y-range from the data: the 1100-1500 window was the hits scale, so no
# stolen-base value (all below 200) could appear in it.
y_lo, y_hi = np.min(stolen_bases), np.max(stolen_bases)
y_pad = 0.05 * (y_hi - y_lo)
plt.ylim(y_lo - y_pad, y_hi + y_pad)
plt.xlabel('runs')
plt.ylabel('stolen_bases')
plt.show()
In [ ]: # Yes, the relationship between X= Runs and Y= Stolen Bases seems to be WEAKLY LINEAR.
# On-base values as a 30-element array.
# NOTE(review): presumably in the same team order as the other per-team arrays — confirm against mlb11.
new_onbase = np.array([
    0.34, 0.349, 0.34, 0.329, 0.341, 0.335, 0.343, 0.325, 0.329, 0.311,
    0.316, 0.322, 0.314, 0.326, 0.313, 0.323, 0.319, 0.317, 0.322, 0.317,
    0.306, 0.318, 0.309, 0.311, 0.322, 0.308, 0.309, 0.303, 0.305, 0.292,
])
# Linear Model
dataframe6.corr(method='pearson', min_periods=1)
Out[34]:
runs new_onbase
In [53]: p = np.poly1d(linear_model)
# Degree-10 polynomial fit of new_onbase on runs, drawn next to the straight-line fit `p`
# (built on the cell's first line) for contrast.
# NOTE(review): ten degrees on 30 points is ill-conditioned and overfits; illustration only.
p30 = np.poly1d(np.polyfit(runs, new_onbase, 10))
# Evaluate both fits over the observed range of runs rather than an arbitrary 60-1000 span.
xp = np.linspace(np.min(runs), np.max(runs), 100)
# Plot the variable that was actually fitted (new_onbase); `wins` was a copy-paste leftover.
plot3= plt.plot(runs, new_onbase, '.', xp, p(xp), '-', xp, p30(xp), '--')
# Derive the y-range from the data: the 60-110 window was the wins scale, so no on-base
# value (all below 1) could appear in it.
y_lo, y_hi = np.min(new_onbase), np.max(new_onbase)
y_pad = 0.05 * (y_hi - y_lo)
plt.ylim(y_lo - y_pad, y_hi + y_pad)
plt.xlabel('runs')
plt.ylabel('new_onbase')
plt.show()