Vous êtes sur la page 1sur 1

In [2]: # DATA VISUALIZATION LAB: LINEAR REGRESSION USING NUMPY

# TANAYA YADAV - 15BCE0461

# LAB SLOT L53+L54

In [2]: import pandas as pd


import numpy as np
import matplotlib.pyplot as plt

In [3]: mlb11 = pd.read_csv('/Users/tanaya/Semester 7/Data Visualization/mlb11.csv')
# Loads the MLB 2011 table into a DataFrame; later cells select their columns from `mlb11`.
# NOTE(review): the path is absolute and specific to one machine, so this cell fails anywhere else.

In [4]: # QUESTION 1

# What type of plot would you use to display the relationship between runs and one of the other numerical variables?
# Plot this relationship using the variable at bats as the predictor. Does the relationship look linear?
# If you knew a team’s at bats, would you be comfortable using a linear model to predict the number of runs?

In [5]: # A scatter plot presents two numerical variables simultaneously, so the relationship between them can be examined with ease.
# We look for a linear relationship between runs scored in a season and a number of other player statistics.
# If the relationship looks linear, we can quantify the strength of the relationship with the correlation coefficient.

In [6]: dataframe1 = mlb11.loc[:, ['runs', 'at_bats']]
# Two-column view used for Question 1; the bare name on the last line displays it.
dataframe1

Out[6]:
runs at_bats

0 855 5659

1 875 5710

2 787 5563

3 730 5672

4 762 5532

5 718 5600

6 867 5518

7 721 5447

8 735 5544

9 615 5598

10 708 5585

11 644 5436

12 654 5549

13 735 5612

14 667 5513

15 713 5579

16 654 5502

17 704 5509

18 731 5421

19 743 5559

20 619 5487

21 625 5508

22 610 5421

23 645 5452

24 707 5436

25 641 5528

26 624 5441

27 570 5486

28 593 5417

29 556 5421

In [7]: dataframe1.keys()
# For a DataFrame, keys() returns the column Index (same object as .columns).

Out[7]: Index(['runs', 'at_bats'], dtype='object')

In [8]: # Scatter Plot (X= Runs, Y= At Bats)


# The relationship looks moderately linear but not strong enough to be able to comfortably use a lin
ear model to predict the number of runs.

plot1=dataframe1.plot.scatter(x='runs', y='at_bats', c='pink')

In [9]: # Since the relationship is linear we can quanitfy the strength of the relationship with the correla
tion coefficient.

dataframe1.corr(method='pearson', min_periods=1)

Out[9]:
runs at_bats

runs 1.000000 0.610627

at_bats 0.610627 1.000000

In [46]: at_bats = mlb11['at_bats'].values
# Read both arrays from the loaded frame so this cell does not depend on `runs`
# being defined by a cell that appears later in the notebook.
runs = mlb11['runs'].values

# Linear Model: degree-1 least-squares fit of at_bats against runs.
# np.polyfit returns the coefficients highest power first: [slope, intercept].
# NOTE(review): Question 1 asks for at_bats as the predictor of runs; confirm
# whether the polyfit arguments should be swapped.
linear_model1 = np.polyfit(runs, at_bats, 1)
linear_model1

Out[46]: array([5.91333589e-01, 5.11335102e+03])

In [48]: dataframe_atbats = mlb11.loc[:, ['runs', 'at_bats']]
# Grey scatter of at_bats (y) against runs (x).
plot_atbats = dataframe_atbats.plot(kind='scatter', x='runs', y='at_bats', c='grey')

In [52]: runs = mlb11['runs'].values
# Overlay a degree-1 fit (solid) and a degree-10 fit (dashed) on the runs/at_bats points.
# Both fits are computed here from at_bats, so the plot does not reuse the
# runs-vs-wins model or the wins data from the Question 2 cells.
at_bats = mlb11['at_bats'].values
p = np.poly1d(np.polyfit(runs, at_bats, 1))
p30 = np.poly1d(np.polyfit(runs, at_bats, 10))  # name kept from the other cells; degree is 10
# Evaluate only inside the observed runs range: a degree-10 polynomial
# diverges quickly when extrapolated beyond the data.
xp = np.linspace(runs.min(), runs.max(), 100)
plot3 = plt.plot(runs, at_bats, '.', xp, p(xp), '-', xp, p30(xp), '--')
plt.xlabel('runs')
plt.ylabel('at_bats')
plt.show()

In [ ]: # If a team’s at bats were known, using a linear model to predict the number of runs would be suitable.

# The correlation coefficient for X = RUNS, Y = AT_BATS is 0.610627.

In [10]: # QUESTION 2

# Choose another traditional variable from mlb11.csv that you think might be a good predictor of runs.
# Produce a scatterplot of the two variables and fit a linear model.
# At a glance, does there seem to be a linear relationship?

In [11]: runs = np.array([
    855, 875, 787, 730, 762, 718, 867, 721, 735, 615,
    708, 644, 654, 735, 667, 713, 654, 704, 731, 743,
    619, 625, 610, 645, 707, 641, 624, 570, 593, 556,
])

wins = np.array([
    96, 90, 95, 71, 90, 77, 97, 96, 73, 56,
    69, 82, 71, 79, 86, 102, 79, 80, 94, 81,
    63, 72, 72, 74, 91, 89, 80, 86, 71, 67,
])

# Linear Model: degree-1 least-squares fit of wins against runs.
# The coefficients come back highest power first: [slope, intercept].
linear_model = np.polyfit(runs, wins, deg=1)
linear_model

Out[11]: array([ 0.08315339, 23.29147734])

In [12]: # Taking 'Wins' as the traditional variable

dataframe2=mlb11[['runs', 'wins']]
plot2=dataframe2.plot.scatter(x='runs', y='wins', c='orange')

In [13]: # Correlation Coefficient between Runs and Wins

dataframe2.corr(method='pearson', min_periods=1)

Out[13]:
runs wins

runs 1.000000 0.600809

wins 0.600809 1.000000

In [14]: p = np.poly1d(linear_model)
# Overlay the degree-1 fit (solid) and a degree-10 fit (dashed) on the runs/wins points.
p30 = np.poly1d(np.polyfit(runs, wins, deg=10))
xp = np.linspace(start=60, stop=1000, num=100)
plot3 = plt.plot(
    runs, wins, '.',
    xp, p(xp), '-',
    xp, p30(xp), '--',
)
plt.ylim(60, 110)
plt.show()

In [15]: # Yes, the relationship between X= Runs and Y= Wins seems to be LINEAR.

In [16]: # QUESTION 3

# Now that you can summarize the linear relationship between two variables, investigate the relationships between runs and each of the other five traditional variables.
# Which variable best predicts runs?
# Support your conclusion using the graphical and numerical methods we’ve discussed.

In [17]: # Variable 1 - HITS

hits= np.array([1599,1600,1540,1560,1513,1477,1452,1422,1429,1442,1434,1395,1423,1438,1394,1409,1387
,1380,1357,
1384,1357,1358,1325,1330,1324,1345,1319,1327,1284,1263])

# Linear Model

linear_model_2= np.polyfit(runs, hits,1)


linear_model_2

Out[17]: array([ 0.84592348, 822.16747161])

In [18]: dataframe3 = mlb11.loc[:, ['runs', 'hits']]
# Blue scatter of hits (y) against runs (x).
plot3 = dataframe3.plot(kind='scatter', x='runs', y='hits', c='blue')

In [19]: # Numerical Prediction


# Correlation Coefficient between Runs and Hits

dataframe3.corr(method='pearson', min_periods=1)

Out[19]:
runs hits

runs 1.000000 0.801211

hits 0.801211 1.000000

In [20]: p = np.poly1d(linear_model_2)
# Draw the runs/hits model (linear_model_2), not the runs/wins model, together
# with a degree-10 fit of the same data for comparison.
p30 = np.poly1d(np.polyfit(runs, hits, 10))
# Sample inside the observed runs range with enough points for a smooth curve;
# a degree-10 polynomial diverges when extrapolated.
xp = np.linspace(runs.min(), runs.max(), 100)
plot4 = plt.plot(runs, hits, '.', xp, p(xp), '-', xp, p30(xp), '--')
# Limits cover the whole hits range (1263 to 1600) so no team is clipped.
plt.ylim(1250, 1650)
plt.show()

In [ ]: # Yes, the relationship between X= Runs and Y= HITS seems to be LINEAR.

In [21]: # Variable 2 - BAT_AVG

batting_average = np.array([0.283,0.280,0.277,0.275,0.273,0.264,0.263,0.261,0.258,0.258,0.257,0.257,
0.256,0.256,
0.253,0.253,0.252,0.25,0.25,0.249,0.247,0.247,0.244,0.244,0.244,0.243,0.
242,0.242,
0.237,0.233])
runs= np.array([855,875,787,730,762,718,867,721,735,615,708,644,654,735,667,713,654,704,731,743,619,
625,610,645,
707,641,624,570,593,556])
# Linear Model

linear_model_3= np.polyfit(runs, batting_average,1)


linear_model_3

Out[21]: array([1.25152321e-04, 1.68127684e-01])

In [22]: # Variable 3 - STRIKEOUTS

strikeouts = np.array([930,1108,1143,1006,978,1085,1138,1083,1201,1164,1120,1087,1202,1250,1086,1024
,989,1269,
1249,1184,1048,1244,1308,1094,1193,1260,1323,1122,1320,1280])

# Linear Model

linear_model_4= np.polyfit(runs, batting_average,1)


linear_model_4

Out[22]: array([1.25152321e-04, 1.68127684e-01])

In [24]: dataframe4 = mlb11.loc[:, ['runs', 'strikeouts']]
# Orange scatter of strikeouts (y) against runs (x).
plot4 = dataframe4.plot(kind='scatter', x='runs', y='strikeouts', c='orange')

In [25]: # Numerical Prediction


# Correlation Coefficient between Runs and Strikeouts

dataframe4.corr(method='pearson', min_periods=1)

Out[25]:
runs strikeouts

runs 1.000000 -0.411531

strikeouts -0.411531 1.000000

In [26]: p = np.poly1d(np.polyfit(runs, strikeouts, 1))
# Fit and plot strikeouts in this cell: the points, the degree-1 line (solid)
# and the degree-10 curve (dashed) all use the strikeouts data.
p30 = np.poly1d(np.polyfit(runs, strikeouts, 10))
# Sample inside the observed runs range; a degree-10 polynomial diverges when extrapolated.
xp = np.linspace(runs.min(), runs.max(), 100)
plot4 = plt.plot(runs, strikeouts, '.', xp, p(xp), '-', xp, p30(xp), '--')
# Limits cover the whole strikeouts range (930 to 1323).
plt.ylim(900, 1350)
plt.show()

In [ ]: # No, the relationship between X= Runs and Y= Strikeouts does not seem to be LINEAR.

In [27]: # Variable 4 - STOLEN BASES

stolen_bases = np.array([143,102,49,153,57,130,147,94,118,118,81,126,69,97,135,96,81,89,133,131,92,9
5,108,117,
155,77,106,85,170,125])

# Linear Model

linear_model_5= np.polyfit(runs, stolen_bases,1)


linear_model_5

Out[27]: array([1.95487456e-02, 9.57409900e+01])

In [28]: dataframe5 = mlb11.loc[:, ['runs', 'stolen_bases']]
# Magenta scatter of stolen_bases (y) against runs (x).
plot5 = dataframe5.plot(kind='scatter', x='runs', y='stolen_bases', c='magenta')

In [29]: # Numerical Prediction


# Correlation Coefficient between Runs and Stolen Bases

dataframe5.corr(method='pearson', min_periods=1)

Out[29]:
runs stolen_bases

runs 1.000000 0.053981

stolen_bases 0.053981 1.000000

In [30]: p = np.poly1d(linear_model_5)
# Draw the runs/stolen_bases model (linear_model_5) over the stolen_bases points,
# with a degree-10 fit of the same data for comparison.
p30 = np.poly1d(np.polyfit(runs, stolen_bases, 10))
# Sample inside the observed runs range; a degree-10 polynomial diverges when extrapolated.
xp = np.linspace(runs.min(), runs.max(), 100)
plot5 = plt.plot(runs, stolen_bases, '.', xp, p(xp), '-', xp, p30(xp), '--')
# Limits cover the whole stolen_bases range (49 to 170).
plt.ylim(40, 180)
plt.show()

In [ ]: # Yes, the relationship between X= Runs and Y= Stolen Bases seems to be WEAKLY LINEAR.

In [32]: # Variable 5 - NEW ON BASE

new_onbase = np.array([0.34,0.349,0.34,0.329,0.341,0.335,0.343,0.325,0.329,0.311,0.316,0.322,0.314,
0.326,
0.313,0.323,0.319,0.317,0.322,0.317,0.306,0.318,0.309,0.311,0.322,0.308,0.309
,
0.303,0.305,0.292])

# Linear Model

linear_model_6= np.polyfit(runs, new_onbase,1)


linear_model_6

Out[32]: array([1.50169403e-04, 2.16309169e-01])

In [33]: dataframe6 = mlb11.loc[:, ['runs', 'new_onbase']]
# Purple scatter of new_onbase (y) against runs (x).
plot6 = dataframe6.plot(kind='scatter', x='runs', y='new_onbase', c='purple')

In [34]: # Numerical Prediction


# Correlation Coefficient between Runs and New on Base

dataframe6.corr(method='pearson', min_periods=1)

Out[34]:
runs new_onbase

runs 1.000000 0.921469

new_onbase 0.921469 1.000000

In [53]: p = np.poly1d(linear_model_6)
# Draw the runs/new_onbase model (linear_model_6) over the new_onbase points,
# with a degree-10 fit of the same data for comparison.
p30 = np.poly1d(np.polyfit(runs, new_onbase, 10))
# Sample inside the observed runs range; a degree-10 polynomial diverges when extrapolated.
xp = np.linspace(runs.min(), runs.max(), 100)
plot3 = plt.plot(runs, new_onbase, '.', xp, p(xp), '-', xp, p30(xp), '--')
# Limits cover the whole new_onbase range (0.292 to 0.349).
plt.ylim(0.28, 0.36)
plt.show()

Vous aimerez peut-être aussi