Vous êtes sur la page 1sur 10

%%bash
# Download the NASA Apache access log dataset (quiet mode) into the working directory.
wget -q https://s3.amazonaws.com/jupyter-assignment/data/apache.access.log
In [2]:
import os
import sys
import re
import datetime
from pyspark.sql import functions as F
In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
# Start a local Spark driver and wrap it in a SQLContext for DataFrame work.
# NOTE(review): SQLContext is the legacy (pre-2.0) entry point; SparkSession
# is the modern equivalent, but this preserves the notebook's original API.
sc = SparkContext()
sqlContext = SQLContext(sc)
In [4]:
# Read each line of the log file as one row of a single string column 'value'.
data = sqlContext.read.text('apache.access.log')
In [5]:
# Total number of raw log lines.
print(data.count())
1043177
In [6]:
# Display the schema: a single nullable string column named 'value'.
# printSchema() prints to stdout and returns None, so don't wrap it in
# print() — the original emitted a stray "None" line after the schema.
data.printSchema()
root
|-- value: string (nullable = true)

None
In [7]:
# Peek at the first 7 raw log lines without truncating long values.
data.show(n=7,truncate=False)
+----------------------------------------------------------------------------
----------------------------------------------+
|value
|
+----------------------------------------------------------------------------
----------------------------------------------+
|in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] "GET /shuttle/missions/st
s-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839|
|uplherc.upl.com - - [01/Aug/1995:00:00:07 -0400] "GET / HTTP/1.0" 304 0
|
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/ksclogo-mediu
m.gif HTTP/1.0" 304 0 |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/MOSAIC-logosma
ll.gif HTTP/1.0" 304 0 |
|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] "GET /images/USA-logosmall.
gif HTTP/1.0" 304 0 |
|ix-esc-ca2-07.ix.netcom.com - - [01/Aug/1995:00:00:09 -0400] "GET /images/la
unch-logo.gif HTTP/1.0" 200 1713 |
|uplherc.upl.com - - [01/Aug/1995:00:00:10 -0400] "GET /images/WORLD-logosmal
l.gif HTTP/1.0" 304 0 |
+----------------------------------------------------------------------------
----------------------------------------------+
only showing top 7 rows

In [8]:
# Parse each raw Common Log Format line into typed columns.
# Line shape: host - - [dd/MMM/yyyy:hh:mm:ss -zzzz] "METHOD path HTTP/x.x" status size
split_df = data.select(
    # 'host': the leading run of non-whitespace characters.
    # FIX: the capture group now excludes the delimiter space — the original
    # pattern r'^([^\s]+\s)' captured it, leaving every host value padded
    # with a trailing blank (visible in the show() outputs below).
    F.regexp_extract('value', r'^([^\s]+)\s', 1).alias('host'),
    # 'timestamp': the bracketed field; the parentheses sit inside the
    # brackets so only the timestamp text is captured:
    # 2-digit day / 3-letter month / 4-digit year : hh:mm:ss, then offset.
    # NOTE(review): the pattern assumes a negative UTC offset ('-zzzz'),
    # which holds for this dataset but not in general.
    F.regexp_extract('value', r'^.*\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]', 1).alias('timestamp'),
    # 'path': after the opening quote and the request method (\w+), keep all
    # non-whitespace characters up to the whitespace preceding 'HTTP'.
    F.regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"', 1).alias('path'),
    # 'status': the token after the closing quote, cast to integer.
    # Reference: http://www.w3schools.com/tags/ref_httpmessages.asp
    F.regexp_extract('value', r'^.*"\s+([^\s]+)', 1).cast('integer').alias('status'),
    # 'content_size': the trailing run of digits (response size in bytes);
    # lines ending in '-' yield no match and become null (filled later).
    F.regexp_extract('value', r'^.*\s+(\d+)$', 1).cast('integer').alias('content_size'))
split_df.show(n=5, truncate=False)
+------------------+--------------------------+------------------------------
-----------------+------+------------+
|host |timestamp |path
|status|content_size|
+------------------+--------------------------+------------------------------
-----------------+------+------------+
|in24.inetnebr.com |01/Aug/1995:00:00:01 -0400|/shuttle/missions/sts-68/news/
sts-68-mcc-05.txt|200 |1839 |
|uplherc.upl.com |01/Aug/1995:00:00:07 -0400|/
|304 |0 |
|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/ksclogo-medium.gif
|304 |0 |
|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/MOSAIC-logosmall.gif
|304 |0 |
|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/USA-logosmall.gif
|304 |0 |
+------------------+--------------------------+------------------------------
-----------------+------+------------+
only showing top 5 rows

In [9]:
# Replace null content_size values (lines whose trailing field was not a run
# of digits, so the regex captured nothing) with 0.
cleaned_df = split_df.na.fill({'content_size': 0})
In [10]:
# English month abbreviation -> month number, for locale-independent parsing.
month_map = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7,
    'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

def parse_clf_time(s):
    """Convert a Common Log Format timestamp into a 'YYYY-MM-DD HH:MM:SS' string.

    Args:
        s (str): date and time in Apache CLF form
            'dd/MMM/yyyy:hh:mm:ss (+/-)zzzz', e.g. '01/Aug/1995:00:00:01 -0400'.

    Returns:
        str: a timestamp string suitable for passing to CAST('timestamp').
            (FIX: the original docstring claimed a datetime object was
            returned; the function has always returned a formatted string.)

    Raises:
        KeyError: if the month abbreviation is not one of the 12 English ones.
        ValueError: if any fixed-position digit field is not an integer.

    NOTE: the timezone offset is deliberately ignored here. In a production
    application you'd want to normalize to UTC instead.
    """
    # Fields are extracted by fixed position, which is safe because CLF
    # timestamps are fixed-width: dd/MMM/yyyy:hh:mm:ss.
    return "{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}".format(
        int(s[7:11]),       # year
        month_map[s[3:6]],  # month abbreviation -> number
        int(s[0:2]),        # day
        int(s[12:14]),      # hour
        int(s[15:17]),      # minute
        int(s[18:20])       # second
    )

# Wrap the parser as a Spark UDF (default return type is string; the cast to
# timestamp happens at the call site).
u_parse_time = F.udf(parse_clf_time)
In [11]:
# Build the new 'time' column: run the UDF over the CLF timestamp string,
# then cast the resulting 'YYYY-MM-DD HH:MM:SS' string to a real timestamp.
col_to_append = (u_parse_time(cleaned_df['timestamp'])
                 .cast('timestamp')  # convert column type; see sql.Column.cast docs
                 .alias('time')      # rename the derived column
                 )
print(col_to_append)
Column<b'CAST(parse_clf_time(timestamp) AS TIMESTAMP) AS `time`'>
In [12]:
# Append the parsed 'time' column to the cleaned dataframe ('*' keeps every
# existing column).
logs_df = cleaned_df.select('*', col_to_append)
logs_df.show(n=5,truncate=False)
+------------------+--------------------------+------------------------------
-----------------+------+------------+-------------------+
|host |timestamp |path
|status|content_size|time |
+------------------+--------------------------+------------------------------
-----------------+------+------------+-------------------+
|in24.inetnebr.com |01/Aug/1995:00:00:01 -0400|/shuttle/missions/sts-68/news/
sts-68-mcc-05.txt|200 |1839 |1995-08-01 00:00:01|
|uplherc.upl.com |01/Aug/1995:00:00:07 -0400|/
|304 |0 |1995-08-01 00:00:07|
|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/ksclogo-medium.gif
|304 |0 |1995-08-01 00:00:08|
|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/MOSAIC-logosmall.gif
|304 |0 |1995-08-01 00:00:08|
|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/USA-logosmall.gif
|304 |0 |1995-08-01 00:00:08|
+------------------+--------------------------+------------------------------
-----------------+------+------------+-------------------+
only showing top 5 rows

In [13]:
# The raw string 'timestamp' is now redundant with the typed 'time' column.
logs_df = logs_df.drop('timestamp')
logs_df.show(n=5,truncate=False)
+------------------+-----------------------------------------------+------+--
----------+-------------------+
|host |path |status|co
ntent_size|time |
+------------------+-----------------------------------------------+------+--
----------+-------------------+
|in24.inetnebr.com |/shuttle/missions/sts-68/news/sts-68-mcc-05.txt|200 |18
39 |1995-08-01 00:00:01|
|uplherc.upl.com |/ |304 |0
|1995-08-01 00:00:07|
|uplherc.upl.com |/images/ksclogo-medium.gif |304 |0
|1995-08-01 00:00:08|
|uplherc.upl.com |/images/MOSAIC-logosmall.gif |304 |0
|1995-08-01 00:00:08|
|uplherc.upl.com |/images/USA-logosmall.gif |304 |0
|1995-08-01 00:00:08|
+------------------+-----------------------------------------------+------+--
----------+-------------------+
only showing top 5 rows

In [14]:
# Row count is unchanged by the parsing/cleaning steps (no rows dropped).
total_log_entries = logs_df.count()
print(total_log_entries)
1043177
In [15]:
# Final schema: string host/path, integer status & content_size, timestamp time.
logs_df.printSchema()
root
|-- host: string (nullable = true)
|-- path: string (nullable = true)
|-- status: integer (nullable = true)
|-- content_size: integer (nullable = false)
|-- time: timestamp (nullable = true)

In [16]:
# Default show(): first 20 rows, values truncated to 20 characters.
logs_df.show()
+--------------------+--------------------+------+------------+--------------
-----+
| host| path|status|content_size|
time|
+--------------------+--------------------+------+------------+--------------
-----+
| in24.inetnebr.com |/shuttle/missions...| 200| 1839|1995-08-01 00:
00:01|
| uplherc.upl.com | /| 304| 0|1995-08-01 00:
00:07|
| uplherc.upl.com |/images/ksclogo-m...| 304| 0|1995-08-01 00:
00:08|
| uplherc.upl.com |/images/MOSAIC-lo...| 304| 0|1995-08-01 00:
00:08|
| uplherc.upl.com |/images/USA-logos...| 304| 0|1995-08-01 00:
00:08|
|ix-esc-ca2-07.ix....|/images/launch-lo...| 200| 1713|1995-08-01 00:
00:09|
| uplherc.upl.com |/images/WORLD-log...| 304| 0|1995-08-01 00:
00:10|
|slppp6.intermind....|/history/skylab/s...| 200| 1687|1995-08-01 00:
00:10|
|piweba4y.prodigy....|/images/launchmed...| 200| 11853|1995-08-01 00:
00:10|
|slppp6.intermind....|/history/skylab/s...| 200| 9202|1995-08-01 00:
00:11|
|slppp6.intermind....|/images/ksclogosm...| 200| 3635|1995-08-01 00:
00:12|
|ix-esc-ca2-07.ix....|/history/apollo/i...| 200| 1173|1995-08-01 00:
00:12|
|slppp6.intermind....|/history/apollo/i...| 200| 3047|1995-08-01 00:
00:13|
| uplherc.upl.com |/images/NASA-logo...| 304| 0|1995-08-01 00:
00:14|
| 133.43.96.45 |/shuttle/missions...| 200| 10566|1995-08-01 00:
00:16|
|kgtyk4.kj.yamagat...| /| 200| 7280|1995-08-01 00:
00:17|
|kgtyk4.kj.yamagat...|/images/ksclogo-m...| 200| 5866|1995-08-01 00:
00:18|
| d0ucr6.fnal.gov |/history/apollo/a...| 200| 2743|1995-08-01 00:
00:19|
|ix-esc-ca2-07.ix....|/shuttle/resource...| 200| 6849|1995-08-01 00:
00:19|
| d0ucr6.fnal.gov |/history/apollo/a...| 200| 14897|1995-08-01 00:
00:20|
+--------------------+--------------------+------+------------+--------------
-----+
only showing top 20 rows

Q1: Show the content_size statistics using the describe method (5 points)


In [29]:
# Q1: summary statistics (count/mean/stddev/min/max) of content_size.
logs_df.describe("content_size").show()
+-------+------------------+
|summary| content_size|
+-------+------------------+
| count| 1043177|
| mean|17531.555702435926|
| stddev| 68561.9990626412|
| min| 0|
| max| 3421948|
+-------+------------------+

Q2: Count each status (hint: Use groupby and count). Also sort the output by status (5 points)
In [241]:
# Q2: number of requests per HTTP status code, in ascending status order.
Q2 = logs_df.groupBy('status').count().sort('status')
Q2.show()
+------+------+
|status| count|
+------+------+
| 200|940847|
| 302| 16244|
| 304| 79824|
| 403| 58|
| 404| 6185|
| 500| 2|
| 501| 17|
+------+------+

Q3: Display the above output using a bar graph. (Hint: Use the matplotlib library) (5 points)
In [242]:
# Q3: bar chart of request counts per HTTP status code.
import matplotlib.pyplot as plt
import pandas as pd   # NOTE(review): not used directly here (toPandas needs
import numpy as np    # pandas installed, not imported); np appears unused.

out2 = Q2.toPandas()
# Integer bar positions; the original's +0.1 offset was cosmetic only since
# the tick labels were placed at the same positions.
positions = list(range(len(out2)))
fig, ax = plt.subplots()
ax.bar(positions, out2['count'])
ax.set_xticks(positions)
ax.set_xticklabels(out2['status'])
# A figure should stand alone: label both axes and give it a title.
ax.set(xlabel='HTTP status code', ylabel='request count',
       title='Requests per HTTP status code')
plt.show()

Q4: Get all hosts that have accessed the server more than 10 times. Show 15 hosts as output (5
points)
In [160]:
# Q4: hosts that accessed the server more than 10 times; display 15 of them.
Q4 = logs_df.groupBy('host').count()
Q4.where('count > 10').show(n=15)
+--------------------+-----+
| host|count|
+--------------------+-----+
|prakinf2.prakinf....| 96|
| alpha2.csd.uwm.edu | 81|
|cjc07992.slip.dig...| 16|
|n1377004.ksc.nasa...| 227|
| 163.205.2.134 | 78|
|huge.oso.chalmers...| 237|
| 163.205.44.27 | 60|
| shark.ksc.nasa.gov | 26|
| etc5.etechcorp.com | 11|
|dd07-029.compuser...| 18|
| 131.182.101.161 | 64|
| 134.95.100.201 | 15|
|vab08.larc.nasa.gov | 17|
| ip11.iac.net | 92|
|ad11-012.compuser...| 42|
+--------------------+-----+
only showing top 15 rows

Q5: Show top 10 path based on their count (hint: Use groupby and count) (5 points)
In [161]:
# Q5: top 10 most-requested paths.
# FIX: the original assigned the result of .show() — which returns None — to
# Q5; keep the DataFrame and its display as separate statements instead.
Q5 = logs_df.groupby('path').count().orderBy('count', ascending=False)
Q5.show(n=10)
+--------------------+-----+
| path|count|
+--------------------+-----+
|/images/NASA-logo...|59666|
|/images/KSC-logos...|50420|
|/images/MOSAIC-lo...|43831|
|/images/USA-logos...|43604|
|/images/WORLD-log...|43217|
|/images/ksclogo-m...|41267|
| /ksc.html|28536|
|/history/apollo/i...|26766|
|/images/launch-lo...|24742|
| /|20173|
+--------------------+-----+
only showing top 10 rows

Difficulty: Intermediate
Q6: What are the top ten paths which do not return code 200? Hint: Create a sorted list containing
the paths and the number of times that they were accessed (counts) with a non 200 return code and
show the top ten paths with their counts. (Use groupby and count) (5 points)
In [113]:
# Q6: top ten paths whose requests did NOT return HTTP 200.
non_200 = logs_df.where(logs_df['status'] != 200)
non_200.groupBy('path').count().sort('count', ascending=False).show(10)
+--------------------+-----+
| path|count|
+--------------------+-----+
|/images/NASA-logo...| 8761|
|/images/KSC-logos...| 7236|
|/images/MOSAIC-lo...| 5197|
|/images/USA-logos...| 5157|
|/images/WORLD-log...| 5020|
|/images/ksclogo-m...| 4728|
|/history/apollo/i...| 2907|
|/images/launch-lo...| 2811|
| /| 2199|
|/images/ksclogosm...| 1622|
+--------------------+-----+
only showing top 10 rows

Q7: How many unique hosts are there in the entire log? (5 points)
In [117]:
# Q7: number of distinct hosts across the entire log.
logs_df.select("host").distinct().count()
Out[117]:
54507
Difficulty: Hard
Q8: determine the number of unique hosts in the entire log on a day-by-day basis. Hint: You need to
use the dayofmonth function in sql.functions
module. https://spark.apache.org/docs/1.6.2/api/java/org/apache/spark/sql/functions.html Think
about the steps that you need to perform to count the number of different hosts that make requests
each day. (10 points)
In [204]:
# Q8: number of distinct hosts per day of month.
# Project down to (host, Day) where Day is extracted from the timestamp.
day_col = F.dayofmonth(logs_df.time)
D1 = logs_df.select(logs_df.host, day_col.alias('Day'))
# FIX: the original sorted BEFORE the groupBy; Spark aggregation does not
# preserve input order, so the sorted output was coincidental. Ordering the
# aggregate itself guarantees day-ascending results.
D2 = D1.groupby(D1.Day).agg(F.countDistinct(D1.host)).orderBy('Day')
D2.show(n=31)
+---+--------------------+
|Day|count(DISTINCT host)|
+---+--------------------+
| 1| 2582|
| 3| 3222|
| 4| 4190|
| 5| 2502|
| 6| 2537|
| 7| 4106|
| 8| 4406|
| 9| 4317|
| 10| 4523|
| 11| 4346|
| 12| 2864|
| 13| 2650|
| 14| 4454|
| 15| 4214|
| 16| 4340|
| 17| 4385|
| 18| 4168|
| 19| 2550|
| 20| 2560|
| 21| 4134|
| 22| 4456|
+---+--------------------+

Q9: Visualize the Number of Unique Daily Hosts above. (Hint: Use matplotlib library) (5 points)
In [238]:
# Q9: bar chart of the number of unique daily hosts computed in Q8.
# (matplotlib/pandas/numpy were already imported in the Q3 cell above, so the
# duplicate import block is dropped here.)
Day_df = D2.toPandas()
Day_df.columns = ['Day', 'host']
positions = list(range(len(Day_df)))
fig, ax = plt.subplots()
ax.bar(positions, Day_df['host'])
ax.set_xticks(positions)
ax.set_xticklabels(Day_df['Day'])
# Label axes and title so the figure is readable on its own.
ax.set(xlabel='day of month', ylabel='unique hosts',
       title='Unique daily hosts (Aug 1995)')
plt.show()

Vous aimerez peut-être aussi