Vous êtes sur la page 1 sur 5

Data Science 1: Assignment No. 2
Date: Sept 26, 2016

By,
Ashish Menkudale
UIN: 656130575
amenku2@uic.edu

import timeit
import numpy as np
import pandas as pd
import bs4
import requests
from bs4 import BeautifulSoup

start = timeit.default_timer()
# Timer started: the elapsed time printed at the end of the script is
# measured from this point.

import urllib2
from contextlib import closing

# Download the text addressed by the URL. closing() guarantees the HTTP
# response is closed once the lines are read, even if reading fails.
with closing(urllib2.urlopen(
        "https://archive.org/stream/ataleoftwocities00098gut/98.txt")) as data:
    l = data.readlines()

# readlines() keeps each line's own terminator, so the lines are glued back
# together with an empty separator; joining with '\n' would double every
# line break.
# NOTE: this name shadows the builtin str; it is kept because the
# tag-stripping step further down reads the downloaded text from it.
str = ''.join(l)

print(str)
# The complete downloaded text is now held in `str`.

import lxml.html
import re
import htmlentitydefs

# Delete everything that looks like a markup tag: a "<", then one or more
# characters other than "<" (matched lazily), then the closing ">".
filtered_str = re.compile('<[^<]+?>').sub('', str)
print(filtered_str)
# `filtered_str` is the downloaded text with tag-like spans removed.

import re

# Keep only word characters (\w, which includes the underscore) and
# whitespace (\s); every other character is deleted outright.
removed_punct = re.compile(r'[^\w\s]').sub('', filtered_str)
print(removed_punct)
# `removed_punct` is the text with punctuation stripped.
# Words (and single digits) that are dropped before counting. Compared
# against the lower-cased form of each token.
stopwords = ['had', 'has', 'your', 'you', 'with', 'i', 'his', 'she', 'he', 'are',
             'not', 'the', 'a', 'was', 'an', 'and', 'of', 'at', 'on', 'over', 'under', 'to',
             'from', 'what', 'if', 'else', 'also', 'in', 'is', 'it', 'by', 'this', 'that', 'his',
             'have', 'be', 'as', 'were', 'for', 'so', 'him', 'her', 'but', 'she', 'or',
             'no', 'will', 'my', 'up', 'its', 'there', 'away', 'me', 'we', 'they', 'only',
             'too', 'down', 'upon', 'into', 'their', 'here', 'could', 'would', 'been',
             'after', 'us', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']

# Testing membership against the list rescans it for every token of the
# text; a frozenset built once gives constant-time lookups and also
# collapses the entries that appear twice in the list above.
stopword_lookup = frozenset(stopwords)

# Split on any run of whitespace, drop the stop words, and rebuild a single
# space-separated string. The original casing of kept words is preserved.
querywords = removed_punct.split()
resultwords = [word for word in querywords if word.lower() not in stopword_lookup]
result = ' '.join(resultwords)
print(result)
# `result` is the text with the common words removed.

# Count how often each remaining word occurs. A plain dict and loop replace
# the reduce/update trick, and the mapping no longer takes over the name of
# the builtin `list`.
word_freq = {}
for token in result.split():
    word_freq[token] = word_freq.get(token, 0) + 1

# Order the (word, count) pairs by ascending count. sorted() is stable, so
# pairs with equal counts keep the order the dict yields them in.
sorted_list = sorted(word_freq.items(), key=lambda item: item[1])
for word in sorted_list:
    print(word)
# `sorted_list` holds the (word, count) pairs, least frequent first.

# Replace every non-word character with a space, then split on whitespace
# to turn the filtered text into a list of tokens.
wordList = re.sub(r"[^\w]", " ", result).split()


# The variable is spelled wordList; printing `wordlist` raised NameError.
print(wordList)
# `wordList` is the list form of the filtered text.

from collections import Counter


import numpy as np

import matplotlib.pyplot as plt


# Tally how many times each token occurs in wordList.
word_counts = Counter()
word_counts.update(wordList)
def plot_bar_from_counter(counter, ax=None):
    """Draw a bar chart with one bar per entry of ``counter``.

    Bars are placed at x positions 0 .. len(counter) - 1, take their
    heights from ``counter.values()`` and their tick labels from
    ``counter.keys()``.

    counter: mapping that provides ``values()``, ``keys()`` and ``len()``.
    ax: axes to draw on; when None, a new figure with a single subplot is
        created and used.

    Returns the axes the bars were drawn on.
    """
    if ax is None:
        ax = plt.figure().add_subplot(111)

    bar_heights = counter.values()
    tick_labels = counter.keys()
    positions = np.arange(len(counter))

    ax.bar(positions, bar_heights, align='center')

    # Pin exactly one tick under each bar and label it with the matching key.
    x_axis = ax.xaxis
    x_axis.set_major_locator(plt.FixedLocator(positions))
    x_axis.set_major_formatter(plt.FixedFormatter(tick_labels))
    return ax

# Draw the word-frequency bar chart on a fresh figure and display it.
plot_bar_from_counter(word_counts)
plt.show()

# Report the number of seconds elapsed since `start` was recorded.
elapsed = timeit.default_timer() - start
print(elapsed)

6.86172139321

Vous aimerez peut-être aussi