Vous êtes sur la page 1 sur 5

Data Science 1: Assignment No.

Date: Sept 26, 2016

Ashish Menkudale
UIN: 656130575

import timeit
import numpy as np
import pandas as pd
import bs4
import requests
from bs4 import BeautifulSoup

# Start the wall-clock timer before any work is done.
start = timeit.default_timer()
#timer started

import urllib2  # Python 2 module; kept where the script originally imported it

# Fetch the archive.org resource that carries the book text.
data = urllib2.urlopen("https://archive.org/stream/ataleoftwocities00098gut/98.txt")
try:
    # Collect every line without its line terminator. The original loop never
    # appended anything to ``l``, so the joined text was always empty.
    l = [line.rstrip('\r\n') for line in data.readlines()]
finally:
    # Release the connection even if reading fails part-way through.
    data.close()

# NOTE: ``str`` shadows the builtin, but the following steps of the script
# read this name, so it is kept as-is.
str = '\n'.join(l)

print(str)
# got all the text here

import lxml.html
import re, htmlentitydefs
# Delete everything that looks like a markup tag: "<", then the shortest run
# of characters other than "<", then ">".
filtered_str = re.sub(pattern='<[^<]+?>', repl='', string=str)
print(filtered_str)
# cleared html tags

import re

# Keep only word characters and whitespace; every other character is dropped.
removed_punct = re.compile(r'[^\w\s]').sub('', filtered_str)
print(removed_punct)
#removed punctuation over here
# Words (and single digits) to exclude from the frequency count. Tokens are
# lower-cased before being compared against this list.
stopwords = ['had', 'has' ,'your' ,'you' ,'with' ,'i' ,'his', 'she', 'he', 'are' ,
'not' ,'the' ,'a','was','an','and','of','at','on','over','under','to',
'have','be', 'as', 'were', 'for', 'so', 'him', 'her', 'but', 'she', 'or',
'no', 'will', 'my', 'up', 'its', 'there', 'away', 'me', 'we' , 'they', 'only',
'too', 'down', 'upon', 'into', 'their', 'here', 'could', 'would', 'been',
'after', 'us','1','2','3','4','5','6','7','8','9','0']

# Build the lookup once: set membership is constant-time, whereas testing
# against the list rescans it for every token of the book.
stopword_lookup = frozenset(stopwords)

querywords = removed_punct.split()
resultwords = [word for word in querywords if word.lower() not in stopword_lookup]
result = ' '.join(resultwords)
print(result)
#removed the common occurrences

# Count how often each remaining token occurs.
# NOTE: the name ``list`` shadows the builtin; it is kept only so that any
# code relying on this module-level name keeps working.
list = {}
for token in result.split():
    list[token] = list.get(token, 0) + 1

# Order the (word, count) pairs by ascending count. ``sorted`` is used rather
# than sorting ``items()`` in place, which only works where ``items()``
# returns a real list.
sorted_list = sorted(list.items(), key=lambda item: item[1])
for word in sorted_list:
    print(word)
# got the frequency and sorted it over here

# Replace every non-word character with a space, then split into tokens.
wordList = re.sub(r"[^\w]", " ", result).split()

# The original printed ``wordlist`` (lower-case "l"), an undefined name that
# raised NameError; the variable is ``wordList``.
print(wordList)
# changed the datatype over here

from collections import Counter

import numpy as np

import matplotlib.pyplot as plt

# Tally the occurrences of every token.
word_counts = Counter()
word_counts.update(wordList)
def plot_bar_from_counter(counter, ax=None):
    """Draw a bar chart of the counts held in *counter*.

    Args:
        counter: Mapping of item -> frequency, e.g. a ``collections.Counter``.
        ax: Matplotlib axes to draw on. When ``None`` a new figure with a
            single subplot is created and used.

    Returns:
        The axes the bars were drawn on.
    """
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)

    # Materialise keys and their counts as real sequences in matching order.
    # Comprehensions are used instead of the builtin ``list`` because this
    # script rebinds that name at module level.
    names = [name for name in counter]
    frequencies = [counter[name] for name in names]

    x_coordinates = np.arange(len(names))
    ax.bar(x_coordinates, frequencies, align='center')

    # Label each bar with its item. The original computed ``names`` but never
    # used it, leaving the x axis with bare positions.
    ax.set_xticks(x_coordinates)
    ax.set_xticklabels(names)
    return ax

# Draw the word-frequency chart. The plotting function was defined but never
# called, so no histogram was actually produced.
plot_bar_from_counter(word_counts)
# plotted histogram

# Report the elapsed wall-clock seconds since the timer was started.
print(timeit.default_timer()-start)
# got the time

# Display the figure only after the time is reported, so an open plot window
# does not inflate the measurement.
plt.show()


Vous aimerez peut-être aussi