Académique Documents
Professionnel Documents
Culture Documents
2
# Date: Sept 26, 2016
# By: Ashish Menkudale
# UIN: 656130575
# amenku2@uic.edu
import timeit
from collections import Counter

import bs4
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
start = timeit.default_timer()
# Timer started: `start` is read again at the end of the script to report
# the elapsed time.
import urllib2
from contextlib import closing

# Download the page body in a single read. Joining the result of
# readlines() with '\n' (as this step used to do) doubles every line break,
# because each line already ends with its own terminator.
# closing() releases the HTTP response even if read() raises; the Python 2
# urllib2 response object is not a context manager by itself.
with closing(urllib2.urlopen("https://archive.org/stream/ataleoftwocities00098gut/98.txt")) as data:
    raw_html = data.read()
print(raw_html)
# got all the text here
import lxml.html
import re, htmlentitydefs

# Remove everything that looks like an HTML tag: a '<', the shortest
# possible run of non-'<' characters, then '>'.
# NOTE(review): only the tags themselves are removed. Text inside elements
# such as <script>/<style>, and HTML entities like "&amp;", would remain in
# the output if the page contains them - confirm against the fetched page.
filtered_str = re.sub('<[^<]+?>', '', raw_html)
print(filtered_str)
# cleared html tags
import re

# Plain-text books commonly write a dash as "--" with no surrounding spaces
# ("way--in short"). Deleting those characters outright would fuse the two
# neighbouring words into one bogus token, so runs of two or more hyphens
# are turned into a single space first.
dashes_spaced = re.sub(r'-{2,}', ' ', filtered_str)
# Delete every remaining character that is neither a word character
# (letter, digit, underscore) nor whitespace. Apostrophes and single
# hyphens are removed too, so "don't" becomes "dont".
removed_punct = re.sub(r'[^\w\s]', '', dashes_spaced)
print(removed_punct)
#removed punctuation over here
# Words (and single digits) that are too common to be interesting in a
# word-frequency plot. Duplicate entries have been removed; membership is
# unchanged.
stopwords = [
    'had', 'has', 'your', 'you', 'with', 'i', 'his', 'she', 'he', 'are',
    'not', 'the', 'a', 'was', 'an', 'and', 'of', 'at', 'on', 'over', 'under',
    'to', 'from', 'what', 'if', 'else', 'also', 'in', 'is', 'it', 'by',
    'this', 'that', 'have', 'be', 'as', 'were', 'for', 'so', 'him', 'her',
    'but', 'or', 'no', 'will', 'my', 'up', 'its', 'there', 'away', 'me',
    'we', 'they', 'only', 'too', 'down', 'upon', 'into', 'their', 'here',
    'could', 'would', 'been', 'after', 'us',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
]
# Hash-based lookup: the text is long, and scanning a list for every token
# costs time proportional to the list length on each test.
stopword_lookup = frozenset(stopwords)

# Split on any whitespace, then keep the tokens that are not stop words.
# The comparison is case-insensitive, but kept tokens retain their
# original capitalisation.
querywords = removed_punct.split()
resultwords = [word for word in querywords if word.lower() not in stopword_lookup]
result = ' '.join(resultwords)
print(result)
#removed the common occurrences
def plot_bar_from_counter(counter, ax=None):
    """Draw one bar per key of *counter*, labelled with that key.

    :param counter: mapping from label to frequency, e.g. a
        ``collections.Counter``.
    :param ax: matplotlib axes to draw on. When ``None``, a new figure with
        a single subplot is created and used.
    :return: the axes the bars were drawn on.
    """
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)

    # Materialise labels and heights as lists. FixedFormatter looks its
    # labels up by index, which a dict view (Python 3) does not support;
    # on Python 2 this is just a copy of the lists already returned.
    names = list(counter.keys())
    frequencies = list(counter.values())

    x_coordinates = np.arange(len(names))
    ax.bar(x_coordinates, frequencies, align='center')

    # Pin one tick under each bar and label it with the matching key.
    ax.xaxis.set_major_locator(plt.FixedLocator(x_coordinates))
    ax.xaxis.set_major_formatter(plt.FixedFormatter(names))

    return ax
# Tally how often each remaining word occurs.
# NOTE(review): word_counts was passed to plot_bar_from_counter without
# being defined anywhere in this file. Counting the stop-word-filtered
# tokens is the presumed intent - confirm.
word_counts = Counter(resultwords)
plot_bar_from_counter(word_counts)
# NOTE(review): outside interactive mode plt.show() usually blocks until the
# window is closed, so the time printed below would include that wait -
# confirm whether that is intended.
plt.show()
# plotted histogram
print(timeit.default_timer() - start)
# got the time
# Example output (elapsed seconds): 6.86172139321