-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess.py
More file actions
39 lines (31 loc) · 1.09 KB
/
process.py
File metadata and controls
39 lines (31 loc) · 1.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# -*- coding: utf-8 -*-
# Copyright Spencer Hitchcock and Huda Khayrallah
import nltk, urllib, re, string, HTMLParser
from urllib import urlopen
# Print the most frequent content words found in a handful of
# Spanish-language Wikipedia articles.
#
# Pipeline: download HTML -> strip markup -> tokenize -> normalize and
# filter tokens -> rank by frequency -> print the top ``x`` terms.

# Number of most-frequent terms to print.
x = 100

# Spanish-language Wikipedia articles that make up the corpus.
# NOTE(review): the Tango article is listed three times, so it is fetched
# three times and its words carry triple weight in the counts -- confirm
# that this weighting is intended and not a copy/paste leftover.
urls = [
    "http://es.wikipedia.org/wiki/Tango",
    "http://es.wikipedia.org/wiki/Tango",
    "http://es.wikipedia.org/wiki/Tango",
    "http://es.wikipedia.org/wiki/Buenos_Aires",
    "http://es.wikipedia.org/wiki/Felis_silvestris_catus",
]

# Download every page and concatenate the HTML into a single string.
raw = "".join(urllib.urlopen(url).read() for url in urls)

# Strip the HTML markup, leaving only the text.
# NOTE: nltk.clean_html is only implemented in NLTK 2.x; NLTK 3 raises
# NotImplementedError here and points to BeautifulSoup's get_text() instead.
raw = nltk.clean_html(raw)

# Split the text into word tokens (``raw`` is a list of tokens from here on).
raw = nltk.word_tokenize(raw)

# Spanish stopwords; the set copy makes each membership test O(1) instead of
# a linear scan of the list for every token.
stops = nltk.corpus.stopwords.words('spanish')
stop_set = set(stops)

# Matches tokens that start with a digit (years, quantities, ...).
regex = re.compile(r"^[0-9]+")

# Lowercase each token and trim surrounding ASCII punctuation exactly once,
# then drop:
#   * empty strings -- tokens that were nothing but punctuation; without this
#     check "" would be counted as a term and printed as a blank line,
#   * stopwords,
#   * tokens that start with a digit.
normalized = (w.lower().strip(string.punctuation) for w in raw)
tokens = [
    term for term in normalized
    if term and term not in stop_set and not regex.match(term)
]

# Rank terms by descending count explicitly instead of relying on the order
# of FreqDist.keys(), which is only frequency-sorted in NLTK 2.x. sorted() is
# stable, so on NLTK 2 the result is identical to keys()[:x].
dist = nltk.FreqDist(tokens)
freq = sorted(dist, key=dist.get, reverse=True)[:x]

for w in freq:
    print(w)