# -*- coding: utf-8 -*-
"""nlp_data_preprocessing.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1yzzUDX-cpixlPNXubnJpD9BJpMaRK6fm
"""
# !pip install gensim
# !pip install nltk

import pickle

import numpy as np
import pandas as pd

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

np.random.seed(10)

# The lemmatizer needs the WordNet corpora.
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet
# Load the scraped Reddit data and keep only the columns used downstream.
df = pd.read_csv("raw_reddit_news_posts_comments.csv")
model_data = df[["created_utc", "title", "comments", "url_content"]]
# model_data.head()
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result."""
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize text, drop stopwords and short tokens, and lemmatize/stem the rest."""
    result = []
    # Strip Reddit placeholders left by removed/deleted posts and comments.
    text = text.replace('[removed]', '').replace('[deleted]', '')
    for token in simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
# doc_sample = model_data["comments"][0]
# print('original document: ')
# words = []
# for word in doc_sample.split(' '):
# words.append(word)
# print(words)
# print('\n\n tokenized and lemmatized document: ')
# print(preprocess(doc_sample))
# Run the preprocessing pipeline over every text column and save the result.
processed_data = pd.DataFrame()
processed_data['date'] = model_data["created_utc"]
processed_data['title'] = model_data['title'].map(preprocess)
processed_data['comments'] = model_data['comments'].map(preprocess)
processed_data['url_content'] = model_data['url_content'].map(preprocess)
processed_data.to_csv("preprocessed_reddit_data.csv", index=False)
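
# Note: to_csv() writes each token list as its string representation, so a later
# consumer has to parse it back. A minimal sketch (assuming the CSV is reloaded
# with pandas; not part of the original pipeline):
#
#   import ast
#   reloaded = pd.read_csv("preprocessed_reddit_data.csv")
#   reloaded['title'] = reloaded['title'].map(ast.literal_eval)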
def dummy_fun(doc):
    """Identity function: the columns already contain token lists."""
    return doc

# TF-IDF vectorizer for post titles. Tokenization and preprocessing are bypassed
# because preprocess() has already produced token lists.
title_tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
title_tfidf.fit(processed_data['title'])
with open('title_vectorizer.pkl', 'wb') as fout:
    pickle.dump(title_tfidf, fout)
# TF-IDF vectorizer for comment text.
comments_tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
comments_tfidf.fit(processed_data['comments'])
with open('comments_vectorizer.pkl', 'wb') as fout:
    pickle.dump(comments_tfidf, fout)
# TF-IDF vectorizer for the content scraped from linked URLs.
urlcontent_tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
urlcontent_tfidf.fit(processed_data['url_content'])
with open('urlcontent_vectorizer.pkl', 'wb') as fout:
    pickle.dump(urlcontent_tfidf, fout)
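
# A minimal usage sketch (not part of the original pipeline): load a saved
# vectorizer and turn the preprocessed token lists into a sparse TF-IDF matrix.
#
#   with open('title_vectorizer.pkl', 'rb') as f:
#       loaded_title_tfidf = pickle.load(f)
#   title_matrix = loaded_title_tfidf.transform(processed_data['title'])
#   print(title_matrix.shape)  # (number of posts, vocabulary size)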