# -*- coding: utf-8 -*-
"""nlp_data_preprocessing.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1yzzUDX-cpixlPNXubnJpD9BJpMaRK6fm
"""
# !pip install gensim
# !pip install nltk

import pickle

import numpy as np
import pandas as pd

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

np.random.seed(10)

# The lemmatizer needs the WordNet corpora.
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet
# Load the scraped Reddit data and keep only the columns used downstream.
df = pd.read_csv("raw_reddit_news_posts_comments.csv")
model_data = df[["created_utc", "title", "comments", "url_content"]]
# model_data.head()
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result."""
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize text, drop stopwords and short tokens, and lemmatize/stem the rest."""
    result = []
    # Strip Reddit placeholders left by removed/deleted posts and comments.
    text = text.replace('[removed]', '').replace('[deleted]', '')
    for token in simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
# doc_sample = model_data["comments"][0]
# print('original document: ')
# words = []
# for word in doc_sample.split(' '):
# words.append(word)
# print(words)
# print('\n\n tokenized and lemmatized document: ')
# print(preprocess(doc_sample))
# Run the preprocessing pipeline over every text column and save the result.
processed_data = pd.DataFrame()
processed_data['date'] = model_data["created_utc"]
processed_data['title'] = model_data['title'].map(preprocess)
processed_data['comments'] = model_data['comments'].map(preprocess)
processed_data['url_content'] = model_data['url_content'].map(preprocess)
processed_data.to_csv("preprocessed_reddit_data.csv", index=False)
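
# Note: to_csv() writes each token list as its string representation, so a later
# consumer has to parse it back. A minimal sketch (assuming the CSV is reloaded
# with pandas; not part of the original pipeline):
#
#   import ast
#   reloaded = pd.read_csv("preprocessed_reddit_data.csv")
#   reloaded['title'] = reloaded['title'].map(ast.literal_eval)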
def dummy_fun(doc):
    """Identity function: the columns already contain token lists."""
    return doc

# TF-IDF vectorizer for post titles. Tokenization and preprocessing are bypassed
# because preprocess() has already produced token lists.
title_tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
title_tfidf.fit(processed_data['title'])
with open('title_vectorizer.pkl', 'wb') as fout:
    pickle.dump(title_tfidf, fout)
# TF-IDF vectorizer for comment text.
comments_tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
comments_tfidf.fit(processed_data['comments'])
with open('comments_vectorizer.pkl', 'wb') as fout:
    pickle.dump(comments_tfidf, fout)
# TF-IDF vectorizer for the content scraped from linked URLs.
urlcontent_tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
urlcontent_tfidf.fit(processed_data['url_content'])
with open('urlcontent_vectorizer.pkl', 'wb') as fout:
    pickle.dump(urlcontent_tfidf, fout)
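
# A minimal usage sketch (not part of the original pipeline): load a saved
# vectorizer and turn the preprocessed token lists into a sparse TF-IDF matrix.
#
#   with open('title_vectorizer.pkl', 'rb') as f:
#       loaded_title_tfidf = pickle.load(f)
#   title_matrix = loaded_title_tfidf.transform(processed_data['title'])
#   print(title_matrix.shape)  # (number of posts, vocabulary size)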