From ae06bcb59419a477ae0ee20707354fa6e993fe04 Mon Sep 17 00:00:00 2001 From: knowledge27 <84104476+KT-27@users.noreply.github.com> Date: Sat, 12 Jun 2021 07:47:52 +0530 Subject: [PATCH] added steps for better cleaning --- Data Transformation/integrated/transform_description.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Data Transformation/integrated/transform_description.py b/Data Transformation/integrated/transform_description.py index 8538d3c..ebe20ff 100644 --- a/Data Transformation/integrated/transform_description.py +++ b/Data Transformation/integrated/transform_description.py @@ -7,7 +7,8 @@ from nltk.stem.porter import * from stemming.porter2 import stem import spacy - +from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize def transform(desc): desc = removeNondecodableChars(desc) @@ -16,6 +17,10 @@ def transform(desc): desc = removePunctuation(desc) desc = stemming(desc) # print 'Description Transformation Ended ' + stop_words = set(stopwords.words('english')) + word_tokens = word_tokenize(desc) + filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words] + desc=' '.join(filtered_sentence) return desc @@ -26,7 +31,7 @@ def new_transform(desc): desc = removePunctuation(desc) # desc = lemmatize_spacy(desc) desc = stemming(desc) - + return desc