-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTFidfVectorizer.py
More file actions
35 lines (27 loc) · 1.12 KB
/
TFidfVectorizer.py
File metadata and controls
35 lines (27 loc) · 1.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Print the TF-IDF score of every term in every entry of the 'Name' column
# of an Excel sheet.

# Load the Excel file
df = pd.read_excel(r'C:\\Users\Amorii7.AMORII7\Desktop\\vicinitas_search_results-2.xlsx')

# Replace NaN values with an empty string *before* extracting the documents:
# the list below is a copy of the column, so filling the frame afterwards
# would leave NaN entries in it and TfidfVectorizer rejects NaN documents.
df = df.fillna('')

# Get the text data as a list; astype(str) guards against non-text cells
# (numbers, dates) that the vectorizer cannot lowercase/tokenize.
documents = df['Name'].astype(str).tolist()

# Create a TfidfVectorizer instance
vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights and vectorize every document in one pass
# instead of calling transform() once per document inside the loop.
tf_idf_matrix = vectorizer.fit_transform(documents)

# Get the feature names (column index -> term)
feature_names = vectorizer.get_feature_names_out()

# Walk the CSR matrix row by row: indptr[i]:indptr[i + 1] delimits the
# non-zero entries (column indices and scores) belonging to document i.
indptr = tf_idf_matrix.indptr
indices = tf_idf_matrix.indices
data = tf_idf_matrix.data

# Loop through each document and print its TF-IDF features
for i, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])):
    print(f"Document {i+1}:")
    for feature_index, feature_score in zip(indices[start:end], data[start:end]):
        feature_name = feature_names[feature_index]
        print(f" {feature_name}: {feature_score}")