1- from typing import Dict , List , Set
1+ from typing import Dict , List
22import math
33from collections import Counter
44import re
5+ from sklearn .feature_extraction .text import TfidfVectorizer
6+ from sklearn .metrics .pairwise import cosine_similarity
7+ import numpy as np
58
69
710class TextSimilarity :
8- """Class for computing text similarity using various methods."""
9-
10- def __init__ (self ):
11- self .idf_cache : Dict [str , float ] = {}
11+ """Class for computing text similarity using TF-IDF and cosine similarity."""
1212
1313 def preprocess (self , text : str ) -> List [str ]:
1414 """
@@ -44,6 +44,7 @@ def compute_tf(self, tokens: List[str]) -> Dict[str, float]:
4444 def compute_idf (self , documents : List [List [str ]]) -> Dict [str , float ]:
4545 """
4646 Compute Inverse Document Frequency (IDF) for a corpus.
47+ Uses scikit-learn's formula: log((1 + n) / (1 + df)) + 1
4748
4849 Args:
4950 documents: List of tokenized documents
@@ -59,7 +60,8 @@ def compute_idf(self, documents: List[List[str]]) -> Dict[str, float]:
5960 for token in unique_tokens :
6061 doc_freq [token ] += 1
6162
62- idf = {token : math .log (num_docs / (freq + 1 )) + 1
63+ # Use scikit-learn's IDF formula for consistency
64+ idf = {token : math .log ((1 + num_docs ) / (1 + freq )) + 1
6365 for token , freq in doc_freq .items ()}
6466
6567 return idf
@@ -80,7 +82,7 @@ def compute_tfidf(self, tokens: List[str], idf: Dict[str, float]) -> Dict[str, f
8082 for token , tf_val in tf .items ()}
8183 return tfidf
8284
83- def cosine_similarity (self , vec1 : Dict [str , float ], vec2 : Dict [str , float ]) -> float :
85+ def cosine_similarity_manual (self , vec1 : Dict [str , float ], vec2 : Dict [str , float ]) -> float :
8486 """
8587 Compute cosine similarity between two vectors.
8688
@@ -109,31 +111,9 @@ def cosine_similarity(self, vec1: Dict[str, float], vec2: Dict[str, float]) -> f
109111
110112 return dot_product / (magnitude1 * magnitude2 )
111113
def jaccard_similarity(self, tokens1: List[str], tokens2: List[str]) -> float:
    """
    Compute Jaccard similarity between two token lists.

    Duplicates are ignored: each list is reduced to its set of distinct
    tokens, and the score is the size of the overlap divided by the size
    of the combined vocabulary.

    Args:
        tokens1: First list of tokens
        tokens2: Second list of tokens

    Returns:
        Jaccard similarity score (0 to 1); 0.0 when both lists are empty
    """
    first_vocab, second_vocab = set(tokens1), set(tokens2)
    combined = first_vocab.union(second_vocab)

    # Two empty inputs have no vocabulary at all; avoid dividing by zero.
    if not combined:
        return 0.0

    shared = first_vocab.intersection(second_vocab)
    return len(shared) / len(combined)
134- def tfidf_similarity (self , text1 : str , text2 : str , corpus : List [str ] = None ) -> float :
114+ def tfidf_similarity_manual (self , text1 : str , text2 : str , corpus : List [str ] = None ) -> float :
135115 """
136- Compute similarity using TF-IDF and cosine similarity.
116+ Compute similarity using manual TF-IDF and cosine similarity.
137117
138118 Args:
139119 text1: First text document
@@ -164,56 +144,52 @@ def tfidf_similarity(self, text1: str, text2: str, corpus: List[str] = None) ->
164144 tfidf2 = self .compute_tfidf (tokens2 , idf )
165145
166146 # Compute cosine similarity
167- return self .cosine_similarity (tfidf1 , tfidf2 )
147+ return self .cosine_similarity_manual (tfidf1 , tfidf2 )
168148
169- def simple_similarity (self , text1 : str , text2 : str , method : str = "cosine" ) -> float :
def tfidf_similarity_sklearn(self, text1: str, text2: str, corpus: List[str] = None) -> float:
    """
    Compute similarity using scikit-learn TF-IDF and cosine similarity.

    The vectorizer is fitted on ``corpus`` (when given) followed by the two
    texts, so both texts always contribute to the vocabulary and IDF weights.

    Args:
        text1: First text document
        text2: Second text document
        corpus: Optional corpus for IDF calculation

    Returns:
        Similarity score (0 to 1) as a plain Python float. Returns 0.0 when
        none of the documents produces a token (empty vocabulary).
    """
    # text1 and text2 are appended last so their rows can be addressed
    # from the end of the matrix regardless of the corpus size. Unpacking
    # builds a new list and accepts any iterable corpus, not only a list.
    if corpus is None:
        documents = [text1, text2]
    else:
        documents = [*corpus, text1, text2]

    # Create TF-IDF vectorizer with L2 normalization (default)
    vectorizer = TfidfVectorizer(norm='l2')
    try:
        tfidf_matrix = vectorizer.fit_transform(documents)
    except ValueError as exc:
        # scikit-learn signals "no tokens in any document" with a
        # ValueError; treat that as "nothing in common" instead of
        # crashing. Any other ValueError is a real problem and propagates.
        if "empty vocabulary" not in str(exc):
            raise
        return 0.0

    # The last two rows are the TF-IDF vectors for text1 and text2.
    vec1 = tfidf_matrix[-2]
    vec2 = tfidf_matrix[-1]

    # cosine_similarity returns a 1x1 array here; take its single entry.
    similarity = cosine_similarity(vec1, vec2)[0][0]

    # Convert the NumPy scalar to a built-in float and clamp, because
    # floating-point rounding can land marginally outside [0, 1].
    return min(1.0, max(0.0, float(similarity)))
190178
191179
192180# Example usage
if __name__ == "__main__":
    # Two short sample sentences that share a handful of words.
    first_text = "The quick brown fox jumps over the lazy dog"
    second_text = "The lazy dog sleeps under the tree"

    comparer = TextSimilarity()

    # Score from the hand-written TF-IDF pipeline.
    manual_score = comparer.tfidf_similarity_manual(first_text, second_text)
    print(f"Manual TF-IDF Cosine Similarity: {manual_score:.4f} ")

    # Score from the scikit-learn based pipeline.
    sklearn_score = comparer.tfidf_similarity_sklearn(first_text, second_text)
    print(f"Scikit-learn TF-IDF Cosine Similarity: {sklearn_score:.4f} ")

    # Show how far apart the two implementations are.
    gap = abs(manual_score - sklearn_score)
    print(f"\n Difference: {gap:.4f} ")
0 commit comments