Skip to content

Commit 2dc4c0b

Browse files
committed
adding nlp algos
1 parent afa1c53 commit 2dc4c0b

24 files changed

Lines changed: 1308 additions & 62 deletions

src/my_project/interviews/nlp_coding_questions/round_1/01_word_frequency_counter.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/01_word_frequency_counter.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/02_text_cleaning_and_tokenization.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/02_text_cleaning_and_tokenization.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/03_stopword_removal.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/03_stopword_removal.py

File renamed without changes.
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
from typing import Dict, Literal, List
2+
import re
3+
4+
# ============================================================================
5+
# METHOD 1: Using Transformers (BERT, RoBERTa, DistilBERT)
6+
# ============================================================================
7+
class TransformerSentimentAnalyzer:
8+
"""
9+
Sentiment analysis using pre-trained transformer models.
10+
Install: pip install transformers torch
11+
"""
12+
13+
def __init__(self, model_name: str = "distilbert-base-uncased-finetuned-sst-2-english"):
14+
"""
15+
Initialize with a pre-trained model.
16+
17+
Popular models:
18+
- distilbert-base-uncased-finetuned-sst-2-english (fast, lightweight)
19+
- cardiffnlp/twitter-roberta-base-sentiment (social media)
20+
- nlptown/bert-base-multilingual-uncased-sentiment (multilingual)
21+
"""
22+
from transformers import pipeline
23+
self.classifier = pipeline('sentiment-analysis', model=model_name)
24+
25+
26+
def analyze(self, text: str) -> Dict:
27+
"""Analyze sentiment using transformer model."""
28+
result = self.classifier(text)[0]
29+
return {
30+
'sentiment': result['label'].lower(),
31+
'confidence': round(result['score'], 4)
32+
}
33+
34+
35+
def batch_analyze(self, texts: List[str]) -> List[Dict]:
36+
"""Analyze multiple texts efficiently."""
37+
results = self.classifier(text)
38+
39+
return [
40+
{
41+
'sentiment':r['label'].lower(),
42+
'confidence': round(r['score'], 4)
43+
}
44+
for r in results
45+
]
46+
47+
48+
49+
# ============================================================================
# TESTING AND EXAMPLES
# ============================================================================
if __name__ == "__main__":
    banner = "=" * 80
    rule = "-" * 80

    print(banner)
    print("TRANSFORMER SENTIMENT ANALYSIS - TESTING")
    print(banner + "\n")

    # Load the pipeline once up front; first run downloads model weights.
    print("Loading model... (this may take a moment on first run)")
    analyzer = TransformerSentimentAnalyzer()
    print("Model loaded successfully!\n")

    # ------------------------------------------------------------------
    # TEST 1: one-off analysis of a single text
    # ------------------------------------------------------------------
    print("TEST 1: Single Text Analysis")
    print(rule)

    single_text = "This movie is absolutely amazing! I loved every minute of it."
    outcome = analyzer.analyze(single_text)

    print(f"Text: {single_text}")
    print(f"Sentiment: {outcome['sentiment'].upper()}")
    print(f"Confidence: {outcome['confidence']*100:.2f}%\n")

    # ------------------------------------------------------------------
    # TEST 2: a spread of clearly positive/negative/neutral inputs
    # ------------------------------------------------------------------
    print("\nTEST 2: Multiple Different Sentiments")
    print(rule)

    test_cases = [
        "I absolutely love this product! It's the best thing ever!",
        "This is terrible. Worst experience of my life.",
        "It's okay, nothing special but not bad either.",
        "The customer service was outstanding and very helpful.",
        "I'm so disappointed and frustrated with this purchase.",
    ]

    for idx, sample in enumerate(test_cases, start=1):
        outcome = analyzer.analyze(sample)
        print(f"\n{idx}. Text: {sample}")
        print(f"   Sentiment: {outcome['sentiment'].upper()}")
        print(f"   Confidence: {outcome['confidence']*100:.2f}%")

    # ------------------------------------------------------------------
    # TEST 3: batch API — one pipeline call for many texts
    # ------------------------------------------------------------------
    print("\n\nTEST 3: Batch Analysis")
    print(rule)

    batch_texts = [
        "Great service!",
        "Horrible experience.",
        "Not impressed at all.",
        "Fantastic product, highly recommend!",
        "Could be better, but acceptable."
    ]

    batch_results = analyzer.batch_analyze(batch_texts)

    for sample, outcome in zip(batch_texts, batch_results):
        print(f"\nText: {sample}")
        print(f"Sentiment: {outcome['sentiment'].upper()} ({outcome['confidence']*100:.2f}%)")

    # ------------------------------------------------------------------
    # TEST 4: tricky inputs — emojis, negation, the empty string
    # ------------------------------------------------------------------
    print("\n\nTEST 4: Edge Cases")
    print(rule)

    edge_cases = [
        "😊❤️",               # emojis only
        "Not bad at all!",     # negation
        "I don't hate it.",    # double negation
        "",                    # empty — skipped below
        "Meh.",                # ambiguous
    ]

    for sample in edge_cases:
        if not sample:  # empty strings are skipped rather than classified
            continue
        outcome = analyzer.analyze(sample)
        print(f"\nText: '{sample}'")
        print(f"Sentiment: {outcome['sentiment'].upper()}")
        print(f"Confidence: {outcome['confidence']*100:.2f}%")

    # ------------------------------------------------------------------
    # TEST 5: aggregate statistics over everything analyzed so far
    # ------------------------------------------------------------------
    print("\n\n" + banner)
    print("SUMMARY STATISTICS")
    print(banner)

    all_texts = test_cases + batch_texts
    all_results = analyzer.batch_analyze(all_texts)

    labels = [r['sentiment'] for r in all_results]
    positive_count = labels.count('positive')
    negative_count = labels.count('negative')
    avg_confidence = sum(r['confidence'] for r in all_results) / len(all_results)

    print(f"\nTotal texts analyzed: {len(all_texts)}")
    print(f"Positive sentiments: {positive_count}")
    print(f"Negative sentiments: {negative_count}")
    print(f"Average confidence: {avg_confidence*100:.2f}%")

    print("\n" + banner)
    print("Testing completed!")
    print(banner)

src/my_project/interviews/nlp_coding_questions/round_1/05_named_entity_recognition_ner.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/05_named_entity_recognition_ner.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/06_text_similarity.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/06_text_similarity.py

Lines changed: 38 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
1-
from typing import Dict, List, Set
1+
from typing import Dict, List
22
import math
33
from collections import Counter
44
import re
5+
from sklearn.feature_extraction.text import TfidfVectorizer
6+
from sklearn.metrics.pairwise import cosine_similarity
7+
import numpy as np
58

69

710
class TextSimilarity:
8-
"""Class for computing text similarity using various methods."""
9-
10-
def __init__(self):
11-
self.idf_cache: Dict[str, float] = {}
11+
"""Class for computing text similarity using TF-IDF and cosine similarity."""
1212

1313
def preprocess(self, text: str) -> List[str]:
1414
"""
@@ -44,6 +44,7 @@ def compute_tf(self, tokens: List[str]) -> Dict[str, float]:
4444
def compute_idf(self, documents: List[List[str]]) -> Dict[str, float]:
4545
"""
4646
Compute Inverse Document Frequency (IDF) for a corpus.
47+
Uses scikit-learn's formula: log((1 + n) / (1 + df)) + 1
4748
4849
Args:
4950
documents: List of tokenized documents
@@ -59,7 +60,8 @@ def compute_idf(self, documents: List[List[str]]) -> Dict[str, float]:
5960
for token in unique_tokens:
6061
doc_freq[token] += 1
6162

62-
idf = {token: math.log(num_docs / (freq + 1)) + 1
63+
# Use scikit-learn's IDF formula for consistency
64+
idf = {token: math.log((1 + num_docs) / (1 + freq)) + 1
6365
for token, freq in doc_freq.items()}
6466

6567
return idf
@@ -80,7 +82,7 @@ def compute_tfidf(self, tokens: List[str], idf: Dict[str, float]) -> Dict[str, f
8082
for token, tf_val in tf.items()}
8183
return tfidf
8284

83-
def cosine_similarity(self, vec1: Dict[str, float], vec2: Dict[str, float]) -> float:
85+
def cosine_similarity_manual(self, vec1: Dict[str, float], vec2: Dict[str, float]) -> float:
8486
"""
8587
Compute cosine similarity between two vectors.
8688
@@ -109,31 +111,9 @@ def cosine_similarity(self, vec1: Dict[str, float], vec2: Dict[str, float]) -> f
109111

110112
return dot_product / (magnitude1 * magnitude2)
111113

112-
def jaccard_similarity(self, tokens1: List[str], tokens2: List[str]) -> float:
113-
"""
114-
Compute Jaccard similarity between two token lists.
115-
116-
Args:
117-
tokens1: First list of tokens
118-
tokens2: Second list of tokens
119-
120-
Returns:
121-
Jaccard similarity score (0 to 1)
122-
"""
123-
set1 = set(tokens1)
124-
set2 = set(tokens2)
125-
126-
intersection = len(set1 & set2)
127-
union = len(set1 | set2)
128-
129-
if union == 0:
130-
return 0.0
131-
132-
return intersection / union
133-
134-
def tfidf_similarity(self, text1: str, text2: str, corpus: List[str] = None) -> float:
114+
def tfidf_similarity_manual(self, text1: str, text2: str, corpus: List[str] = None) -> float:
135115
"""
136-
Compute similarity using TF-IDF and cosine similarity.
116+
Compute similarity using manual TF-IDF and cosine similarity.
137117
138118
Args:
139119
text1: First text document
@@ -164,56 +144,52 @@ def tfidf_similarity(self, text1: str, text2: str, corpus: List[str] = None) ->
164144
tfidf2 = self.compute_tfidf(tokens2, idf)
165145

166146
# Compute cosine similarity
167-
return self.cosine_similarity(tfidf1, tfidf2)
147+
return self.cosine_similarity_manual(tfidf1, tfidf2)
168148

169-
def simple_similarity(self, text1: str, text2: str, method: str = "cosine") -> float:
149+
def tfidf_similarity_sklearn(self, text1: str, text2: str, corpus: List[str] = None) -> float:
170150
"""
171-
Compute similarity using simple term frequency.
151+
Compute similarity using scikit-learn TF-IDF and cosine similarity.
172152
173153
Args:
174154
text1: First text document
175155
text2: Second text document
176-
method: "cosine" or "jaccard"
156+
corpus: Optional corpus for IDF calculation
177157
178158
Returns:
179159
Similarity score (0 to 1)
180160
"""
181-
tokens1 = self.preprocess(text1)
182-
tokens2 = self.preprocess(text2)
161+
# Build corpus if not provided
162+
if corpus is None:
163+
documents = [text1, text2]
164+
else:
165+
documents = corpus + [text1, text2]
183166

184-
if method == "jaccard":
185-
return self.jaccard_similarity(tokens1, tokens2)
186-
else: # cosine
187-
tf1 = self.compute_tf(tokens1)
188-
tf2 = self.compute_tf(tokens2)
189-
return self.cosine_similarity(tf1, tf2)
167+
# Create TF-IDF vectorizer with L2 normalization (default)
168+
vectorizer = TfidfVectorizer(norm='l2')
169+
tfidf_matrix = vectorizer.fit_transform(documents)
170+
171+
# Get the TF-IDF vectors for text1 and text2
172+
vec1 = tfidf_matrix[-2]
173+
vec2 = tfidf_matrix[-1]
174+
175+
# Compute cosine similarity
176+
similarity = cosine_similarity(vec1, vec2)[0][0]
177+
return similarity
190178

191179

192180
# Example usage
193181
if __name__ == "__main__":
194182
sim = TextSimilarity()
195183

196-
# Example 1: Simple cosine similarity
197184
doc1 = "The quick brown fox jumps over the lazy dog"
198185
doc2 = "The lazy dog sleeps under the tree"
199186

200-
score = sim.simple_similarity(doc1, doc2, method="cosine")
201-
print(f"Cosine Similarity: {score:.4f}")
202-
203-
# Example 2: Jaccard similarity
204-
score = sim.simple_similarity(doc1, doc2, method="jaccard")
205-
print(f"Jaccard Similarity: {score:.4f}")
206-
207-
# Example 3: TF-IDF similarity
208-
corpus = [
209-
"The quick brown fox jumps over the lazy dog",
210-
"The lazy dog sleeps under the tree",
211-
"A quick brown dog runs in the park"
212-
]
187+
# Manual TF-IDF implementation
188+
score_manual = sim.tfidf_similarity_manual(doc1, doc2)
189+
print(f"Manual TF-IDF Cosine Similarity: {score_manual:.4f}")
213190

214-
score = sim.tfidf_similarity(doc1, doc2, corpus)
215-
print(f"TF-IDF Cosine Similarity: {score:.4f}")
191+
# Scikit-learn TF-IDF implementation
192+
score_sklearn = sim.tfidf_similarity_sklearn(doc1, doc2)
193+
print(f"Scikit-learn TF-IDF Cosine Similarity: {score_sklearn:.4f}")
216194

217-
# Example 4: Identical documents
218-
score = sim.tfidf_similarity(doc1, doc1)
219-
print(f"Identical Document Similarity: {score:.4f}")
195+
print(f"\nDifference: {abs(score_manual - score_sklearn):.4f}")

src/my_project/interviews/nlp_coding_questions/round_1/07_topic_modeling.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/07_topic_modeling.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/088_text_generation_rnn.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/088_text_generation_rnn.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/08_text_generation_rnn.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/08_text_generation_rnn.py

File renamed without changes.

src/my_project/interviews/nlp_coding_questions/round_1/09_name_entity_linking.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/09_name_entity_linking.py

File renamed without changes.

0 commit comments

Comments
 (0)