ivanpenaloza
diff --git a/‎…ons/round_1/01_word_frequency_counter.py‎ ‎…ons/round_1/01_word_frequency_counter.py‎src/my_project/interviews/nlp_coding_questions/round_1/01_word_frequency_counter.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/01_word_frequency_counter.py b/‎…ons/round_1/01_word_frequency_counter.py‎ ‎…ons/round_1/01_word_frequency_counter.py‎src/my_project/interviews/nlp_coding_questions/round_1/01_word_frequency_counter.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/01_word_frequency_counter.py
diff --git a/‎…d_1/02_text_cleaning_and_tokenization.py‎ ‎…d_1/02_text_cleaning_and_tokenization.py‎src/my_project/interviews/nlp_coding_questions/round_1/02_text_cleaning_and_tokenization.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/02_text_cleaning_and_tokenization.py b/‎…d_1/02_text_cleaning_and_tokenization.py‎ ‎…d_1/02_text_cleaning_and_tokenization.py‎src/my_project/interviews/nlp_coding_questions/round_1/02_text_cleaning_and_tokenization.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/02_text_cleaning_and_tokenization.py
diff --git a/‎…questions/round_1/03_stopword_removal.py‎ ‎…questions/round_1/03_stopword_removal.py‎src/my_project/interviews/nlp_coding_questions/round_1/03_stopword_removal.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/03_stopword_removal.py b/‎…questions/round_1/03_stopword_removal.py‎ ‎…questions/round_1/03_stopword_removal.py‎src/my_project/interviews/nlp_coding_questions/round_1/03_stopword_removal.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/03_stopword_removal.py
diff --git a/‎src/my_project/interviews/common_nlp_coding_questions/round_1/04_sentiment_analysis.py‎
Lines changed: 158 additions & 0 deletions b/‎src/my_project/interviews/common_nlp_coding_questions/round_1/04_sentiment_analysis.py‎
Lines changed: 158 additions & 0 deletions
diff --git a/‎…und_1/05_named_entity_recognition_ner.py‎ ‎…und_1/05_named_entity_recognition_ner.py‎src/my_project/interviews/nlp_coding_questions/round_1/05_named_entity_recognition_ner.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/05_named_entity_recognition_ner.py b/‎…und_1/05_named_entity_recognition_ner.py‎ ‎…und_1/05_named_entity_recognition_ner.py‎src/my_project/interviews/nlp_coding_questions/round_1/05_named_entity_recognition_ner.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/05_named_entity_recognition_ner.py
diff --git a/‎…_questions/round_1/06_text_similarity.py‎ ‎…_questions/round_1/06_text_similarity.py‎src/my_project/interviews/nlp_coding_questions/round_1/06_text_similarity.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/06_text_similarity.py
Lines changed: 38 additions & 62 deletions b/‎…_questions/round_1/06_text_similarity.py‎ ‎…_questions/round_1/06_text_similarity.py‎src/my_project/interviews/nlp_coding_questions/round_1/06_text_similarity.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/06_text_similarity.py
Lines changed: 38 additions & 62 deletions
diff --git a/‎…g_questions/round_1/07_topic_modeling.py‎ ‎…g_questions/round_1/07_topic_modeling.py‎src/my_project/interviews/nlp_coding_questions/round_1/07_topic_modeling.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/07_topic_modeling.py b/‎…g_questions/round_1/07_topic_modeling.py‎ ‎…g_questions/round_1/07_topic_modeling.py‎src/my_project/interviews/nlp_coding_questions/round_1/07_topic_modeling.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/07_topic_modeling.py
diff --git a/‎…tions/round_1/088_text_generation_rnn.py‎ ‎…tions/round_1/088_text_generation_rnn.py‎src/my_project/interviews/nlp_coding_questions/round_1/088_text_generation_rnn.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/088_text_generation_rnn.py b/‎…tions/round_1/088_text_generation_rnn.py‎ ‎…tions/round_1/088_text_generation_rnn.py‎src/my_project/interviews/nlp_coding_questions/round_1/088_text_generation_rnn.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/088_text_generation_rnn.py
diff --git a/‎…stions/round_1/08_text_generation_rnn.py‎ ‎…stions/round_1/08_text_generation_rnn.py‎src/my_project/interviews/nlp_coding_questions/round_1/08_text_generation_rnn.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/08_text_generation_rnn.py b/‎…stions/round_1/08_text_generation_rnn.py‎ ‎…stions/round_1/08_text_generation_rnn.py‎src/my_project/interviews/nlp_coding_questions/round_1/08_text_generation_rnn.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/08_text_generation_rnn.py
diff --git a/‎…stions/round_1/09_name_entity_linking.py‎ ‎…stions/round_1/09_name_entity_linking.py‎src/my_project/interviews/nlp_coding_questions/round_1/09_name_entity_linking.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/09_name_entity_linking.py b/‎…stions/round_1/09_name_entity_linking.py‎ ‎…stions/round_1/09_name_entity_linking.py‎src/my_project/interviews/nlp_coding_questions/round_1/09_name_entity_linking.py renamed to src/my_project/interviews/common_nlp_coding_questions/round_1/09_name_entity_linking.py
@@ -0,0 +1,158 @@
+from typing import Dict, Literal, List
+import re
+
+# ============================================================================
+# METHOD 1: Using Transformers (BERT, RoBERTa, DistilBERT)
+# ============================================================================
+class TransformerSentimentAnalyzer:
+    """
+    Sentiment analysis using pre-trained transformer models.
+    Install: pip install transformers torch
+    """
+
+    def __init__(self, model_name: str = "distilbert-base-uncased-finetuned-sst-2-english"):
+        """
+        Initialize with a pre-trained model.
+
+        Popular models:
+        - distilbert-base-uncased-finetuned-sst-2-english (fast, lightweight)
+        - cardiffnlp/twitter-roberta-base-sentiment (social media)
+        - nlptown/bert-base-multilingual-uncased-sentiment (multilingual)
+        """
+        from transformers import pipeline
+        self.classifier = pipeline('sentiment-analysis', model=model_name)
+
+
+    def analyze(self, text: str) -> Dict:
+        """Analyze one text; returns its lowercased label and score rounded to 4 places."""
+        result = self.classifier(text)[0]
+        return {
+            'sentiment': result['label'].lower(),
+            'confidence': round(result['score'], 4)
+        }
+
+
+    def batch_analyze(self, texts: List[str]) -> List[Dict]:
+        """Analyze multiple texts; returns one dict per classifier result, in order."""
+        # Hand the whole list to the pipeline in a single call.
+        results = self.classifier(texts)
+        return [
+            {
+                'sentiment': r['label'].lower(),
+                'confidence': round(r['score'], 4)
+            }
+            for r in results
+        ]
+
+
+
+# ============================================================================
+# TESTING AND EXAMPLES
+# ============================================================================
+if __name__ == "__main__":
+    
+    print("="*80)
+    print("TRANSFORMER SENTIMENT ANALYSIS - TESTING")
+    print("="*80 + "\n")
+    
+    # Initialize the analyzer
+    print("Loading model... (this may take a moment on first run)")
+    analyzer = TransformerSentimentAnalyzer()
+    print("Model loaded successfully!\n")
+    
+    # ========================================================================
+    # TEST 1: Single Text Analysis
+    # ========================================================================
+    print("TEST 1: Single Text Analysis")
+    print("-"*80)
+    
+    single_text = "This movie is absolutely amazing! I loved every minute of it."
+    result = analyzer.analyze(single_text)
+    
+    print(f"Text: {single_text}")
+    print(f"Sentiment: {result['sentiment'].upper()}")
+    print(f"Confidence: {result['confidence']*100:.2f}%\n")
+    
+    # ========================================================================
+    # TEST 2: Multiple Different Sentiments
+    # ========================================================================
+    print("\nTEST 2: Multiple Different Sentiments")
+    print("-"*80)
+    
+    test_cases = [
+        "I absolutely love this product! It's the best thing ever!",
+        "This is terrible. Worst experience of my life.",
+        "It's okay, nothing special but not bad either.",
+        "The customer service was outstanding and very helpful.",
+        "I'm so disappointed and frustrated with this purchase.",
+    ]
+    
+    for i, text in enumerate(test_cases, 1):
+        result = analyzer.analyze(text)
+        print(f"\n{i}. Text: {text}")
+        print(f"   Sentiment: {result['sentiment'].upper()}")
+        print(f"   Confidence: {result['confidence']*100:.2f}%")
+    
+    # ========================================================================
+    # TEST 3: Batch Analysis (More Efficient)
+    # ========================================================================
+    print("\n\nTEST 3: Batch Analysis")
+    print("-"*80)
+    
+    batch_texts = [
+        "Great service!",
+        "Horrible experience.",
+        "Not impressed at all.",
+        "Fantastic product, highly recommend!",
+        "Could be better, but acceptable."
+    ]
+    
+    batch_results = analyzer.batch_analyze(batch_texts)
+    
+    for text, result in zip(batch_texts, batch_results):
+        print(f"\nText: {text}")
+        print(f"Sentiment: {result['sentiment'].upper()} ({result['confidence']*100:.2f}%)")
+    
+    # ========================================================================
+    # TEST 4: Edge Cases
+    # ========================================================================
+    print("\n\nTEST 4: Edge Cases")
+    print("-"*80)
+    
+    edge_cases = [
+        "😊❤️",  # Emojis
+        "Not bad at all!",  # Negation
+        "I don't hate it.",  # Double negation
+        "",  # Empty (skipped by the loop below)
+        "Meh.",  # Ambiguous
+    ]
+    
+    for text in edge_cases:
+        if text:  # Skip empty strings
+            result = analyzer.analyze(text)
+            print(f"\nText: '{text}'")
+            print(f"Sentiment: {result['sentiment'].upper()}")
+            print(f"Confidence: {result['confidence']*100:.2f}%")
+    
+    # ========================================================================
+    # TEST 5: Summary Statistics
+    # ========================================================================
+    print("\n\n" + "="*80)
+    print("SUMMARY STATISTICS")
+    print("="*80)
+    
+    all_texts = test_cases + batch_texts
+    all_results = analyzer.batch_analyze(all_texts)
+    
+    positive_count = sum(1 for r in all_results if r['sentiment'] == 'positive')
+    negative_count = sum(1 for r in all_results if r['sentiment'] == 'negative')
+    avg_confidence = sum(r['confidence'] for r in all_results) / len(all_results)
+    
+    print(f"\nTotal texts analyzed: {len(all_texts)}")
+    print(f"Positive sentiments: {positive_count}")
+    print(f"Negative sentiments: {negative_count}")
+    print(f"Average confidence: {avg_confidence*100:.2f}%")
+    
+    print("\n" + "="*80)
+    print("Testing completed!")
+    print("="*80)
@@ -1,14 +1,14 @@
-from typing import Dict, List, Set
+from typing import Dict, List
 import math
 from collections import Counter
 import re
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
 
 
 class TextSimilarity:
-    """Class for computing text similarity using various methods."""
-    
-    def __init__(self):
-        self.idf_cache: Dict[str, float] = {}
+    """Class for computing text similarity using TF-IDF and cosine similarity."""
 
     def preprocess(self, text: str) -> List[str]:
         """
@@ -44,6 +44,7 @@ def compute_tf(self, tokens: List[str]) -> Dict[str, float]:
     def compute_idf(self, documents: List[List[str]]) -> Dict[str, float]:
         """
         Compute Inverse Document Frequency (IDF) for a corpus.
+        Uses scikit-learn's formula: log((1 + n) / (1 + df)) + 1
         
         Args:
             documents: List of tokenized documents
@@ -59,7 +60,8 @@ def compute_idf(self, documents: List[List[str]]) -> Dict[str, float]:
             for token in unique_tokens:
                 doc_freq[token] += 1
 
-        idf = {token: math.log(num_docs / (freq + 1)) + 1 
+        # Use scikit-learn's IDF formula for consistency
+        idf = {token: math.log((1 + num_docs) / (1 + freq)) + 1 
                for token, freq in doc_freq.items()}
 
         return idf
@@ -80,7 +82,7 @@ def compute_tfidf(self, tokens: List[str], idf: Dict[str, float]) -> Dict[str, f
                  for token, tf_val in tf.items()}
         return tfidf
 
-    def cosine_similarity(self, vec1: Dict[str, float], vec2: Dict[str, float]) -> float:
+    def cosine_similarity_manual(self, vec1: Dict[str, float], vec2: Dict[str, float]) -> float:
         """
         Compute cosine similarity between two vectors.
         
@@ -109,31 +111,9 @@ def cosine_similarity(self, vec1: Dict[str, float], vec2: Dict[str, float]) -> f
 
         return dot_product / (magnitude1 * magnitude2)
 
-    def jaccard_similarity(self, tokens1: List[str], tokens2: List[str]) -> float:
-        """
-        Compute Jaccard similarity between two token lists.
-        
-        Args:
-            tokens1: First list of tokens
-            tokens2: Second list of tokens
-            
-        Returns:
-            Jaccard similarity score (0 to 1)
-        """
-        set1 = set(tokens1)
-        set2 = set(tokens2)
-        
-        intersection = len(set1 & set2)
-        union = len(set1 | set2)
-        
-        if union == 0:
-            return 0.0
-        
-        return intersection / union
-    
-    def tfidf_similarity(self, text1: str, text2: str, corpus: List[str] = None) -> float:
+    def tfidf_similarity_manual(self, text1: str, text2: str, corpus: List[str] = None) -> float:
         """
-        Compute similarity using TF-IDF and cosine similarity.
+        Compute similarity using manual TF-IDF and cosine similarity.
         
         Args:
             text1: First text document
@@ -164,56 +144,52 @@ def tfidf_similarity(self, text1: str, text2: str, corpus: List[str] = None) ->
         tfidf2 = self.compute_tfidf(tokens2, idf)
 
         # Compute cosine similarity
-        return self.cosine_similarity(tfidf1, tfidf2)
+        return self.cosine_similarity_manual(tfidf1, tfidf2)
 
-    def simple_similarity(self, text1: str, text2: str, method: str = "cosine") -> float:
+    def tfidf_similarity_sklearn(self, text1: str, text2: str, corpus: List[str] = None) -> float:
         """
-        Compute similarity using simple term frequency.
+        Compute similarity using scikit-learn TF-IDF and cosine similarity.
         
         Args:
             text1: First text document
             text2: Second text document
-            method: "cosine" or "jaccard"
+            corpus: Optional corpus for IDF calculation
             
         Returns:
             Similarity score (0 to 1)
         """
-        tokens1 = self.preprocess(text1)
-        tokens2 = self.preprocess(text2)
+        # Build corpus if not provided
+        if corpus is None:
+            documents = [text1, text2]
+        else:
+            documents = corpus + [text1, text2]
 
-        if method == "jaccard":
-            return self.jaccard_similarity(tokens1, tokens2)
-        else:  # cosine
-            tf1 = self.compute_tf(tokens1)
-            tf2 = self.compute_tf(tokens2)
-            return self.cosine_similarity(tf1, tf2)
+        # Create TF-IDF vectorizer with L2 normalization (default)
+        vectorizer = TfidfVectorizer(norm='l2')
+        tfidf_matrix = vectorizer.fit_transform(documents)
+        
+        # Get the TF-IDF vectors for text1 and text2
+        vec1 = tfidf_matrix[-2]
+        vec2 = tfidf_matrix[-1]
+        
+        # Compute cosine similarity
+        similarity = cosine_similarity(vec1, vec2)[0][0]
+        return similarity
 
 
 # Example usage
 if __name__ == "__main__":
     sim = TextSimilarity()
 
-    # Example 1: Simple cosine similarity
     doc1 = "The quick brown fox jumps over the lazy dog"
     doc2 = "The lazy dog sleeps under the tree"
 
-    score = sim.simple_similarity(doc1, doc2, method="cosine")
-    print(f"Cosine Similarity: {score:.4f}")
-    
-    # Example 2: Jaccard similarity
-    score = sim.simple_similarity(doc1, doc2, method="jaccard")
-    print(f"Jaccard Similarity: {score:.4f}")
-    
-    # Example 3: TF-IDF similarity
-    corpus = [
-        "The quick brown fox jumps over the lazy dog",
-        "The lazy dog sleeps under the tree",
-        "A quick brown dog runs in the park"
-    ]
+    # Manual TF-IDF implementation
+    score_manual = sim.tfidf_similarity_manual(doc1, doc2)
+    print(f"Manual TF-IDF Cosine Similarity: {score_manual:.4f}")
 
-    score = sim.tfidf_similarity(doc1, doc2, corpus)
-    print(f"TF-IDF Cosine Similarity: {score:.4f}")
+    # Scikit-learn TF-IDF implementation
+    score_sklearn = sim.tfidf_similarity_sklearn(doc1, doc2)
+    print(f"Scikit-learn TF-IDF Cosine Similarity: {score_sklearn:.4f}")
 
-    # Example 4: Identical documents
-    score = sim.tfidf_similarity(doc1, doc1)
-    print(f"Identical Document Similarity: {score:.4f}")
+    print(f"\nDifference: {abs(score_manual - score_sklearn):.4f}")