PlanetRead · krishk2 · May 16, 2026 · May 16, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,24 @@
+# Virtual Environments
+.venv/
+env/
+__pycache__/
+*.pyc
+
+# Massive Datasets & Videos
+*.mp4
+*.mkv
+*.avi
+*.wav
+Multi label Audiotory Dataset from Diverse Indian Urban Environments/
+Indian_sounds_dataset/
+*.csv
+
+# Output Files (Generated dynamically)
+*.srt
+phase2_audio_events.json
+phase3_multimodal_events.json
+
+# IDEs & System
+.vscode/
+.idea/
+.DS_Store
diff --git a/indian_sounds_model.pkl b/indian_sounds_model.pkl
diff --git a/main.py b/main.py
@@ -0,0 +1,122 @@
+import argparse
+import json
+import os
+import time
+
+from src.media_processor import MediaProcessor
+from src.audio_analyzer import AudioAnalyzer
+from src.visual_analyzer import VisualAnalyzer
+from src.caption_generator import CaptionGenerator
+
+def print_header(text):
+    print(f"\n{'='*50}")
+    print(f" {text}")
+    print(f"{'='*50}")
+
+def main(video_path, context):
+    start_time = time.time()
+
+    # Setup routing flags based on user context
+    use_hpss = False
+    use_custom_model = False
+
+    if context == 'indian':
+        use_hpss = True
+        use_custom_model = True
+        print_header("INDIAN CONTEXT DETECTED: Enabling HPSS and Custom Models")
+    else:
+        print_header("GENERAL CONTEXT DETECTED: Standard YAMNet processing")
+
+    # ---------------------------------------------------------
+    # PHASE 1: Media Processing
+    # ---------------------------------------------------------
+    print_header("PHASE 1: MEDIA PROCESSING")
+    mp = MediaProcessor(video_path)
+    audio_path = mp.extract_audio()
+    waveform, sr = mp.load_audio(audio_path)
+    print(f"Loaded audio waveform. Sample rate: {sr} Hz")
+
+    # ---------------------------------------------------------
+    # PHASE 2: Audio Analysis (YAMNet + Custom Indian Sounds ML)
+    # ---------------------------------------------------------
+    print_header("PHASE 2: MULTIMODAL AUDIO ANALYSIS")
+    aa = AudioAnalyzer()
+
+    # Process audio with context-aware routing
+    audio_events = aa.process_full_audio(
+        waveform, 
+        sr, 
+        use_custom_model=use_custom_model, 
+        use_hpss=use_hpss
+    )
+
+    # Save intermediate JSON
+    with open("phase2_audio_events.json", "w") as f:
+        json.dump(audio_events, f, indent=4)
+    print(f"Found {len(audio_events)} non-speech audio events. Saved to phase2_audio_events.json")
+
+    # ---------------------------------------------------------
+    # PHASE 3: Visual Analysis (MediaPipe Reaction Tracking)
+    # ---------------------------------------------------------
+    print_header("PHASE 3: VISUAL REACTION ANALYSIS")
+    va = VisualAnalyzer()
+    multimodal_events = []
+
+    total_events = len(audio_events)
+    for idx, event in enumerate(audio_events, 1):
+        # OPTIMIZATION: Skip very weak sounds that aren't worth heavy visual analysis
+        if event['events'][0]['confidence'] < 0.3:
+            continue
+
+        # Print progress on the same line
+        print(f"Processing Visuals for Event {idx}/{total_events}...", end="\r")
+
+        # OPTIMIZATION: Extract 5 frames instead of 15 (cuts ML workload by 66%)
+        frames = mp.get_frame_sequence(event['timestamp'], event['end_timestamp'], max_frames=5)
+
+        # Calculate visual reaction variance
+        visual_score = va.analyze_sequence_for_reaction(frames)
+
+        # Extract best label and confidence from YAMNet output
+        best_pred = event['events'][0]
+
+        multimodal_event = {
+            'timestamp': event['timestamp'],
+            'end_timestamp': event['end_timestamp'],
+            'label': best_pred['label'],
+            'audio_confidence': best_pred['confidence'],
+            'visual_significance': visual_score
+        }
+        multimodal_events.append(multimodal_event)
+
+    with open("phase3_multimodal_events.json", "w") as f:
+        json.dump(multimodal_events, f, indent=4)
+    print("Visual analysis complete. Saved to phase3_multimodal_events.json")
+
+    # ---------------------------------------------------------
+    # PHASE 4: Decision Engine & Subtitle Generation
+    # ---------------------------------------------------------
+    print_header("PHASE 4: CAPTION GENERATION")
+    cg = CaptionGenerator()
+
+    # This will apply thresholds and build output.srt
+    output_srt = "output.srt"
+    cg.filter_and_generate(multimodal_events, output_srt)
+
+    # Cleanup
+    mp.close()
+    if os.path.exists(audio_path):
+        os.remove(audio_path)
+
+    elapsed = time.time() - start_time
+    print_header(f"PIPELINE COMPLETE ({elapsed:.1f}s)")
+    print(f"Subtitles generated at: {output_srt}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="AutoCC: Multimodal Video Captioning")
+    parser.add_argument("--input", required=True, help="Path to input video file")
+    parser.add_argument("--context", type=str, choices=['general', 'indian'], default='general', 
+                        help="Select 'indian' to enable HPSS music stripping and localized ML models.")
+    args = parser.parse_args()
+
+    main(args.input, args.context)
diff --git a/pull_request_description.md b/pull_request_description.md
@@ -0,0 +1,74 @@
+## Resolves Issue #2 and Issue #26
+- **Resolves #2:** [DMP 2026] Create Intelligent Closed Caption (CC) Suggestion Tool
+- **Resolves #26:** YAMNet's Western training bias causes systematic miss-detection of India-specific sounds in educational content.
+
+---
+
+### 🎥 Demo Link
+**[View Pipeline Execution Demo](https://drive.google.com/file/d/1UkbEMbsKTS_MZD_Jet65KIK8X9Ib1sjU/view?usp=sharing)**
+
+---
+
+### 🚀 Overview
+This PR completely overhauls the **AutoCC Multimodal Pipeline** to solve critical localization, inference overhead, and foley-misclassification issues. By injecting an intelligent context-routing engine, we bypass YAMNet's inherent Western acoustic biases and gracefully handle dense, music-heavy audio environments.
+
+### ⚙️ Pipeline Explanation
+The AutoCC engine operates in 4 highly optimized phases:
+1. **Media Processing:** Extracts the raw audio waveform and efficiently sets up `cv2` video pointers in RAM for zero-latency frame jumping.
+2. **Multimodal Audio Analysis:** Chunks audio into 0.96s frames. Extracts 1024-D embeddings via YAMNet, routes them through a custom Local Context classifier, and logs potential subtitle events.
+3. **Visual Reaction Analysis:** Uses MediaPipe Pose & Face Mesh to analyze the video frames matching the audio timestamps. Calculates the variance of physical movement (flinching/reacting) to confirm if the audio event is visually significant to the scene.
+4. **Intelligent Caption Generation:** Applies thresholds, maps foley anomalies to semantic movie actions (e.g., `Sewing Machine` ➔ `[Rapid punches]`), and generates the final context-aware `output.srt`.
+
+---
+
+### 🧠 Unique Architectural Approaches
+
+#### 1. Overcoming Western Bias via Transfer Learning (Custom RF Classifier)
+*YAMNet natively misclassifies localized sounds (e.g., it cannot identify a Rickshaw Horn or a Dhak drum, mapping them to generic bells or noise).*
+- **The Solution:** Rather than expensively fine-tuning YAMNet from scratch, we implemented a highly efficient **Transfer Learning override**. 
+- The pipeline natively extracts the 1024-D embeddings from YAMNet and passes them into a custom-trained `RandomForestClassifier` (trained on 5,800+ clips from the SAS-KIIT and Mendeley Indian Urban Environment datasets). 
+- If the custom model recognizes a localized sound with >55% confidence, it intercepts the generic prediction and injects the culturally accurate label (e.g., `Indian Crowd/Human (Local Context)`).
+
+#### 2. Defeating Background Interference via HPSS Music Stripping
+*Indian educational and cinematic media is notorious for aggressive background music. This causes YAMNet to endlessly detect "Music," masking the actual ambient events and stalling the pipeline with hundreds of false-positive visual checks.*
+- **The Solution:** We implemented **Harmonic-Percussive Source Separation (HPSS)** using `librosa`.
+- When the user triggers the script with `--context indian`, the script performs an acoustic "X-Ray." It mathematically splits the waveform, throws away the "Harmonic" frequencies (melodic music, sustained chords), and only feeds the raw "Percussive" transients (horns, crashes, dog barks) into YAMNet. 
+- This enables flawless detection of hidden ambient noises even underneath a blaring soundtrack.
+
+#### 3. Intelligent Foley-to-Semantic Mapping
+*Audio models are "blind" and take sounds literally. Rapid punches in an action scene are systematically mislabeled by YAMNet as a `[Sewing Machine]` or `[Fusillade]` due to acoustic similarities.*
+- **The Solution:** Implemented a hardcoded Context-Mapping dictionary inside the `CaptionGenerator`. 
+- By combining **MediaPipe Visual Variance** (confirming human movement on-screen) with Foley mapping, a `[Sewing Machine]` detection coupled with a high visual flinch score is intelligently rewritten into `[Rapid punches]`. 
+
+---
+
+### 🛠️ Additional Optimizations Included
+- **C-API Crash Prevention:** Pinned numpy strictly to `<2.0.0` to resolve fatal `_multiarray_umath` crashes with TensorFlow.
+- **I/O Overhead Fix:** Refactored `MediaProcessor` to persist the `cv2.VideoCapture` object in RAM, cutting video processing time from 10+ minutes down to ~15 seconds by eliminating redundant disk-reads.
+
+---
+
+### 📦 Installation & Requirements
+To run this pipeline, install the dependencies using the newly provided `requirements.txt` file. 
+> [!WARNING]
+> **CRITICAL:** The `requirements.txt` explicitly pins `numpy<2.0`. TensorFlow's C-API crashes when running YAMNet on newer versions of NumPy.
+
+```bash
+pip install -r requirements.txt
+```
+
+---
+
+### 💻 How to Run
+
+To run the pipeline on a standard/Western video:
+```bash
+python main.py --input sample_video.mp4 --context general
+```
+
+To run the pipeline on an Indian cinematic/educational video (enables HPSS Music Stripping & Local Models):
+```bash
+python main.py --input sample_video.mp4 --context indian
+```
+
+The final context-aware subtitles will be saved directly to `output.srt`.
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,8 @@
+tensorflow
+tensorflow-hub
+mediapipe
+moviepy
+opencv-python
+librosa
+scikit-learn
+numpy<2.0
diff --git a/src/audio_analyzer.py b/src/audio_analyzer.py
@@ -0,0 +1,115 @@
+import tensorflow as tf
+import tensorflow_hub as hub
+import numpy as np
+import csv
+import os
+import librosa
+try:
+    import joblib
+except ImportError:
+    joblib = None
+
+class AudioAnalyzer:
+    def __init__(self):
+        print("Loading YAMNet model from TensorFlow Hub...")
+        self.model = hub.load('https://tfhub.dev/google/yamnet/1')
+        self.class_map_path = self.model.class_map_path().numpy()
+        self.labels = self.load_class_map(self.class_map_path)
+
+        # Filter out speech, ambient noise, and continuous music/singing (reduces overhead drastically)
+        self.ignore_keywords = [
+            'Speech', 'Narration', 'Silence', 'Inside, small room', 
+            'Outside, rural or natural', 'Noise', 'Environmental noise',
+            'Music', 'Singing', 'Humming', 'Lullaby', 'Vocal music', 
+            'A capella', 'Chant', 'Mantra', 'Bird'
+        ]
+
+        # Attempt to load custom Indian sounds classifier
+        self.custom_clf = None
+        if joblib:
+            model_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'indian_sounds_model.pkl')
+            if os.path.exists(model_path):
+                print("Loading custom Indian Sounds classifier...")
+                self.custom_clf = joblib.load(model_path)
+            else:
+                print(f"Custom model not found at {model_path}. Using base YAMNet only.")
+
+    def load_class_map(self, csv_path):
+        labels = []
+        with tf.io.gfile.GFile(csv_path) as csvfile:
+            reader = csv.DictReader(csvfile)
+            for row in reader:
+                labels.append(row['display_name'])
+        return labels
+
+    def is_speech_or_music(self, label):
+        for keyword in self.ignore_keywords:
+            if keyword.lower() in label.lower():
+                return True
+        return False
+
+    def process_full_audio(self, waveform, sample_rate, use_custom_model=True, use_hpss=False):
+        """
+        Runs YAMNet over the entire waveform. 
+        YAMNet natively processes in fast 0.96s chunks.
+        """
+        print("Analyzing audio track with YAMNet...")
+
+        if use_hpss:
+            print("Applying Harmonic-Percussive Source Separation (HPSS) to strip background music...")
+            import warnings
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                # Keep only the percussive elements (hits, noise, speech) and discard harmonic (music)
+                _, waveform = librosa.effects.hpss(waveform)
+
+        # YAMNet requires exactly 16000 Hz float32 waveform
+        waveform = waveform.astype(np.float32)
+
+        scores, embeddings, spectrogram = self.model(waveform)
+        scores_np = scores.numpy() # Shape: (N, 521)
+
+        events_timeline = []
+
+        # YAMNet processes audio in 0.96s frames.
+        frame_duration = 0.96 
+
+        for i in range(len(scores_np)):
+            frame_scores = scores_np[i]
+            # Get top 5 predictions for this chunk
+            top_indices = np.argsort(frame_scores)[::-1][:5]
+
+            results = []
+            for idx in top_indices:
+                prob = float(frame_scores[idx])
+                label = self.labels[idx]
+
+                # --- Custom Transfer Learning Override ---
+                if use_custom_model and self.custom_clf and prob > 0.1:
+                    # Pass this 0.96s frame's embedding to our custom model
+                    chunk_embedding = embeddings[i].numpy().reshape(1, -1)
+                    custom_label = self.custom_clf.predict(chunk_embedding)[0]
+                    custom_prob = np.max(self.custom_clf.predict_proba(chunk_embedding))
+
+                    # If the custom model is highly confident, override YAMNet's generic label
+                    if custom_prob >= 0.55:
+                        label = f"{custom_label} (Local Context)"
+                        prob = custom_prob  # Use the custom model's confidence
+                # -----------------------------------------
+
+                if not self.is_speech_or_music(label):
+                    results.append({"label": label, "confidence": prob})
+                    if len(results) >= 3:
+                        break
+
+            timestamp = i * frame_duration
+
+            # If we have a significant non-speech event, log it
+            if results and results[0]['confidence'] > 0.1:
+                events_timeline.append({
+                    "timestamp": timestamp,
+                    "end_timestamp": timestamp + frame_duration,
+                    "events": results
+                })
+
+        return events_timeline