Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Virtual Environments
.venv/
env/
__pycache__/
*.pyc

# Massive Datasets & Videos
*.mp4
*.mkv
*.avi
*.wav
Multi label Audiotory Dataset from Diverse Indian Urban Environments/
Indian_sounds_dataset/
*.csv

# Output Files (Generated dynamically)
*.srt
phase2_audio_events.json
phase3_multimodal_events.json

# IDEs & System
.vscode/
.idea/
.DS_Store
Binary file added indian_sounds_model.pkl
Binary file not shown.
122 changes: 122 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import argparse
import json
import os
import time

from src.media_processor import MediaProcessor
from src.audio_analyzer import AudioAnalyzer
from src.visual_analyzer import VisualAnalyzer
from src.caption_generator import CaptionGenerator

def print_header(text):
    """Print *text* as a banner framed above and below by a 50-character '=' rule.

    A blank line is emitted before the top rule, and the text is indented
    by one space.
    """
    rule = "=" * 50
    print(f"\n{rule}\n {text}\n{rule}")

def main(video_path, context):
    """Run the four-phase captioning pipeline on one video.

    Phases: (1) extract and load the audio track, (2) detect non-speech audio
    events, (3) score each sufficiently confident event for on-screen visual
    reaction, (4) filter the events and write ``output.srt``.

    Intermediate results are written to ``phase2_audio_events.json`` and
    ``phase3_multimodal_events.json`` in the current working directory.

    Args:
        video_path: Path to the input video file.
        context: ``'indian'`` enables HPSS and the custom classifier; any
            other value runs the standard YAMNet-only path.
    """
    start_time = time.time()

    # Setup routing flags based on user context: both switches follow it.
    use_hpss = use_custom_model = (context == 'indian')

    if use_custom_model:
        print_header("INDIAN CONTEXT DETECTED: Enabling HPSS and Custom Models")
    else:
        print_header("GENERAL CONTEXT DETECTED: Standard YAMNet processing")

    # ---------------------------------------------------------
    # PHASE 1: Media Processing
    # ---------------------------------------------------------
    print_header("PHASE 1: MEDIA PROCESSING")
    mp = MediaProcessor(video_path)
    audio_path = None
    output_srt = "output.srt"

    # Everything after the processor is created runs under try/finally so the
    # video handle and the extracted audio file are released even when a
    # later phase raises.
    try:
        audio_path = mp.extract_audio()
        waveform, sr = mp.load_audio(audio_path)
        print(f"Loaded audio waveform. Sample rate: {sr} Hz")

        # ---------------------------------------------------------
        # PHASE 2: Audio Analysis (YAMNet + Custom Indian Sounds ML)
        # ---------------------------------------------------------
        print_header("PHASE 2: MULTIMODAL AUDIO ANALYSIS")
        aa = AudioAnalyzer()

        # Process audio with context-aware routing
        audio_events = aa.process_full_audio(
            waveform,
            sr,
            use_custom_model=use_custom_model,
            use_hpss=use_hpss,
        )

        # Save intermediate JSON
        with open("phase2_audio_events.json", "w", encoding="utf-8") as f:
            json.dump(audio_events, f, indent=4)
        print(f"Found {len(audio_events)} non-speech audio events. Saved to phase2_audio_events.json")

        # ---------------------------------------------------------
        # PHASE 3: Visual Analysis (MediaPipe Reaction Tracking)
        # ---------------------------------------------------------
        print_header("PHASE 3: VISUAL REACTION ANALYSIS")
        va = VisualAnalyzer()
        multimodal_events = []

        total_events = len(audio_events)
        for idx, event in enumerate(audio_events, 1):
            # The first entry of 'events' is the best prediction for the frame.
            best_pred = event['events'][0]

            # OPTIMIZATION: Skip very weak sounds that aren't worth heavy visual analysis
            if best_pred['confidence'] < 0.3:
                continue

            # Print progress on the same line
            print(f"Processing Visuals for Event {idx}/{total_events}...", end="\r")

            # OPTIMIZATION: Extract 5 frames instead of 15 (cuts ML workload by 66%)
            frames = mp.get_frame_sequence(event['timestamp'], event['end_timestamp'], max_frames=5)

            # Calculate visual reaction variance
            visual_score = va.analyze_sequence_for_reaction(frames)

            multimodal_events.append({
                'timestamp': event['timestamp'],
                'end_timestamp': event['end_timestamp'],
                'label': best_pred['label'],
                'audio_confidence': best_pred['confidence'],
                'visual_significance': visual_score,
            })

        with open("phase3_multimodal_events.json", "w", encoding="utf-8") as f:
            json.dump(multimodal_events, f, indent=4)
        print("Visual analysis complete. Saved to phase3_multimodal_events.json")

        # ---------------------------------------------------------
        # PHASE 4: Decision Engine & Subtitle Generation
        # ---------------------------------------------------------
        print_header("PHASE 4: CAPTION GENERATION")
        cg = CaptionGenerator()

        # This will apply thresholds and build output.srt
        cg.filter_and_generate(multimodal_events, output_srt)
    finally:
        # Cleanup: always release the video handle, then remove the extracted
        # audio file if extraction got far enough to produce a path.
        try:
            mp.close()
        finally:
            if audio_path and os.path.exists(audio_path):
                os.remove(audio_path)

    elapsed = time.time() - start_time
    print_header(f"PIPELINE COMPLETE ({elapsed:.1f}s)")
    print(f"Subtitles generated at: {output_srt}")

if __name__ == "__main__":
    # Command-line entry point: parse the two options and hand them to main().
    cli = argparse.ArgumentParser(description="AutoCC: Multimodal Video Captioning")
    cli.add_argument("--input", required=True, help="Path to input video file")
    cli.add_argument(
        "--context",
        type=str,
        choices=['general', 'indian'],
        default='general',
        help="Select 'indian' to enable HPSS music stripping and localized ML models.",
    )
    options = cli.parse_args()

    main(options.input, options.context)
74 changes: 74 additions & 0 deletions pull_request_description.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
## Resolves Issue #2 and Issue #26
- **Resolves #2:** [DMP 2026] Create Intelligent Closed Caption (CC) Suggestion Tool
- **Resolves #26:** YAMNet's Western training bias causes systematic misdetection of India-specific sounds in educational content.

---

### 🎥 Demo Link
**[View Pipeline Execution Demo](https://drive.google.com/file/d/1UkbEMbsKTS_MZD_Jet65KIK8X9Ib1sjU/view?usp=sharing)**

---

### 🚀 Overview
This PR completely overhauls the **AutoCC Multimodal Pipeline** to solve critical localization, inference overhead, and foley-misclassification issues. By injecting an intelligent context-routing engine, we bypass YAMNet's inherent Western acoustic biases and gracefully handle dense, music-heavy audio environments.

### ⚙️ Pipeline Explanation
The AutoCC engine operates in 4 highly optimized phases:
1. **Media Processing:** Extracts the raw audio waveform and efficiently sets up `cv2` video pointers in RAM for zero-latency frame jumping.
2. **Multimodal Audio Analysis:** Chunks audio into 0.96s frames. Extracts 1024-D embeddings via YAMNet, routes them through a custom Local Context classifier, and logs potential subtitle events.
3. **Visual Reaction Analysis:** Uses MediaPipe Pose & Face Mesh to analyze the video frames matching the audio timestamps. Calculates the variance of physical movement (flinching/reacting) to confirm if the audio event is visually significant to the scene.
4. **Intelligent Caption Generation:** Applies thresholds, maps foley anomalies to semantic movie actions (e.g., `Sewing Machine` ➔ `[Rapid punches]`), and generates the final context-aware `output.srt`.

---

### 🧠 Unique Architectural Approaches

#### 1. Overcoming Western Bias via Transfer Learning (Custom RF Classifier)
*YAMNet natively misclassifies localized sounds (e.g., it cannot identify a Rickshaw Horn or a Dhak drum, mapping them to generic bells or noise).*
- **The Solution:** Rather than expensively fine-tuning YAMNet from scratch, we implemented a highly efficient **Transfer Learning override**.
- The pipeline natively extracts the 1024-D embeddings from YAMNet and passes them into a custom-trained `RandomForestClassifier` (trained on 5,800+ clips from the SAS-KIIT and Mendeley Indian Urban Environment datasets).
- If the custom model recognizes a localized sound with at least 55% confidence, it intercepts the generic prediction and injects the culturally accurate label (e.g., `Indian Crowd/Human (Local Context)`).

#### 2. Defeating Background Interference via HPSS Music Stripping
*Indian educational and cinematic media is notorious for aggressive background music. This causes YAMNet to endlessly detect "Music," masking the actual ambient events and stalling the pipeline with hundreds of false-positive visual checks.*
- **The Solution:** We implemented **Harmonic-Percussive Source Separation (HPSS)** using `librosa`.
- When the user triggers the script with `--context indian`, the script performs an acoustic "X-Ray." It mathematically splits the waveform, throws away the "Harmonic" frequencies (melodic music, sustained chords), and only feeds the raw "Percussive" transients (horns, crashes, dog barks) into YAMNet.
- This enables flawless detection of hidden ambient noises even underneath a blaring soundtrack.

#### 3. Intelligent Foley-to-Semantic Mapping
*Audio models are "blind" and take sounds literally. Rapid punches in an action scene are systematically mislabeled by YAMNet as a `[Sewing Machine]` or `[Fusillade]` due to acoustic similarities.*
- **The Solution:** Implemented a hardcoded Context-Mapping dictionary inside the `CaptionGenerator`.
- By combining **MediaPipe Visual Variance** (confirming human movement on-screen) with Foley mapping, a `[Sewing Machine]` detection coupled with a high visual flinch score is intelligently rewritten into `[Rapid punches]`.

---

### 🛠️ Additional Optimizations Included
- **C-API Crash Prevention:** Pinned numpy strictly to `<2.0.0` to resolve fatal `_multiarray_umath` crashes with TensorFlow.
- **I/O Overhead Fix:** Refactored `MediaProcessor` to persist the `cv2.VideoCapture` object in RAM, cutting video processing time from 10+ minutes down to ~15 seconds by eliminating redundant disk-reads.

---

### 📦 Installation & Requirements
To run this pipeline, install the dependencies using the newly provided `requirements.txt` file.
> [!WARNING]
> **CRITICAL:** The `requirements.txt` explicitly pins `numpy<2.0`. TensorFlow's C-API crashes when running YAMNet on newer versions of NumPy.

```bash
pip install -r requirements.txt
```

---

### 💻 How to Run

To run the pipeline on a standard/Western video:
```bash
python main.py --input sample_video.mp4 --context general
```

To run the pipeline on an Indian cinematic/educational video (enables HPSS Music Stripping & Local Models):
```bash
python main.py --input sample_video.mp4 --context indian
```

The final context-aware subtitles will be saved directly to `output.srt`.
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
tensorflow
tensorflow-hub
mediapipe
moviepy
opencv-python
librosa
scikit-learn
numpy<2.0
115 changes: 115 additions & 0 deletions src/audio_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import os
import librosa
try:
import joblib
except ImportError:
joblib = None

class AudioAnalyzer:
    """Detect non-speech audio events with YAMNet, optionally refined by a custom classifier.

    YAMNet scores every frame of the waveform; labels matching
    ``ignore_keywords`` are discarded. When a custom classifier has been
    loaded and is enabled, a confident prediction from it on the frame's
    embedding replaces YAMNet's label.
    """

    # YAMNet expects 16 kHz input and frames it into 0.96 s windows that
    # advance by 0.48 s, so score row i starts at i * 0.48 s, not i * 0.96 s.
    YAMNET_SAMPLE_RATE = 16000
    FRAME_WINDOW_SECONDS = 0.96
    FRAME_HOP_SECONDS = 0.48

    def __init__(self):
        """Load YAMNet, its class map, and (if available) the custom classifier."""
        print("Loading YAMNet model from TensorFlow Hub...")
        self.model = hub.load('https://tfhub.dev/google/yamnet/1')
        self.class_map_path = self.model.class_map_path().numpy()
        self.labels = self.load_class_map(self.class_map_path)

        # Filter out speech, ambient noise, and continuous music/singing (reduces overhead drastically)
        self.ignore_keywords = [
            'Speech', 'Narration', 'Silence', 'Inside, small room',
            'Outside, rural or natural', 'Noise', 'Environmental noise',
            'Music', 'Singing', 'Humming', 'Lullaby', 'Vocal music',
            'A capella', 'Chant', 'Mantra', 'Bird'
        ]

        # Attempt to load custom Indian sounds classifier
        self.custom_clf = None
        if joblib:
            model_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'indian_sounds_model.pkl')
            if os.path.exists(model_path):
                print("Loading custom Indian Sounds classifier...")
                # NOTE(security): joblib.load unpickles arbitrary Python
                # objects. Only ship/load a model file from a trusted source.
                self.custom_clf = joblib.load(model_path)
            else:
                print(f"Custom model not found at {model_path}. Using base YAMNet only.")

    def load_class_map(self, csv_path):
        """Return the ``display_name`` column of the class-map CSV as a list, in file order."""
        with tf.io.gfile.GFile(csv_path) as csvfile:
            return [row['display_name'] for row in csv.DictReader(csvfile)]

    def is_speech_or_music(self, label):
        """Return True if *label* contains any ignore keyword (case-insensitive substring match)."""
        lowered = label.lower()
        return any(keyword.lower() in lowered for keyword in self.ignore_keywords)

    def _predict_custom(self, embedding):
        """Return ``(label, confidence)`` from the custom classifier for one frame embedding."""
        features = np.asarray(embedding).reshape(1, -1)
        label = self.custom_clf.predict(features)[0]
        confidence = float(np.max(self.custom_clf.predict_proba(features)))
        return label, confidence

    def process_full_audio(self, waveform, sample_rate, use_custom_model=True, use_hpss=False):
        """Run YAMNet over the whole waveform and return a timeline of non-speech events.

        Args:
            waveform: 1-D audio samples (converted to float32 before inference).
            sample_rate: Sample rate of *waveform* in Hz. Audio that is not
                already at 16 kHz is resampled, since YAMNet requires 16 kHz.
            use_custom_model: If True and a custom classifier was loaded, let
                it override YAMNet's label when its confidence is >= 0.55.
            use_hpss: If True, keep only the percussive component of the
                signal (harmonic-percussive source separation) before analysis.

        Returns:
            A list of dicts with keys ``timestamp`` and ``end_timestamp``
            (seconds) and ``events``: up to three ``{"label", "confidence"}``
            dicts with distinct labels, best first. A frame is only included
            when its best remaining prediction has confidence > 0.1.
        """
        print("Analyzing audio track with YAMNet...")

        waveform = np.asarray(waveform, dtype=np.float32)

        # YAMNet requires exactly 16000 Hz audio; feeding any other rate would
        # distort both the predictions and the frame timing.
        if sample_rate != self.YAMNET_SAMPLE_RATE:
            print(f"Resampling audio from {sample_rate} Hz to {self.YAMNET_SAMPLE_RATE} Hz for YAMNet...")
            waveform = librosa.resample(
                waveform, orig_sr=sample_rate, target_sr=self.YAMNET_SAMPLE_RATE
            )

        if use_hpss:
            print("Applying Harmonic-Percussive Source Separation (HPSS) to strip background music...")
            import warnings
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # Keep only the percussive elements (hits, noise, speech) and discard harmonic (music)
                _, waveform = librosa.effects.hpss(waveform)

        # YAMNet requires a float32 waveform
        waveform = waveform.astype(np.float32)

        scores, embeddings, _spectrogram = self.model(waveform)
        scores_np = scores.numpy()  # Shape: (N, 521)

        custom_enabled = bool(use_custom_model) and self.custom_clf is not None
        # Convert embeddings once, and only when the custom model will use them.
        embeddings_np = embeddings.numpy() if custom_enabled else None

        events_timeline = []

        for i, frame_scores in enumerate(scores_np):
            # Get top 5 predictions for this frame
            top_indices = np.argsort(frame_scores)[::-1][:5]

            # The custom prediction depends only on the frame embedding, so it
            # is evaluated lazily at most once per frame.
            custom_prediction = None
            results = []
            seen_labels = set()

            for idx in top_indices:
                prob = float(frame_scores[idx])
                label = self.labels[idx]

                # --- Custom Transfer Learning Override ---
                if custom_enabled and prob > 0.1:
                    if custom_prediction is None:
                        custom_prediction = self._predict_custom(embeddings_np[i])
                    custom_label, custom_prob = custom_prediction

                    # If the custom model is highly confident, override YAMNet's generic label
                    if custom_prob >= 0.55:
                        label = f"{custom_label} (Local Context)"
                        prob = custom_prob  # Use the custom model's confidence
                # -----------------------------------------

                # Skip ignored categories, and skip repeats: an override maps
                # several YAMNet candidates onto the same custom label.
                if label in seen_labels or self.is_speech_or_music(label):
                    continue

                seen_labels.add(label)
                results.append({"label": label, "confidence": prob})
                if len(results) >= 3:
                    break

            # If we have a significant non-speech event, log it
            if results and results[0]['confidence'] > 0.1:
                timestamp = i * self.FRAME_HOP_SECONDS
                events_timeline.append({
                    "timestamp": timestamp,
                    "end_timestamp": timestamp + self.FRAME_WINDOW_SECONDS,
                    "events": results
                })

        return events_timeline
Loading