diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..318b75d --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +# Virtual environment +venv/ +.venv/ + +# Python cache +__pycache__/ +*.pyc + +# VS Code +.vscode/ + +# macOS +.DS_Store diff --git a/README.md b/README.md new file mode 100644 index 0000000..37e1822 --- /dev/null +++ b/README.md @@ -0,0 +1,272 @@ +# Intelligent CC Suggestion Tool (DMP 2026) + +A Python pipeline that detects non-speech audio events in a video and checks whether there's a visible reaction on screen before flagging it as a CC candidate. The idea is to avoid dumping every background sound into the caption track — only events that actually affect what's happening on screen should get a CC. + +This submission covers: + +* Goal 1 — Sound Event Detection +* Goal 2 — Speaker Reaction Detection + +--- + +# File Structure + +```text +. +├── sound_event_detector.py # Goal 1 +├── reaction_detector.py # Goal 2 +├── requirements.txt +└── README.md +``` + +--- + +# Prerequisites + +You'll need `ffmpeg` installed on your machine. + +## macOS + +```bash +brew install ffmpeg +``` + +## Ubuntu / Debian + +```bash +sudo apt install ffmpeg +``` + +--- + +# Setup & Installation + +## 1. Clone the repository + +```bash +git clone +cd Intelligent-cc-generation +``` + +## 2. Create a virtual environment + +### macOS / Linux + +```bash +python3 -m venv venv +``` + +### Windows + +```bash +python -m venv venv +``` + +--- + +## 3. Activate the virtual environment + +### macOS / Linux + +```bash +source venv/bin/activate +``` + +### Windows + +```bash +venv\Scripts\activate +``` + +--- + +## 4. Install dependencies + +```bash +pip install -r requirements.txt +``` + +The first run will download the YAMNet model (~25 MB) from TensorFlow Hub and cache it locally. Subsequent runs are instant. 
+ +--- + +# Goal 1 — Sound Event Detection + +Run: + +```bash +python3 sound_event_detector.py video.mp4 +``` + +This extracts the audio track via ffmpeg (converting to mono 16kHz WAV, which is what YAMNet expects), runs it through YAMNet, and returns a list of non-speech events with timestamps and confidence scores. + +## Implementation details + +### Speech filtering + +YAMNet class indices `0–6` are the speech and voice classes at the top of the YAMNet class map: + +* Speech +* Child speech, kid speaking +* Conversation +* Narration, monologue +* Babbling +* Speech synthesizer +* Shout + +A frame whose top-scoring class is one of these is hard-dropped before anything else. For Hindi/regional content where dialogue is dense, doing this as a blocklist rather than a low-confidence filter makes a noticeable difference. + +--- + +### Event merging + +Consecutive YAMNet frames (one every ~0.48s) with the same label get merged into a single event. Peak confidence is preserved. + +Without this, a 2-second gunshot would show up as four separate entries. + +--- + +### CC label mapping + +There's a lookup table that maps YAMNet class names to readable CC labels: + +* `[gunshot]` +* `[glass breaking]` +* `[applause]` +* etc. + +Anything not in the table falls back to the first word of the YAMNet class name. + +--- + +## Sample output + +```text +3.36s – 5.28s [music] (conf: 0.87) [Music] +12.00s – 12.48s [gunshot] (conf: 0.74) [Gunshot, gunfire] +18.72s – 19.20s [applause] (conf: 0.61) [Applause] +``` + +--- + +# Goal 2 — Speaker Reaction Detection + +Run: + +```bash +python3 reaction_detector.py video.mp4 +``` + +This runs Goal 1 first, then for each detected event it pulls three frames from the video: + +* one just before the event midpoint +* one at the midpoint +* one just after + +It then scores how much visible change happened around that moment. + +--- + +## Reaction scoring pipeline + +### Motion score + +Computes grayscale pixel-diff between: + +* before/mid +* mid/after + +frame pairs. + +The mean diff intensity is normalized to `0–1`. 
+ +The higher of the two values is used so the system captures both anticipatory movement and delayed reactions. + +--- + +### Face score + +Runs Haar cascade face detection on the midpoint frame. + +The largest detected face is expressed as a fraction of the total frame area, then scaled up (×20) and capped at `1.0`. + +The intuition is that a close-up reacting face carries more signal than a tiny face in a wide shot. + +--- + +### Final reaction score + +```text +reaction_score = (0.7 × motion_score) + (0.3 × face_score) +``` + +Motion receives the higher weight because movement is generally a stronger reaction signal than mere face visibility. + +Events with a reaction score below `0.25` are filtered out entirely. + +--- + +## Sample output + +```text +=== Important CC Candidate Events === + +12.00s – 12.48s [gunshot] (audio:0.74) (motion:0.81) (face:0.55) (reaction:0.73) + +18.72s – 19.20s [applause] (audio:0.61) (motion:0.52) (face:0.44) (reaction:0.50) + +3.36s – 5.28s [music] (audio:0.87) (motion:0.03) (face:0.18) (reaction:0.08) ← filtered out +``` + +The background music event (shown here only for illustration; the tool does not print filtered events) gets dropped even though it had high audio confidence because there was no visible reaction associated with it. + +--- + +# Limitations & Next Steps + +## YAMNet and Indian content + +YAMNet is trained on AudioSet, which is heavily English/Western biased. + +Sounds like: + +* dhol +* firecrackers +* devotional music + +may not map cleanly to existing classes. + +The fallback label helps, but benchmarking on actual PlanetRead clips would be valuable. + +PANNs may provide better coverage for these cases. + +--- + +## Single-frame reaction scoring + +Currently the face score uses only the midpoint frame, and motion is measured only between three frames at fixed offsets (0.3 s before and after the event midpoint). + +If someone reacts slightly after an event, the system can miss it. + +Sampling a short temporal window and taking the peak score would improve robustness. 
# ---------------------------------
# Timestamp -> frame index
# ---------------------------------
def _frame_index(timestamp_sec, fps, fallback_fps=25.0):
    """Convert a timestamp in seconds to a non-negative frame index.

    Args:
        timestamp_sec: Position in the video, in seconds.
        fps: Frame rate reported by the capture; may be 0, negative or NaN
            when the container carries no usable frame-rate metadata.
        fallback_fps: Frame rate assumed when ``fps`` is unusable.

    Returns:
        The truncated frame index, clamped to 0 for negative timestamps.
    """
    # `fps <= 0` alone lets NaN through (every comparison with NaN is
    # False), after which int() would raise; this form rejects it too.
    if not (0 < fps < float("inf")):
        fps = fallback_fps

    return max(0, int(timestamp_sec * fps))


# ---------------------------------
# Get frame at timestamp
# ---------------------------------
def _get_frame(cap, timestamp_sec):
    """Seek ``cap`` to ``timestamp_sec`` and return the decoded frame.

    Args:
        cap: An opened ``cv2.VideoCapture``.
        timestamp_sec: Position to read, in seconds.

    Returns:
        The frame returned by ``cap.read()``, or ``None`` when the read
        fails (for example when seeking past the end of the video).
    """
    index = _frame_index(timestamp_sec, cap.get(cv2.CAP_PROP_FPS))
    cap.set(cv2.CAP_PROP_POS_FRAMES, index)

    ok, frame = cap.read()
    return frame if ok else None


# ---------------------------------
# Motion intensity between frames
# ---------------------------------
def _motion_score(frame1, frame2, *, full_scale=50.0):
    """Score the visual change between two BGR frames on a 0-1 scale.

    Both frames are converted to grayscale, and the mean absolute pixel
    difference is divided by ``full_scale`` and capped at 1.0.

    Args:
        frame1: First frame, or ``None``.
        frame2: Second frame, or ``None``.
        full_scale: Mean pixel difference that maps to a score of 1.0.

    Returns:
        A float in ``[0, 1]`` rounded to 4 decimals; ``0.0`` when either
        frame is missing.
    """
    if frame1 is None or frame2 is None:
        return 0.0

    gray1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)

    mean_diff = np.mean(cv2.absdiff(gray1, gray2))

    # Rough normalisation: a mean difference of `full_scale` counts as
    # maximal motion.
    return round(float(min(1.0, mean_diff / full_scale)), 4)


# ---------------------------------
# Face prominence score
# ---------------------------------
def _face_score(frame, *, gain=20):
    """Score how prominent the largest detected face is in ``frame``.

    Runs the module-level ``FACE_CASCADE`` detector on the grayscale frame,
    takes the largest detection's area as a fraction of the frame area,
    multiplies it by ``gain`` and caps the result at 1.0.

    Args:
        frame: BGR frame, or ``None``.
        gain: Multiplier applied to the face/frame area ratio.

    Returns:
        A float in ``[0, 1]`` rounded to 4 decimals; ``0.0`` when the frame
        is missing or no face is detected.
    """
    if frame is None:
        return 0.0

    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    faces = FACE_CASCADE.detectMultiScale(
        gray,
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(40, 40),
    )

    if len(faces) == 0:
        return 0.0

    largest_area = max(w * h for (_, _, w, h) in faces)
    frame_area = frame.shape[0] * frame.shape[1]

    return round(float(min(1.0, (largest_area / frame_area) * gain)), 4)


# ---------------------------------
# Main reaction scoring
# ---------------------------------
def detect_reactions(video_path, events, *, offset_sec=0.3):
    """Attach visual reaction scores to each sound event.

    For every event, three frames are sampled: ``offset_sec`` before the
    event midpoint, at the midpoint, and ``offset_sec`` after it. The
    motion score is the larger of the before/mid and mid/after scores; the
    face score is taken from the midpoint frame only.

    Args:
        video_path: Path of the video to open with OpenCV.
        events: Iterable of dicts, each with numeric ``"start"`` and
            ``"end"`` keys (seconds). Other keys are copied through.
        offset_sec: Distance of the before/after frames from the midpoint.

    Returns:
        A list with one dict per event: the original keys plus
        ``"motion_score"``, ``"face_score"`` and ``"reaction_score"``.
        No thresholding is applied here.

    Raises:
        RuntimeError: The video could not be opened.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        raise RuntimeError(f"Could not open video: {video_path}")

    results = []

    try:
        for event in events:
            midpoint = (event["start"] + event["end"]) / 2

            frame_before = _get_frame(cap, max(0.0, midpoint - offset_sec))
            frame_mid = _get_frame(cap, midpoint)
            frame_after = _get_frame(cap, midpoint + offset_sec)

            # Take the stronger of the two so that movement either leading
            # into or following the midpoint is counted.
            motion_score = max(
                _motion_score(frame_before, frame_mid),
                _motion_score(frame_mid, frame_after),
            )
            face_score = _face_score(frame_mid)

            reaction_score = round(
                (MOTION_WEIGHT * motion_score) + (FACE_WEIGHT * face_score),
                4,
            )

            results.append({
                **event,
                "motion_score": motion_score,
                "face_score": face_score,
                "reaction_score": reaction_score,
            })
    finally:
        # Release the capture even if scoring an event raises.
        cap.release()

    return results


# ---------------------------------
# CLI
# ---------------------------------
def _main(argv):
    """Run sound detection plus reaction scoring and print the survivors.

    Args:
        argv: Command-line arguments, ``argv[1]`` being the video path.

    Returns:
        Process exit status: 1 when the path argument is missing, else 0.
    """
    if len(argv) < 2:
        print("Usage: python reaction_detector.py <video_path>")
        return 1

    video_path = argv[1]

    print("Detecting sound events...")
    events = sound_event_detector.detect_events(video_path)

    if not events:
        print("No meaningful sound events detected.")
        return 0

    print("Analyzing visual reactions...")
    scored_events = detect_reactions(video_path, events)

    # Keep only stronger reactions
    filtered_events = [
        e for e in scored_events
        if e["reaction_score"] >= REACTION_THRESHOLD
    ]

    if not filtered_events:
        print("No strong reactions detected.")
        return 0

    print("\n=== Important CC Candidate Events ===\n")

    for e in filtered_events:
        print(
            f"{e['start']:.2f}s – {e['end']:.2f}s "
            f"{e['cc_text']} "
            f"(audio:{e['confidence']:.2f}) "
            f"(motion:{e['motion_score']:.2f}) "
            f"(face:{e['face_score']:.2f}) "
            f"(reaction:{e['reaction_score']:.2f})"
        )

    return 0


if __name__ == "__main__":
    sys.exit(_main(sys.argv))
def _extract_audio(video_path):
    """Extract the audio of ``video_path`` into a temporary WAV file.

    ffmpeg is asked for mono (``-ac 1``), 16 kHz (``-ar 16000``) output
    with the video stream dropped (``-vn``).

    Args:
        video_path: Path of the input video.

    Returns:
        Path of the WAV file. The caller owns it and must delete it.

    Raises:
        FileNotFoundError: The ``ffmpeg`` executable could not be found.
        subprocess.CalledProcessError: ffmpeg exited with a non-zero
            status; its diagnostics are available as ``.stderr`` (bytes).
    """
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()

    try:
        subprocess.run(
            [
                "ffmpeg", "-y", "-i", video_path,
                "-ac", "1", "-ar", "16000", "-vn",
                tmp.name,
            ],
            # Keep ffmpeg from reading the caller's stdin.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            # Captured so a failure carries ffmpeg's own error text.
            stderr=subprocess.PIPE,
            check=True,
        )
    except BaseException:
        # On failure the caller never learns the path, so the placeholder
        # file has to be removed here or it is leaked.
        if os.path.exists(tmp.name):
            os.unlink(tmp.name)
        raise

    return tmp.name


def _load_class_names(model):
    """Return the class display names listed in the model's class-map CSV.

    Args:
        model: Loaded model object exposing ``class_map_path()``.

    Returns:
        List of the ``display_name`` column values, in file order.
    """
    class_map_path = model.class_map_path().numpy().decode("utf-8")
    with tf.io.gfile.GFile(class_map_path) as f:
        return [row["display_name"] for row in csv.DictReader(f)]


def _read_wav(path):
    """Read a WAV file as float32 samples.

    Integer PCM is scaled to roughly ``[-1, 1]``: unsigned 8-bit data is
    re-centred around zero first, signed data is divided by the magnitude
    of its most negative value. Float data is only cast. Multi-channel
    data is averaged down to one channel.

    Args:
        path: Path of the WAV file.

    Returns:
        Tuple ``(sample_rate, samples)`` with ``samples`` a 1-D float32
        array.
    """
    sr, data = wavfile.read(path)
    dtype = data.dtype

    if dtype == np.uint8:
        # 8-bit WAV is unsigned with silence at 128.
        samples = (data.astype(np.float32) - 128.0) / 128.0
    elif np.issubdtype(dtype, np.signedinteger):
        # 32768.0 for int16, 2147483648.0 for int32.
        samples = data.astype(np.float32) / float(-np.iinfo(dtype).min)
    else:
        samples = data.astype(np.float32)

    if samples.ndim > 1:
        # wavfile.read returns (frames, channels) for multi-channel files.
        samples = samples.mean(axis=1)

    return sr, samples


def _cc_text(class_name, labels=None):
    """Map a YAMNet display name to a bracketed caption label.

    A key matches when it appears in ``class_name`` as a whole word
    (case-insensitive), optionally followed by ``s``, ``es`` or ``ing``.
    Plain substring matching would caption "Train" as ``[rain]`` and
    "Cattle, bovinae" as ``[cat meowing]``. The first matching key in
    table order wins.

    Args:
        class_name: YAMNet display name, e.g. ``"Gunshot, gunfire"``.
        labels: Mapping of key word -> caption label; defaults to the
            module-level ``CC_LABELS``.

    Returns:
        The mapped label, else ``"[<first word, lower-cased>]"``, else
        ``"[sound]"`` when ``class_name`` contains no word at all.
    """
    import re  # local import: `re` is not imported at module level

    table = CC_LABELS if labels is None else labels

    for key, label in table.items():
        pattern = rf"\b{re.escape(key)}(?:s|es|ing)?\b"
        if re.search(pattern, class_name, re.IGNORECASE):
            return label

    head = class_name.split(",")[0].split()
    if not head:
        return "[sound]"
    return f"[{head[0].lower()}]"


def _events_from_scores(scores, class_names):
    """Turn per-frame class scores into merged non-speech events.

    For each frame only the top-scoring class is considered. The frame is
    skipped when that class is in ``SPEECH_INDICES`` or scores below
    ``CONFIDENCE_THRESHOLD``. A frame that continues the previous event
    (same label, start not after the previous end) extends it and keeps
    the higher confidence.

    Args:
        scores: Sequence of per-frame score vectors, frame ``i`` starting
            at ``i * HOP_SIZE`` seconds.
        class_names: Display name for each class index.

    Returns:
        List of dicts with keys ``label``, ``cc_text``, ``confidence``,
        ``start`` and ``end``, in time order.
    """
    events = []

    for i, frame_scores in enumerate(scores):
        top_idx = int(np.argmax(frame_scores))
        if top_idx in SPEECH_INDICES:
            continue

        confidence = float(frame_scores[top_idx])
        if confidence < CONFIDENCE_THRESHOLD:
            continue

        label = class_names[top_idx]
        start = round(i * HOP_SIZE, 3)
        end = round(start + HOP_SIZE, 3)

        previous = events[-1] if events else None
        if (
            previous is not None
            and previous["label"] == label
            and start <= previous["end"]
        ):
            previous["end"] = end
            previous["confidence"] = round(
                max(previous["confidence"], confidence), 4
            )
        else:
            events.append({
                "label": label,
                "cc_text": _cc_text(label),
                "confidence": round(confidence, 4),
                "start": start,
                "end": end,
            })

    return events


def detect_events(video_path):
    """Detect non-speech sound events in the audio track of a video.

    Extracts the audio with ffmpeg, runs the model loaded from
    ``YAMNET_URL`` on it and merges the per-frame results into events.

    Args:
        video_path: Path of the input video.

    Returns:
        List of event dicts with keys ``label``, ``cc_text``,
        ``confidence``, ``start`` and ``end`` (seconds).

    Raises:
        FileNotFoundError: The ``ffmpeg`` executable could not be found.
        subprocess.CalledProcessError: ffmpeg failed to extract the audio.
    """
    wav_path = _extract_audio(video_path)

    try:
        model = hub.load(YAMNET_URL)
        class_names = _load_class_names(model)

        _, wav = _read_wav(wav_path)
        scores, _, _ = model(wav)

        return _events_from_scores(scores.numpy(), class_names)
    finally:
        # The temporary WAV is only needed for inference.
        if os.path.exists(wav_path):
            os.unlink(wav_path)


def _main(argv):
    """Print the detected events for the video named in ``argv[1]``.

    Returns:
        Process exit status: 1 when the path argument is missing or
        ffmpeg fails, else 0.
    """
    if len(argv) < 2:
        print("Usage: python sound_event_detector.py <video_path>")
        return 1

    try:
        results = detect_events(argv[1])
    except subprocess.CalledProcessError as err:
        detail = (err.stderr or b"").decode("utf-8", "replace").strip()
        print(f"ffmpeg could not extract audio: {detail}", file=sys.stderr)
        return 1

    if not results:
        print("No non-speech events detected above threshold.")
        return 0

    for e in results:
        print(
            f"{e['start']:.2f}s – {e['end']:.2f}s "
            f"{e['cc_text']} "
            f"(conf: {e['confidence']:.2f}) [{e['label']}]"
        )

    return 0


if __name__ == "__main__":
    sys.exit(_main(sys.argv))