From ee1514fe626e93355ff2154966dbe89f961d747e Mon Sep 17 00:00:00 2001
From: Alok Kohli <alok_k@ph.iitr.ac.in>
Date: Sat, 9 May 2026 21:54:53 +0530
Subject: [PATCH 1/2] feat: Add Sound Event Detection module using YAMNet

---
 README.md     | 10 ++++++++
 sed_module.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 README.md
 create mode 100644 sed_module.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c24e051
--- /dev/null
+++ b/README.md
@@ -0,0 +1,10 @@
+# Module 1: Sound Event Detection (SED) Prototype
+
+## Approach
+This module uses a pre-trained **YAMNet** model via TensorFlow Hub to process 16kHz mono audio extracted from video files. I chose YAMNet because its training on the YouTube-8M dataset makes it highly robust for detecting environmental and non-speech events (like honking, laughter, or glass breaking). 
+
+The script processes audio in 0.48-second windows. To prevent "over-captioning," I implemented a confidence threshold filter that intentionally drops routine ambient classifications like 'Silence' and 'White noise'.
+
+## Known Limitations & Next Steps
+- **Current Limitation:** The script currently identifies every 0.48s window independently, which can result in repetitive logs for continuous sounds (e.g., a 3-second siren logs 6 separate events).
+- **Next Improvement (Temporal Smoothing):** Implement an algorithm to merge consecutive identical sound events into a single, continuous timestamp block for cleaner SRT generation.
diff --git a/sed_module.py b/sed_module.py
new file mode 100644
index 0000000..6b63411
--- /dev/null
+++ b/sed_module.py
@@ -0,0 +1,63 @@
+import os
+
+# Silence TensorFlow's native logging for a cleaner demo recording. This must
+# run BEFORE `import tensorflow`; setting it after the import has no effect.
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+import numpy as np
+import pandas as pd
+import librosa
+import tensorflow as tf
+import tensorflow_hub as hub
+from moviepy.editor import VideoFileClip
+
+class SoundEventDetector:
+    """Detect non-speech sound events in a video's audio track with YAMNet."""
+    def __init__(self):
+        print("\n--- [START] Loading YAMNet AI Model ---")
+        self.model = hub.load('https://tfhub.dev/google/yamnet/1')
+        class_map_path = self.model.class_map_path().numpy().decode('utf-8')
+        self.class_names = pd.read_csv(class_map_path)['display_name'].values
+        print("--- [SUCCESS] Model Ready ---\n")
+
+    def run_inference(self, video_path):
+        """Return [{"Time", "Event", "Confidence"}, ...] for each kept frame of video_path ([] if it has no audio)."""
+        audio_temp = "temp_audio.wav"
+        print(f"Processing: {video_path}...")
+        video = VideoFileClip(video_path)
+        try:
+            if video.audio is None:  # silent video: nothing to classify
+                return []
+            video.audio.write_audiofile(audio_temp, fps=16000, nbytes=2, codec='pcm_s16le', verbose=False, logger=None)
+            wav_data, _ = librosa.load(audio_temp, sr=16000)
+        finally:
+            # Always release the ffmpeg readers and the temp WAV, even when extraction fails
+            video.close()
+            if os.path.exists(audio_temp): os.remove(audio_temp)
+        scores, _, _ = self.model(wav_data)
+        results = []
+        for i, frame_scores in enumerate(scores.numpy()):
+            idx = np.argmax(frame_scores)
+            conf, label = frame_scores[idx], self.class_names[idx]
+            # Keep only confident top-1 labels that are not speech or ambient noise
+            if conf >= 0.15 and label not in ('Speech', 'Silence', 'White noise'):
+                results.append({
+                    "Time": f"{round(i * 0.48, 2)}s",
+                    "Event": label,
+                    "Confidence": f"{round(float(conf) * 100, 1)}%"
+                })
+        return results
+
+if __name__ == "__main__":
+    input_video = "sample_video.mp4"
+    if not os.path.exists(input_video):
+        print(f"Error: {input_video} not found in folder!")
+    else:
+        # Load the model, classify the clip, then print one table row per event
+        output = SoundEventDetector().run_inference(input_video)
+        rule = "=" * 45
+        print("\n" + rule)
+        print(f"{'TIMESTAMP':<12} | {'DETECTED SOUND':<20} | {'CONF'}")
+        print("-" * 45)
+        for row in output:
+            print(f"{row['Time']:<12} | {row['Event']:<20} | {row['Confidence']}")
+        print(rule)
+        print(f"\nFound {len(output)} contextually relevant events.\n")
\ No newline at end of file

From 5e57dde21701a8260d6f4f4f4643ed26aec79f4d Mon Sep 17 00:00:00 2001
From: Alok Kohli <alok_k@ph.iitr.ac.in>
Date: Sat, 9 May 2026 22:09:08 +0530
Subject: [PATCH 2/2] docs: update README details

---
 README.md | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index c24e051..9a1c9da 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,22 @@
+
 # Module 1: Sound Event Detection (SED) Prototype
 
 ## Approach
-This module uses a pre-trained **YAMNet** model via TensorFlow Hub to process 16kHz mono audio extracted from video files. I chose YAMNet because its training on the YouTube-8M dataset makes it highly robust for detecting environmental and non-speech events (like honking, laughter, or glass breaking). 
+This module uses a pre-trained **YAMNet** model via TensorFlow Hub to process 16kHz mono audio extracted from video files. YAMNet's training on the AudioSet corpus (521 audio event classes drawn from YouTube videos) makes it well suited to detecting environmental and non-speech events (like honking, laughter, or glass breaking).
 
 The script processes audio in 0.48-second windows. To prevent "over-captioning," I implemented a confidence threshold filter that intentionally drops routine ambient classifications like 'Silence' and 'White noise'.
 
+## Setup & Installation
+1. Ensure Python 3.12 is installed (recommended for Apple Silicon/M-series compatibility).
+2. Create a virtual environment and install dependencies:
+   `pip install tensorflow==2.16.1 tensorflow-hub==0.16.1 librosa "moviepy<2" pandas` (moviepy 2.x removed the `moviepy.editor` import the script uses)
+
+## How to Run
+1. Place a test video named `sample_video.mp4` in the same directory as the script.
+2. Execute the module:
+   `python sed_module.py`
+
 ## Known Limitations & Next Steps
-- **Current Limitation:** The script currently identifies every 0.48s window independently, which can result in repetitive logs for continuous sounds (e.g., a 3-second siren logs 6 separate events).
-- **Next Improvement (Temporal Smoothing):** Implement an algorithm to merge consecutive identical sound events into a single, continuous timestamp block for cleaner SRT generation.
+- **Current Limitation:** The script evaluates every 0.48s window independently, resulting in repetitive logs for continuous sounds (e.g., a 3-second siren logs 6 separate events).
+- **Next Improvement (Temporal Smoothing):** Implement an algorithm to merge consecutive identical sound events into a single, continuous timestamp block for cleaner SRT file generation.
+
\ No newline at end of file