From ee1514fe626e93355ff2154966dbe89f961d747e Mon Sep 17 00:00:00 2001 From: Alok Kohli Date: Sat, 9 May 2026 21:54:53 +0530 Subject: [PATCH 1/2] feat: Add Sound Event Detection module using YAMNet --- README.md | 10 ++++++++ sed_module.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 README.md create mode 100644 sed_module.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..c24e051 --- /dev/null +++ b/README.md @@ -0,0 +1,10 @@ +# Module 1: Sound Event Detection (SED) Prototype + +## Approach +This module uses a pre-trained **YAMNet** model via TensorFlow Hub to process 16kHz mono audio extracted from video files. I chose YAMNet because its training on the YouTube-8M dataset makes it highly robust for detecting environmental and non-speech events (like honking, laughter, or glass breaking). + +The script processes audio in 0.48-second windows. To prevent "over-captioning," I implemented a confidence threshold filter that intentionally drops routine ambient classifications like 'Silence' and 'White noise'. + +## Known Limitations & Next Steps +- **Current Limitation:** The script currently identifies every 0.48s window independently, which can result in repetitive logs for continuous sounds (e.g., a 3-second siren logs 6 separate events). +- **Next Improvement (Temporal Smoothing):** Implement an algorithm to merge consecutive identical sound events into a single, continuous timestamp block for cleaner SRT generation. 
diff --git a/sed_module.py b/sed_module.py new file mode 100644 index 0000000..6b63411 --- /dev/null +++ b/sed_module.py @@ -0,0 +1,63 @@ +import os +import numpy as np +import pandas as pd +import librosa +import tensorflow as tf +import tensorflow_hub as hub +from moviepy.editor import VideoFileClip + +# Suppress TensorFlow warnings for a cleaner demo recording +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + +class SoundEventDetector: + def __init__(self): + print("\n--- [START] Loading YAMNet AI Model ---") + self.model = hub.load('https://tfhub.dev/google/yamnet/1') + class_map_path = self.model.class_map_path().numpy().decode('utf-8') + self.class_names = pd.read_csv(class_map_path)['display_name'].values + print("--- [SUCCESS] Model Ready ---\n") + + def run_inference(self, video_path): + # 1. Audio Extraction + audio_temp = "temp_audio.wav" + print(f"Processing: {video_path}...") + video = VideoFileClip(video_path) + video.audio.write_audiofile(audio_temp, fps=16000, nbytes=2, codec='pcm_s16le', verbose=False, logger=None) + + # 2. 
Analysis + wav_data, _ = librosa.load(audio_temp, sr=16000) + scores, _, _ = self.model(wav_data) + scores_np = scores.numpy() + + results = [] + for i in range(scores_np.shape[0]): + idx = np.argmax(scores_np[i]) + conf = scores_np[i][idx] + label = self.class_names[idx] + + # Filtering for meaningful non-speech events + if conf >= 0.15 and label not in ['Speech', 'Silence', 'White noise']: + results.append({ + "Time": f"{round(i * 0.48, 2)}s", + "Event": label, + "Confidence": f"{round(float(conf) * 100, 1)}%" + }) + + if os.path.exists(audio_temp): os.remove(audio_temp) + return results + +if __name__ == "__main__": + input_video = "sample_video.mp4" + if os.path.exists(input_video): + detector = SoundEventDetector() + output = detector.run_inference(input_video) + + print("\n" + "="*45) + print(f"{'TIMESTAMP':<12} | {'DETECTED SOUND':<20} | {'CONF'}") + print("-" * 45) + for row in output: + print(f"{row['Time']:<12} | {row['Event']:<20} | {row['Confidence']}") + print("="*45) + print(f"\nFound {len(output)} contextually relevant events.\n") + else: + print(f"Error: {input_video} not found in folder!") \ No newline at end of file From 5e57dde21701a8260d6f4f4f4643ed26aec79f4d Mon Sep 17 00:00:00 2001 From: Alok Kohli Date: Sat, 9 May 2026 22:09:08 +0530 Subject: [PATCH 2/2] docs: update README details --- README.md | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c24e051..9a1c9da 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,22 @@ +cat << 'EOF' > README.md # Module 1: Sound Event Detection (SED) Prototype ## Approach -This module uses a pre-trained **YAMNet** model via TensorFlow Hub to process 16kHz mono audio extracted from video files. I chose YAMNet because its training on the YouTube-8M dataset makes it highly robust for detecting environmental and non-speech events (like honking, laughter, or glass breaking). 
+This module uses a pre-trained **YAMNet** model via TensorFlow Hub to process 16kHz mono audio extracted from video files. YAMNet's training on the AudioSet dataset makes it highly robust for detecting environmental and non-speech events (like honking, laughter, or glass breaking). The script processes audio in 0.48-second windows. To prevent "over-captioning," I implemented a confidence threshold filter that intentionally drops routine ambient classifications like 'Silence' and 'White noise'. +## Setup & Installation +1. Ensure Python 3.12 is installed (recommended for Apple Silicon/M-series compatibility). +2. Create a virtual environment and install dependencies: + `pip install tensorflow==2.16.1 tensorflow-hub==0.16.1 librosa moviepy==1.0.3 pandas` + +## How to Run +1. Place a test video named `sample_video.mp4` in the same directory as the script. +2. Execute the module: + `python sed_module.py` + ## Known Limitations & Next Steps -- **Current Limitation:** The script currently identifies every 0.48s window independently, which can result in repetitive logs for continuous sounds (e.g., a 3-second siren logs 6 separate events). -- **Next Improvement (Temporal Smoothing):** Implement an algorithm to merge consecutive identical sound events into a single, continuous timestamp block for cleaner SRT generation. +- **Current Limitation:** The script evaluates every 0.48s window independently, resulting in repetitive logs for continuous sounds (e.g., a 3-second siren logs 6 separate events). +- **Next Improvement (Temporal Smoothing):** Implement an algorithm to merge consecutive identical sound events into a single, continuous timestamp block for cleaner SRT file generation. +EOF \ No newline at end of file