From 8d31249af6cab2503e7496a3250f2dd90072d738 Mon Sep 17 00:00:00 2001 From: Dev Mishra Date: Tue, 5 May 2026 00:14:21 +0530 Subject: [PATCH 1/2] feat: project scaffolding, README, config, and audio extraction utility - Add project structure with src/, config/, tests/ packages - Add comprehensive README with architecture diagram, setup, and usage - Add requirements.txt with all pipeline dependencies - Add setup.py with console script entry point - Add .gitignore for Python, ML models, and media files - Add AudioExtractor class using FFmpeg for video-to-audio conversion - Add centralized config/settings.py with defaults for all modules - Add 15 unit tests for AudioExtractor (all passing) --- .gitignore | 70 +++++++++ README.md | 157 +++++++++++++++++++ config/__init__.py | 1 + config/settings.py | 115 ++++++++++++++ requirements.txt | 20 +++ setup.py | 41 +++++ src/__init__.py | 8 + src/utils/__init__.py | 1 + src/utils/audio_extractor.py | 282 ++++++++++++++++++++++++++++++++++ tests/__init__.py | 1 + tests/fixtures/.gitkeep | 2 + tests/test_audio_extractor.py | 218 ++++++++++++++++++++++++++ 12 files changed, 916 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 config/__init__.py create mode 100644 config/settings.py create mode 100644 requirements.txt create mode 100644 setup.py create mode 100644 src/__init__.py create mode 100644 src/utils/__init__.py create mode 100644 src/utils/audio_extractor.py create mode 100644 tests/__init__.py create mode 100644 tests/fixtures/.gitkeep create mode 100644 tests/test_audio_extractor.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7557712 --- /dev/null +++ b/.gitignore @@ -0,0 +1,70 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg 
+*.egg + +# Virtual environments +venv/ +env/ +.env/ +.venv/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# OS +.DS_Store +Thumbs.db + +# Model cache +models/cache/ +*.h5 +*.tflite + +# Media files (test inputs/outputs) +*.mp4 +*.avi +*.mkv +*.mov +*.wav +*.mp3 +*.srt +!tests/fixtures/*.srt + +# Logs +*.log +logs/ + +# Jupyter +.ipynb_checkpoints/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..0f1be3c --- /dev/null +++ b/README.md @@ -0,0 +1,157 @@ +# 🎬 Intelligent Closed Caption (CC) Suggestion Tool + +An AI-powered tool that intelligently identifies moments in a video where a Closed Caption (CC) annotation is genuinely necessary β€” such as when a non-speech audio event meaningfully affects the speakers or the scene β€” and suggests contextually relevant CC text, without over-captioning routine or low-impact sounds. + +## πŸ—οΈ Architecture + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Video File │───▢│ Audio Extractor │───▢│ Sound Event Detector β”‚ +β”‚ (input) β”‚ β”‚ (ffmpeg/moviepy) β”‚ β”‚ (YAMNet) β”‚ +β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + β”‚ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ + └───────────▢│ Frame Extractor β”‚ β”‚ + β”‚ (OpenCV) β”‚ β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ + β”‚ β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ + β”‚ Reaction Detectorβ”‚ β”‚ + β”‚ (MediaPipe) β”‚ β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ + β”‚ β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ” + β”‚ CC Decision 
Engine β”‚ + β”‚ Combines audio + visual signals β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ SRT Generator β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ output.srt β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +## ✨ Features + +- **Sound Event Detection** β€” Automatically detects and classifies non-speech audio events (honking, explosions, laughter, music, alarms, applause, etc.) with confidence scores and timestamps using YAMNet. +- **Speaker Reaction Detection** β€” Analyzes video frames at detected event timestamps using MediaPipe to identify visible reactions (head turns, startled body language, facial expressions). +- **Intelligent CC Decisions** β€” Combines audio and visual signals to determine whether a CC annotation is truly warranted, avoiding over-captioning of ambient sounds. +- **SRT Output** β€” Generates standard SRT subtitle files with properly formatted timestamps and descriptive CC labels like `[honking]`, `[crowd cheering]`, `[gunshot]`. + +## πŸ“‹ Prerequisites + +- **Python 3.9+** +- **FFmpeg** β€” Must be installed and available on your system PATH + - Windows: `choco install ffmpeg` or download from [ffmpeg.org](https://ffmpeg.org/download.html) + - macOS: `brew install ffmpeg` + - Linux: `sudo apt install ffmpeg` + +## πŸš€ Installation + +1. **Clone the repository** + ```bash + git clone https://github.com/PlanetRead/Intelligent-cc-generation.git + cd Intelligent-cc-generation + ``` + +2. **Create a virtual environment** + ```bash + python -m venv venv + source venv/bin/activate # Linux/macOS + venv\Scripts\activate # Windows + ``` + +3. **Install dependencies** + ```bash + pip install -r requirements.txt + ``` + +4. 
**Install in development mode** (optional) + ```bash + pip install -e . + ``` + +## 🎯 Usage + +### Extract audio from a video file +```python +from src.utils.audio_extractor import AudioExtractor + +extractor = AudioExtractor() +audio_path = extractor.extract("input_video.mp4") +print(f"Audio saved to: {audio_path}") +``` + +### Full pipeline (coming soon) +```bash +python -m src.cli --input video.mp4 --output captions.srt +``` + +## πŸ§ͺ Running Tests + +```bash +pytest tests/ -v +``` + +## πŸ“ Project Structure + +``` +Intelligent-cc-generation/ +β”œβ”€β”€ src/ +β”‚ β”œβ”€β”€ __init__.py +β”‚ β”œβ”€β”€ cli.py # CLI entry point +β”‚ β”œβ”€β”€ utils/ +β”‚ β”‚ β”œβ”€β”€ __init__.py +β”‚ β”‚ └── audio_extractor.py # Video β†’ Audio extraction +β”‚ β”œβ”€β”€ detectors/ +β”‚ β”‚ β”œβ”€β”€ __init__.py +β”‚ β”‚ β”œβ”€β”€ sound_event_detector.py # YAMNet-based audio analysis +β”‚ β”‚ └── reaction_detector.py # MediaPipe-based visual analysis +β”‚ β”œβ”€β”€ models/ +β”‚ β”‚ β”œβ”€β”€ __init__.py +β”‚ β”‚ β”œβ”€β”€ event.py # SoundEvent dataclass +β”‚ β”‚ β”œβ”€β”€ reaction.py # ReactionEvent dataclass +β”‚ β”‚ └── cc_suggestion.py # CCSuggestion dataclass +β”‚ β”œβ”€β”€ engine/ +β”‚ β”‚ β”œβ”€β”€ __init__.py +β”‚ β”‚ └── decision_engine.py # CC decision combiner +β”‚ └── output/ +β”‚ β”œβ”€β”€ __init__.py +β”‚ └── srt_generator.py # SRT file writer +β”œβ”€β”€ config/ +β”‚ └── settings.py # Configuration defaults +β”œβ”€β”€ tests/ +β”‚ β”œβ”€β”€ __init__.py +β”‚ β”œβ”€β”€ test_audio_extractor.py +β”‚ └── fixtures/ +β”œβ”€β”€ requirements.txt +β”œβ”€β”€ setup.py +β”œβ”€β”€ .gitignore +└── README.md +``` + +## πŸ› οΈ Tech Stack + +| Component | Technology | +|-----------|-----------| +| Language | Python 3.9+ | +| Audio Event Detection | [YAMNet](https://tfhub.dev/google/yamnet/1) (TensorFlow Hub) | +| Frame Extraction | [OpenCV](https://opencv.org/) | +| Pose & Expression Analysis | [MediaPipe](https://mediapipe.dev/) | +| Audio Extraction | [FFmpeg](https://ffmpeg.org/) 
via moviepy | +| Output Format | SRT (SubRip Subtitle) | + +## 🀝 Contributing + +1. Fork the repository +2. Create a feature branch (`git checkout -b feat/your-feature`) +3. Commit your changes (`git commit -m 'Add your feature'`) +4. Push to the branch (`git push origin feat/your-feature`) +5. Open a Pull Request + +## πŸ“„ License + +This project is part of the [Planet Read](https://www.planetread.org/) initiative under the DMP 2026 program. diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..56096f2 --- /dev/null +++ b/config/__init__.py @@ -0,0 +1 @@ +"""Configuration package.""" diff --git a/config/settings.py b/config/settings.py new file mode 100644 index 0000000..744bba5 --- /dev/null +++ b/config/settings.py @@ -0,0 +1,115 @@ +"""Configuration settings for the Intelligent CC Suggestion Tool.""" + +import os + + +# ============================================================================= +# Audio Extraction Settings +# ============================================================================= + +# Default audio sample rate for extracted audio (Hz) +AUDIO_SAMPLE_RATE = 16000 + +# Default audio format for extracted files +AUDIO_FORMAT = "wav" + +# Default output directory for extracted audio files +AUDIO_OUTPUT_DIR = os.path.join(os.getcwd(), "output", "audio") + + +# ============================================================================= +# Sound Event Detection Settings +# ============================================================================= + +# Minimum confidence threshold for a sound event to be considered +SOUND_CONFIDENCE_THRESHOLD = 0.3 + +# Analysis window size in seconds for the sound event detector +ANALYSIS_WINDOW_SIZE = 0.96 # YAMNet default patch size + +# Hop length between analysis windows in seconds +ANALYSIS_HOP_LENGTH = 0.48 + +# Non-speech event categories to detect (YAMNet class names) +# Full list: 
https://github.com/tensorflow/models/blob/master/research/audioset/yamnet/yamnet_class_map.csv +TARGET_SOUND_EVENTS = [ + "Gunshot, gunfire", + "Explosion", + "Glass", + "Breaking", + "Siren", + "Car alarm", + "Vehicle horn, car horn, honking", + "Screaming", + "Crying, sobbing", + "Laughter", + "Applause", + "Cheering", + "Crowd", + "Dog", + "Thunder", + "Alarm", + "Bell", + "Door", + "Knock", + "Telephone", + "Music", + "Singing", + "Drum", + "Fire", + "Water", + "Rain", + "Wind", +] + + +# ============================================================================= +# Reaction Detection Settings +# ============================================================================= + +# Number of frames to extract around each event timestamp +REACTION_FRAME_COUNT = 10 + +# Time window (seconds) before and after event to look for reactions +REACTION_TIME_WINDOW = 1.5 + +# Minimum confidence for a reaction to be considered significant +REACTION_CONFIDENCE_THRESHOLD = 0.4 + +# Head turn angle threshold (degrees) to consider as a reaction +HEAD_TURN_THRESHOLD = 15.0 + +# Pose change threshold (normalized) for startled body language +POSE_CHANGE_THRESHOLD = 0.1 + + +# ============================================================================= +# CC Decision Engine Settings +# ============================================================================= + +# Weight for audio event confidence in the final decision +AUDIO_WEIGHT = 0.6 + +# Weight for visual reaction confidence in the final decision +VISUAL_WEIGHT = 0.4 + +# Combined confidence threshold for generating a CC annotation +CC_DECISION_THRESHOLD = 0.5 + +# Minimum duration (seconds) between consecutive CC annotations +# to avoid overwhelming the viewer +MIN_CC_GAP = 2.0 + + +# ============================================================================= +# Output Settings +# ============================================================================= + +# Default output format +OUTPUT_FORMAT = "srt" + +# Default 
output directory for generated subtitle files +OUTPUT_DIR = os.path.join(os.getcwd(), "output") + +# Default CC display duration (seconds) if not determined by event duration +DEFAULT_CC_DURATION = 2.0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ce18ead --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +# Core dependencies +moviepy>=1.0.3 +numpy>=1.24.0 + +# Audio/Video processing +librosa>=0.10.0 +soundfile>=0.12.0 +opencv-python>=4.8.0 + +# ML Models +tensorflow>=2.13.0 +tensorflow-hub>=0.14.0 +mediapipe>=0.10.0 + +# Testing +pytest>=7.4.0 +pytest-cov>=4.1.0 + +# Utilities +pydub>=0.25.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..300f427 --- /dev/null +++ b/setup.py @@ -0,0 +1,41 @@ +"""Setup configuration for the Intelligent CC Suggestion Tool.""" + +from setuptools import setup, find_packages + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + +with open("requirements.txt", "r", encoding="utf-8") as fh: + requirements = [ + line.strip() + for line in fh + if line.strip() and not line.startswith("#") + ] + +setup( + name="intelligent-cc-generation", + version="0.1.0", + author="Planet Read Contributors", + description="AI-powered tool for intelligent closed caption suggestions", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/PlanetRead/Intelligent-cc-generation", + packages=find_packages(), + python_requires=">=3.9", + install_requires=requirements, + entry_points={ + "console_scripts": [ + "cc-suggest=src.cli:main", + ], + }, + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Topic :: Multimedia :: Video", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + 
], +) diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..7fca335 --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,8 @@ +"""Intelligent Closed Caption (CC) Suggestion Tool. + +An AI-powered tool that intelligently identifies moments in a video +where a Closed Caption annotation is genuinely necessary and suggests +contextually relevant CC text. +""" + +__version__ = "0.1.0" diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..f0ec284 --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1 @@ +"""Utility modules for audio/video processing.""" diff --git a/src/utils/audio_extractor.py b/src/utils/audio_extractor.py new file mode 100644 index 0000000..51e8c75 --- /dev/null +++ b/src/utils/audio_extractor.py @@ -0,0 +1,282 @@ +"""Audio extraction utility for extracting audio tracks from video files. + +This module provides the AudioExtractor class that handles extracting +audio from various video formats and saving it as WAV files suitable +for downstream audio analysis. +""" + +import os +import logging +import subprocess +import shutil +from pathlib import Path +from typing import Optional + +from config.settings import AUDIO_SAMPLE_RATE, AUDIO_FORMAT, AUDIO_OUTPUT_DIR + +logger = logging.getLogger(__name__) + + +class AudioExtractionError(Exception): + """Raised when audio extraction from a video file fails.""" + + pass + + +class AudioExtractor: + """Extracts audio tracks from video files using FFmpeg. + + This extractor converts video files to mono WAV audio at a configurable + sample rate, suitable for input to audio analysis models like YAMNet. + + Attributes: + sample_rate: Target audio sample rate in Hz. + output_dir: Directory where extracted audio files will be saved. + audio_format: Output audio format (default: wav). 
+ + Example: + >>> extractor = AudioExtractor(sample_rate=16000) + >>> audio_path = extractor.extract("input_video.mp4") + >>> print(f"Audio saved to: {audio_path}") + """ + + SUPPORTED_VIDEO_FORMATS = { + ".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".m4v", + } + + def __init__( + self, + sample_rate: int = AUDIO_SAMPLE_RATE, + output_dir: str = AUDIO_OUTPUT_DIR, + audio_format: str = AUDIO_FORMAT, + ): + """Initialize the AudioExtractor. + + Args: + sample_rate: Target audio sample rate in Hz. Defaults to 16000. + output_dir: Directory to save extracted audio files. + audio_format: Output audio file format. Defaults to 'wav'. + + Raises: + RuntimeError: If FFmpeg is not found on the system PATH. + """ + self.sample_rate = sample_rate + self.output_dir = output_dir + self.audio_format = audio_format + + # Verify FFmpeg is available + self._ffmpeg_path = self._find_ffmpeg() + if self._ffmpeg_path is None: + raise RuntimeError( + "FFmpeg not found. Please install FFmpeg and ensure it is " + "on your system PATH. Visit https://ffmpeg.org/download.html" + ) + + # Create output directory if it doesn't exist + os.makedirs(self.output_dir, exist_ok=True) + logger.info( + "AudioExtractor initialized (sample_rate=%d, format=%s, output=%s)", + self.sample_rate, + self.audio_format, + self.output_dir, + ) + + @staticmethod + def _find_ffmpeg() -> Optional[str]: + """Locate the FFmpeg executable on the system. + + Returns: + Path to the FFmpeg executable, or None if not found. + """ + ffmpeg_path = shutil.which("ffmpeg") + if ffmpeg_path: + logger.debug("Found FFmpeg at: %s", ffmpeg_path) + return ffmpeg_path + + def _validate_input(self, video_path: str) -> Path: + """Validate the input video file. + + Args: + video_path: Path to the video file. + + Returns: + Resolved Path object for the video file. + + Raises: + FileNotFoundError: If the video file does not exist. + ValueError: If the file format is not supported. 
+ """ + path = Path(video_path).resolve() + + if not path.exists(): + raise FileNotFoundError(f"Video file not found: {path}") + + if not path.is_file(): + raise ValueError(f"Path is not a file: {path}") + + suffix = path.suffix.lower() + if suffix not in self.SUPPORTED_VIDEO_FORMATS: + raise ValueError( + f"Unsupported video format: '{suffix}'. " + f"Supported formats: {', '.join(sorted(self.SUPPORTED_VIDEO_FORMATS))}" + ) + + return path + + def _build_output_path( + self, video_path: Path, output_path: Optional[str] = None + ) -> Path: + """Build the output path for the extracted audio file. + + Args: + video_path: Path to the source video file. + output_path: Optional custom output path. If None, generates + one in the output directory. + + Returns: + Path for the output audio file. + """ + if output_path: + out = Path(output_path).resolve() + os.makedirs(out.parent, exist_ok=True) + return out + + filename = f"{video_path.stem}.{self.audio_format}" + return Path(self.output_dir) / filename + + def extract( + self, + video_path: str, + output_path: Optional[str] = None, + overwrite: bool = False, + ) -> str: + """Extract audio from a video file. + + Extracts the audio track from the given video file and saves it + as a mono WAV file at the configured sample rate. + + Args: + video_path: Path to the input video file. + output_path: Optional custom output file path. If not provided, + the audio will be saved in the configured output directory + with the same stem name as the video. + overwrite: If True, overwrite existing output file. + Defaults to False. + + Returns: + Absolute path to the extracted audio file. + + Raises: + FileNotFoundError: If the video file does not exist. + ValueError: If the video format is not supported. + AudioExtractionError: If FFmpeg fails to extract audio. + FileExistsError: If output file exists and overwrite is False. 
+ """ + video = self._validate_input(video_path) + output = self._build_output_path(video, output_path) + + if output.exists() and not overwrite: + raise FileExistsError( + f"Output file already exists: {output}. " + "Use overwrite=True to replace it." + ) + + logger.info("Extracting audio from: %s", video) + logger.info("Output: %s", output) + + cmd = [ + self._ffmpeg_path, + "-i", str(video), # Input file + "-vn", # Disable video + "-acodec", "pcm_s16le", # 16-bit PCM encoding + "-ar", str(self.sample_rate), # Sample rate + "-ac", "1", # Mono channel + "-y" if overwrite else "-n", # Overwrite flag + str(output), + ] + + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, # 5 minute timeout + ) + + if result.returncode != 0: + error_msg = result.stderr.strip().split("\n")[-1] + raise AudioExtractionError( + f"FFmpeg failed with return code {result.returncode}: " + f"{error_msg}" + ) + + except subprocess.TimeoutExpired: + raise AudioExtractionError( + f"Audio extraction timed out after 300 seconds for: {video}" + ) + except FileNotFoundError: + raise AudioExtractionError( + "FFmpeg executable not found. It may have been removed " + "after initialization." + ) + + if not output.exists(): + raise AudioExtractionError( + f"Audio extraction completed but output file not found: {output}" + ) + + file_size = output.stat().st_size + logger.info( + "Audio extraction successful: %s (%.2f MB)", + output, + file_size / (1024 * 1024), + ) + + return str(output) + + def get_audio_info(self, audio_path: str) -> dict: + """Get information about an audio file using FFprobe. + + Args: + audio_path: Path to the audio file. + + Returns: + Dictionary with audio information including duration, + sample_rate, channels, and codec. 
+ """ + path = Path(audio_path).resolve() + if not path.exists(): + raise FileNotFoundError(f"Audio file not found: {path}") + + ffprobe_path = shutil.which("ffprobe") + if ffprobe_path is None: + raise RuntimeError("FFprobe not found on system PATH.") + + cmd = [ + ffprobe_path, + "-v", "quiet", + "-print_format", "json", + "-show_streams", + "-select_streams", "a:0", + str(path), + ] + + try: + import json + + result = subprocess.run( + cmd, capture_output=True, text=True, timeout=30 + ) + data = json.loads(result.stdout) + stream = data.get("streams", [{}])[0] + + return { + "duration": float(stream.get("duration", 0)), + "sample_rate": int(stream.get("sample_rate", 0)), + "channels": int(stream.get("channels", 0)), + "codec": stream.get("codec_name", "unknown"), + "bit_rate": int(stream.get("bit_rate", 0)), + } + except (subprocess.TimeoutExpired, json.JSONDecodeError, IndexError) as e: + logger.error("Failed to get audio info: %s", e) + return {} diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..46816dd --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Tests package.""" diff --git a/tests/fixtures/.gitkeep b/tests/fixtures/.gitkeep new file mode 100644 index 0000000..ac08cc0 --- /dev/null +++ b/tests/fixtures/.gitkeep @@ -0,0 +1,2 @@ +# Test fixtures directory +# Place sample SRT files and other test data here. 
diff --git a/tests/test_audio_extractor.py b/tests/test_audio_extractor.py new file mode 100644 index 0000000..8d12634 --- /dev/null +++ b/tests/test_audio_extractor.py @@ -0,0 +1,218 @@ +"""Unit tests for the AudioExtractor utility.""" + +import os +import pytest +from unittest.mock import patch, MagicMock +from pathlib import Path + +from src.utils.audio_extractor import AudioExtractor, AudioExtractionError + + +class TestAudioExtractorInit: + """Tests for AudioExtractor initialization.""" + + @patch("src.utils.audio_extractor.shutil.which") + def test_init_with_ffmpeg_available(self, mock_which, tmp_path): + """Should initialize successfully when FFmpeg is found.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + assert extractor.sample_rate == 16000 + assert extractor.audio_format == "wav" + + @patch("src.utils.audio_extractor.shutil.which") + def test_init_without_ffmpeg(self, mock_which): + """Should raise RuntimeError when FFmpeg is not found.""" + mock_which.return_value = None + with pytest.raises(RuntimeError, match="FFmpeg not found"): + AudioExtractor() + + @patch("src.utils.audio_extractor.shutil.which") + def test_init_custom_sample_rate(self, mock_which, tmp_path): + """Should accept custom sample rate.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(sample_rate=44100, output_dir=str(tmp_path)) + assert extractor.sample_rate == 44100 + + @patch("src.utils.audio_extractor.shutil.which") + def test_init_creates_output_directory(self, mock_which, tmp_path): + """Should create the output directory if it doesn't exist.""" + mock_which.return_value = "/usr/bin/ffmpeg" + output_dir = str(tmp_path / "new_dir" / "audio") + AudioExtractor(output_dir=output_dir) + assert os.path.isdir(output_dir) + + +class TestAudioExtractorValidation: + """Tests for input validation.""" + + @patch("src.utils.audio_extractor.shutil.which") + def test_validate_nonexistent_file(self, mock_which, 
tmp_path): + """Should raise FileNotFoundError for missing files.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + with pytest.raises(FileNotFoundError, match="Video file not found"): + extractor.extract("nonexistent_video.mp4") + + @patch("src.utils.audio_extractor.shutil.which") + def test_validate_unsupported_format(self, mock_which, tmp_path): + """Should raise ValueError for unsupported video formats.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + + # Create a dummy file with unsupported extension + dummy_file = tmp_path / "test.xyz" + dummy_file.touch() + + with pytest.raises(ValueError, match="Unsupported video format"): + extractor.extract(str(dummy_file)) + + @patch("src.utils.audio_extractor.shutil.which") + def test_validate_directory_instead_of_file(self, mock_which, tmp_path): + """Should raise ValueError when path points to a directory.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + + dir_path = tmp_path / "somedir.mp4" + dir_path.mkdir() + + with pytest.raises(ValueError, match="Path is not a file"): + extractor.extract(str(dir_path)) + + @patch("src.utils.audio_extractor.shutil.which") + def test_validate_supported_formats(self, mock_which, tmp_path): + """Should accept all supported video formats.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + + for ext in AudioExtractor.SUPPORTED_VIDEO_FORMATS: + path = tmp_path / f"test{ext}" + path.touch() + # Validation should pass (extraction will fail but that's OK) + validated = extractor._validate_input(str(path)) + assert validated.exists() + + +class TestAudioExtractorExtract: + """Tests for the extract method.""" + + @patch("src.utils.audio_extractor.shutil.which") + def test_file_exists_error_without_overwrite(self, mock_which, tmp_path): + """Should raise FileExistsError 
when output exists and overwrite=False.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + + # Create dummy input and output files + input_file = tmp_path / "test.mp4" + input_file.touch() + output_file = tmp_path / "test.wav" + output_file.touch() + + with pytest.raises(FileExistsError, match="Output file already exists"): + extractor.extract(str(input_file)) + + @patch("src.utils.audio_extractor.subprocess.run") + @patch("src.utils.audio_extractor.shutil.which") + def test_successful_extraction(self, mock_which, mock_run, tmp_path): + """Should successfully extract audio when FFmpeg succeeds.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + + input_file = tmp_path / "test.mp4" + input_file.touch() + + output_file = tmp_path / "test.wav" + + # Use side_effect to create the output file when subprocess.run is called + # (simulating FFmpeg creating the file during execution) + def fake_ffmpeg(*args, **kwargs): + output_file.write_bytes(b"\x00" * 1024) + return MagicMock(returncode=0, stderr="") + + mock_run.side_effect = fake_ffmpeg + + result = extractor.extract(str(input_file)) + assert result == str(output_file) + assert mock_run.called + + @patch("src.utils.audio_extractor.subprocess.run") + @patch("src.utils.audio_extractor.shutil.which") + def test_ffmpeg_failure(self, mock_which, mock_run, tmp_path): + """Should raise AudioExtractionError when FFmpeg fails.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + + input_file = tmp_path / "test.mp4" + input_file.touch() + + mock_run.return_value = MagicMock( + returncode=1, stderr="Error: Invalid data found" + ) + + with pytest.raises(AudioExtractionError, match="FFmpeg failed"): + extractor.extract(str(input_file)) + + @patch("src.utils.audio_extractor.subprocess.run") + @patch("src.utils.audio_extractor.shutil.which") + def 
test_extraction_timeout(self, mock_which, mock_run, tmp_path): + """Should raise AudioExtractionError on timeout.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + + input_file = tmp_path / "test.mp4" + input_file.touch() + + import subprocess + mock_run.side_effect = subprocess.TimeoutExpired(cmd="ffmpeg", timeout=300) + + with pytest.raises(AudioExtractionError, match="timed out"): + extractor.extract(str(input_file)) + + @patch("src.utils.audio_extractor.subprocess.run") + @patch("src.utils.audio_extractor.shutil.which") + def test_custom_output_path(self, mock_which, mock_run, tmp_path): + """Should save to custom output path when specified.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + + input_file = tmp_path / "test.mp4" + input_file.touch() + + custom_output = tmp_path / "custom" / "output.wav" + + # Use side_effect to create output file during FFmpeg execution + def fake_ffmpeg(*args, **kwargs): + custom_output.parent.mkdir(parents=True, exist_ok=True) + custom_output.write_bytes(b"\x00" * 512) + return MagicMock(returncode=0, stderr="") + + mock_run.side_effect = fake_ffmpeg + + result = extractor.extract(str(input_file), output_path=str(custom_output)) + assert result == str(custom_output) + + +class TestBuildOutputPath: + """Tests for output path generation.""" + + @patch("src.utils.audio_extractor.shutil.which") + def test_default_output_path(self, mock_which, tmp_path): + """Should generate output path in the configured output directory.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + + video_path = Path("/some/path/my_video.mp4") + result = extractor._build_output_path(video_path) + + assert result == Path(tmp_path) / "my_video.wav" + + @patch("src.utils.audio_extractor.shutil.which") + def test_custom_output_path(self, mock_which, tmp_path): + """Should use custom output path 
when provided.""" + mock_which.return_value = "/usr/bin/ffmpeg" + extractor = AudioExtractor(output_dir=str(tmp_path)) + + video_path = Path("/some/path/my_video.mp4") + custom_path = str(tmp_path / "custom_name.wav") + result = extractor._build_output_path(video_path, custom_path) + + assert result == Path(custom_path).resolve() From 22f4f0edc3b9ebd80d89a4eaa78a5fea0a52b7a9 Mon Sep 17 00:00:00 2001 From: Dev Mishra <118660840+mishradev1@users.noreply.github.com> Date: Tue, 5 May 2026 00:28:46 +0530 Subject: [PATCH 2/2] Update README.md --- README.md | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 0f1be3c..825894c 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ -# 🎬 Intelligent Closed Caption (CC) Suggestion Tool +# Intelligent Closed Caption (CC) Suggestion Tool An AI-powered tool that intelligently identifies moments in a video where a Closed Caption (CC) annotation is genuinely necessary β€” such as when a non-speech audio event meaningfully affects the speakers or the scene β€” and suggests contextually relevant CC text, without over-captioning routine or low-impact sounds. -## πŸ—οΈ Architecture +## Architecture ``` β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” @@ -34,14 +34,14 @@ An AI-powered tool that intelligently identifies moments in a video where a Clos β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` -## ✨ Features +## Features - **Sound Event Detection** β€” Automatically detects and classifies non-speech audio events (honking, explosions, laughter, music, alarms, applause, etc.) with confidence scores and timestamps using YAMNet. - **Speaker Reaction Detection** β€” Analyzes video frames at detected event timestamps using MediaPipe to identify visible reactions (head turns, startled body language, facial expressions). 
- **Intelligent CC Decisions** — Combines audio and visual signals to determine whether a CC annotation is truly warranted, avoiding over-captioning of ambient sounds. - **SRT Output** — Generates standard SRT subtitle files with properly formatted timestamps and descriptive CC labels like `[honking]`, `[crowd cheering]`, `[gunshot]`. -## 📋 Prerequisites +## Prerequisites - **Python 3.9+** - **FFmpeg** — Must be installed and available on your system PATH @@ -49,7 +49,7 @@ An AI-powered tool that intelligently identifies moments in a video where a Clos - macOS: `brew install ffmpeg` - Linux: `sudo apt install ffmpeg` -## 🚀 Installation +## Installation 1. **Clone the repository** ```bash @@ -90,13 +90,13 @@ print(f"Audio saved to: {audio_path}") python -m src.cli --input video.mp4 --output captions.srt ``` -## 🧪 Running Tests +## Running Tests ```bash pytest tests/ -v ``` -## 📁 Project Structure +## Project Structure ``` Intelligent-cc-generation/ @@ -133,7 +133,7 @@ Intelligent-cc-generation/ └── README.md ``` -## 🛠️ Tech Stack +## Tech Stack | Component | Technology | |-----------|-----------| @@ -144,14 +144,7 @@ Intelligent-cc-generation/ | Audio Extraction | [FFmpeg](https://ffmpeg.org/) via moviepy | | Output Format | SRT (SubRip Subtitle) | -## 🤝 Contributing -1. Fork the repository -2. Create a feature branch (`git checkout -b feat/your-feature`) -3. Commit your changes (`git commit -m 'Add your feature'`) -4. Push to the branch (`git push origin feat/your-feature`) -5. Open a Pull Request - -## 📄 License +## License This project is part of the [Planet Read](https://www.planetread.org/) initiative under the DMP 2026 program.