# Source: ASR/mcp_server.py (AfeiFun/ASR repository on GitHub, main branch)
#!/usr/bin/env python3
"""
ASR MCP服务器 V2 - 优化版本
基于Model Context Protocol (MCP)的视频转文字服务
支持从URL下载视频并进行语音识别转录
"""

import os
import json
import sys
from pathlib import Path
from typing import Dict, Any

from fastmcp import FastMCP

# 导入本地模块
from video_downloader import VideoDownloader
from asr_transcriber import ASRTranscriber
from video_processor import extract_audio_from_video, get_supported_video_formats

# Create the FastMCP application; the @mcp.tool() functions below register on it.
mcp = FastMCP("ASR Transcriber")

# Module-level service instances; they stay None until initialize_services()
# assigns them.
video_downloader = None
asr_transcriber = None

def initialize_services():
    """Create the module-level downloader and transcriber instances.

    Progress is reported on stderr. If constructing either service raises,
    the error is reported on stderr and the exception is re-raised.
    """
    global video_downloader, asr_transcriber

    def _report(message):
        # All status output of this function goes to stderr.
        print(message, file=sys.stderr)

    try:
        # Video downloader first, then the transcriber with its default settings.
        video_downloader = VideoDownloader()
        _report("✅ 视频下载器初始化成功")

        asr_transcriber = ASRTranscriber()
        _report("✅ ASR转录器初始化成功")
    except Exception as exc:
        _report(f"❌ 服务初始化失败: {exc!s}")
        raise

@mcp.tool()
def transcribe_from_url(
    url: str,
    output_format: str = "text",
    language: str = "auto"
) -> str:
    """
    Download a video from a URL and transcribe its audio.

    The video and an audio-only download are stored in a new timestamped
    directory under ``./transcriptions`` (relative to the current working
    directory), the transcript is written beside them, and a human-readable
    summary is returned.

    Args:
        url: Video URL; it must pass ``video_downloader.is_supported_url``.
        output_format: Transcript format: ``text``, ``srt``, ``vtt`` or ``json``.
        language: Language code handed to the transcriber
            (``auto``/``zh``/``en``/``ja``/``ko`` ...).

    Returns:
        On success, a summary with the video details, the saved file paths
        and a preview of at most 500 characters of the transcript. On
        failure, a message starting with ``❌`` or ``⚠️``; exceptions derived
        from ``Exception`` are reported this way instead of being raised.
    """
    # Transcript file extension for every supported output format.
    transcript_extensions = {
        "text": ".txt",
        "srt": ".srt",
        "vtt": ".vtt",
        "json": ".json",
    }

    try:
        # Validate the format first: otherwise an unknown value is only
        # noticed after the download and transcription have already run, and
        # the "unsupported format" message gets saved as if it were a transcript.
        if output_format not in transcript_extensions:
            return f"❌ 不支持的输出格式: {output_format} (支持: text/srt/vtt/json)"

        # Validate the URL.
        if not video_downloader.is_supported_url(url):
            return f"❌ 不支持的URL或URL无效: {url}"

        # Fetch the video metadata.
        try:
            video_info = video_downloader.get_video_info(url)
            # Normalise to whole seconds so a missing (None) or fractional
            # duration cannot break the minutes/seconds breakdown below.
            # NOTE(review): whether the downloader can return such values is
            # not visible from this file.
            total_seconds = int(video_info['duration'] or 0)
            print(f"📹 视频信息: {video_info['title']} (时长: {total_seconds}秒)", file=sys.stderr)
        except Exception as e:
            return f"❌ 获取视频信息失败: {str(e)}"

        minutes, seconds = divmod(total_seconds, 60)

        # Create a persistent, timestamped output directory.
        import datetime
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_title = video_downloader._sanitize_filename(video_info['title'])[:50]  # cap the length
        output_dir = os.path.join(os.getcwd(), "transcriptions", f"{timestamp}_{safe_title}")
        os.makedirs(output_dir, exist_ok=True)

        video_file_path = None
        audio_file_path = None
        transcript_file_path = None

        try:
            # 1. Download the video file.
            print("🔽 开始下载视频...", file=sys.stderr)
            video_file_path = video_downloader.download_video(url, output_dir)

            # 2. Download the audio track that is actually transcribed.
            print("🔽 开始下载音频...", file=sys.stderr)
            audio_file_path = video_downloader.download_audio_only(url, output_dir)

            # 3. Create a fresh transcriber with VAD enabled.
            transcriber = ASRTranscriber(enable_vad=True)

            # 4. Transcribe the audio.
            print("🎤 开始转录...", file=sys.stderr)
            result = transcriber.transcribe_audio(
                audio_path=audio_file_path,
                language=language,
                max_length=5,  # default segment length (5 seconds per the original note)
                batch_size=600  # default batch size
            )

            # 5. Render the transcript in the requested format.
            output_content = format_transcription_output(result, output_format, video_info)

            # 6. Save the transcript next to the downloaded media.
            transcript_file_path = os.path.join(
                output_dir, f"{safe_title}{transcript_extensions[output_format]}"
            )
            with open(transcript_file_path, 'w', encoding='utf-8') as f:
                f.write(output_content)

            # 7. Build the summary; the preview is capped at 500 characters.
            preview = output_content[:500] + ("..." if len(output_content) > 500 else "")
            result_info = f"""✅ 转录完成！

📹 **视频信息**:
• 标题: {video_info['title']}
• 时长: {total_seconds}秒 ({minutes}分{seconds}秒)
• 格式: {output_format}

📁 **文件保存位置**:
• 📹 视频文件: {video_file_path}
• 🎵 音频文件: {audio_file_path}
• 📝 转录文件: {transcript_file_path}
• 📂 输出目录: {output_dir}

🔍 **转录预览**:
{preview}"""

            return result_info

        except Exception as inner_e:
            # Even on failure, report whichever files were already saved.
            error_info = f"⚠️ 转录过程中出现错误: {str(inner_e)}\n\n"
            if video_file_path and os.path.exists(video_file_path):
                error_info += f"📹 已保存视频文件: {video_file_path}\n"
            if audio_file_path and os.path.exists(audio_file_path):
                error_info += f"🎵 已保存音频文件: {audio_file_path}\n"
            if transcript_file_path and os.path.exists(transcript_file_path):
                error_info += f"📝 已保存转录文件: {transcript_file_path}\n"
            return error_info

    except Exception as e:
        return f"❌ 转录失败: {str(e)}"

@mcp.tool()
def transcribe_local_file(
    file_path: str,
    output_format: str = "text",
    language: str = "auto"
) -> str:
    """
    Transcribe a local video or audio file.

    The source file is copied into a new timestamped directory under
    ``./transcriptions`` (relative to the current working directory); for
    video input the audio is extracted there as well. The transcript is
    written to the same directory and a human-readable summary is returned.

    Args:
        file_path: Path of the local file.
        output_format: Transcript format: ``text``, ``srt``, ``vtt`` or ``json``.
        language: Language code handed to the transcriber
            (``auto``/``zh``/``en``/``ja``/``ko`` ...).

    Returns:
        On success, a summary with the file details, the saved file paths
        and a preview of at most 500 characters of the transcript. On
        failure, a message starting with ``❌`` or ``⚠️``; exceptions derived
        from ``Exception`` are reported this way instead of being raised.
    """
    # Transcript file extension for every supported output format.
    transcript_extensions = {
        "text": ".txt",
        "srt": ".srt",
        "vtt": ".vtt",
        "json": ".json",
    }

    try:
        # Validate the format first so an unknown value is rejected before
        # any copying or transcription happens.
        if output_format not in transcript_extensions:
            return f"❌ 不支持的输出格式: {output_format} (支持: text/srt/vtt/json)"

        file_path = Path(file_path)

        if not file_path.exists():
            return f"❌ 文件不存在: {file_path}"

        # Classify the file before touching the disk, so that an unsupported
        # file does not leave an empty output directory behind.
        file_ext = file_path.suffix.lower()
        audio_exts = ['.wav', '.mp3', '.flac', '.m4a', '.aac']
        is_audio = file_ext in audio_exts
        if not is_audio and file_ext not in get_supported_video_formats():
            return f"❌ 不支持的文件格式: {file_ext}"

        # Create a persistent, timestamped output directory.
        import datetime
        import shutil
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_title = file_path.stem[:50]  # cap the length
        output_dir = os.path.join(os.getcwd(), "transcriptions", f"{timestamp}_{safe_title}_local")
        os.makedirs(output_dir, exist_ok=True)

        video_file_path = None
        audio_file_path = None
        transcript_file_path = None

        try:
            if is_audio:
                # Audio input: copy it into the output directory and use the copy.
                audio_file_path = os.path.join(output_dir, f"{safe_title}_audio{file_ext}")
                shutil.copy2(str(file_path), audio_file_path)
                audio_file = audio_file_path
            else:
                # Video input: keep a copy of the original video ...
                video_file_path = os.path.join(output_dir, f"{safe_title}_video{file_ext}")
                shutil.copy2(str(file_path), video_file_path)

                # ... and extract its audio track for transcription.
                print("🎬 从视频提取音频...", file=sys.stderr)
                audio_file_path = os.path.join(output_dir, f"{safe_title}_extracted_audio.wav")
                audio_file = extract_audio_from_video(str(file_path), audio_file_path)

            # Create a fresh transcriber with VAD enabled.
            transcriber = ASRTranscriber(enable_vad=True)

            # Transcribe the audio.
            print("🎤 开始转录...", file=sys.stderr)
            result = transcriber.transcribe_audio(
                audio_path=audio_file,
                language=language,
                max_length=5,  # default segment length (5 seconds per the original note)
                batch_size=600  # default batch size
            )

            # Render the transcript; the duration is not known for local files.
            file_info = {"title": file_path.stem, "duration": 0}
            output_content = format_transcription_output(result, output_format, file_info)

            # Save the transcript next to the copied media.
            transcript_file_path = os.path.join(
                output_dir, f"{safe_title}{transcript_extensions[output_format]}"
            )
            with open(transcript_file_path, 'w', encoding='utf-8') as f:
                f.write(output_content)

            # Build the summary; the preview is capped at 500 characters and
            # the copied video (if any) is listed with the other saved files.
            kind_label = 'Audio' if is_audio else 'Video'
            video_line = f"• 📹 视频文件: {video_file_path}\n" if video_file_path else ""
            preview = output_content[:500] + ("..." if len(output_content) > 500 else "")
            result_info = f"""✅ 转录完成！

📁 **文件信息**:
• 原文件: {file_path}
• 文件类型: {file_ext} ({kind_label})
• 格式: {output_format}

📁 **文件保存位置**:
{video_line}• 🎵 音频文件: {audio_file_path}
• 📝 转录文件: {transcript_file_path}
• 📂 输出目录: {output_dir}

🔍 **转录预览**:
{preview}"""

            return result_info

        except Exception as inner_e:
            # Even on failure, report whichever files were already saved.
            error_info = f"⚠️ 转录过程中出现错误: {str(inner_e)}\n\n"
            if video_file_path and os.path.exists(video_file_path):
                error_info += f"📹 已保存视频文件: {video_file_path}\n"
            if audio_file_path and os.path.exists(audio_file_path):
                error_info += f"🎵 已保存音频文件: {audio_file_path}\n"
            if transcript_file_path and os.path.exists(transcript_file_path):
                error_info += f"📝 已保存转录文件: {transcript_file_path}\n"
            return error_info

    except Exception as e:
        return f"❌ 转录失败: {str(e)}"

@mcp.tool()
def get_video_info(url: str) -> str:
    """
    Fetch metadata for a video URL and render it as readable text.

    Args:
        url: Video URL; it must pass ``video_downloader.is_supported_url``.

    Returns:
        A Markdown-style summary (title, duration, uploader, upload date,
        view count, number of available formats, page link and description).
        Note that the result is formatted text, not JSON. On failure a
        message starting with ``❌`` is returned instead of raising.
    """
    try:
        if not video_downloader.is_supported_url(url):
            return f"❌ 不支持的URL或URL无效: {url}"

        info = video_downloader.get_video_info(url)

        # Normalise to whole seconds so a missing (None) or fractional
        # duration cannot break the minutes/seconds breakdown.
        # NOTE(review): whether the downloader can return such values is not
        # visible from this file.
        total_seconds = int(info['duration'] or 0)
        minutes, seconds = divmod(total_seconds, 60)

        # The thousands-separator format spec raises on None, so a missing
        # view count is shown as "unknown" instead of failing the whole call.
        view_count = info['view_count']
        views_text = f"{view_count:,}" if view_count is not None else "未知"

        # Human-readable rendering of the metadata.
        formatted_info = f"""
📹 **视频信息**

**标题**: {info['title']}
**时长**: {total_seconds}秒 ({minutes}分{seconds}秒)
**上传者**: {info['uploader']}
**上传日期**: {info['upload_date']}
**观看次数**: {views_text} 次
**可用格式数**: {info['formats_available']} 个
**网页链接**: {info['webpage_url']}

**描述**: {info['description']}
"""

        return formatted_info

    except Exception as e:
        return f"❌ 获取视频信息失败: {str(e)}"

@mcp.tool()
def list_supported_languages() -> str:
    """
    List the language codes accepted by the transcription tools.

    Returns:
        A Markdown-formatted bullet list of language codes with their
        display names, followed by a usage hint.
    """
    # (code, display name) pairs, in the order they are listed.
    language_names = (
        ("auto", "自动检测"),
        ("zh", "中文"),
        ("en", "英文"),
        ("ja", "日语"),
        ("ko", "韩语"),
        ("es", "西班牙语"),
        ("fr", "法语"),
        ("de", "德语"),
        ("ru", "俄语"),
    )

    bullets = "".join(f"• `{code}`: {label}\n" for code, label in language_names)

    return (
        "🌍 **支持的语言**:\n\n"
        + bullets
        + "\n**使用方法**: 在转录时设置 `language` 参数，例如 `language='zh'` 表示中文"
    )

@mcp.tool()
def list_supported_platforms() -> str:
    """
    List the video platforms the downloader reports as supported.

    Returns:
        A Markdown-formatted bullet list with at most the first 15 sites,
        the total number of sites and a usage hint; or a message starting
        with ``❌`` if the list cannot be obtained.
    """
    try:
        all_sites = video_downloader.get_supported_sites()

        # Only the first 15 entries are listed; the total is reported below.
        bullets = "".join(f"• {site_name}\n" for site_name in all_sites[:15])

        return (
            "🌐 **支持的主要视频平台**:\n\n"
            + bullets
            + f"\n还支持更多平台... (共支持 {len(all_sites)}+ 个平台)"
            + "\n\n**使用方法**: 直接提供视频URL即可，系统会自动识别平台"
        )
    except Exception as exc:
        return f"❌ 获取支持平台列表失败: {exc!s}"

@mcp.tool()
def get_output_formats() -> str:
    """
    Describe the transcript output formats that can be requested.

    Returns:
        A Markdown-formatted bullet list of format names with a short
        description each, followed by a usage hint.
    """
    # (format name, description) pairs, in the order they are listed.
    format_descriptions = (
        ("text", "纯文本格式，只包含转录的文字内容"),
        ("srt", "SRT字幕格式，包含时间戳和文本，适合视频字幕"),
        ("vtt", "WebVTT格式，Web标准字幕格式"),
        ("json", "JSON结构化格式，包含详细的时间戳和元数据"),
    )

    bullets = "".join(
        f"• **{format_name}**: {summary}\n" for format_name, summary in format_descriptions
    )

    return (
        "📄 **支持的输出格式**:\n\n"
        + bullets
        + "\n**使用方法**: 在转录时设置 `output_format` 参数，例如 `output_format='srt'`"
    )

def format_transcription_output(result: Dict[str, Any], output_format: str, video_info: Dict[str, Any]) -> str:
    """
    Render a transcription result in the requested output format.

    Args:
        result: Transcription result; the ``text`` key is used for plain
            text and the ``segments`` key (items with ``start``, ``end`` and
            ``text``) for SRT/VTT.
        output_format: One of ``text``, ``json``, ``srt`` or ``vtt``.
        video_info: Video metadata; only embedded in the JSON output.

    Returns:
        The formatted transcript. For an unknown format, or when formatting
        raises, a descriptive message string is returned instead.
    """
    try:
        if output_format == "text":
            return result.get("text", "")

        if output_format == "json":
            payload = {
                "video_info": video_info,
                "transcription": result
            }
            return json.dumps(payload, ensure_ascii=False, indent=2)

        if output_format == "srt":
            # SRT cues are numbered from 1 and separated by a blank line.
            cues = [
                f"{number}\n{format_srt_time(seg['start'])} --> {format_srt_time(seg['end'])}\n{seg['text'].strip()}\n\n"
                for number, seg in enumerate(result.get("segments", []), 1)
            ]
            return "".join(cues)

        if output_format == "vtt":
            # WebVTT starts with a header line; cues carry no index.
            cues = [
                f"{format_vtt_time(seg['start'])} --> {format_vtt_time(seg['end'])}\n{seg['text'].strip()}\n\n"
                for seg in result.get("segments", [])
            ]
            return "WEBVTT\n\n" + "".join(cues)

        return f"不支持的输出格式: {output_format}"

    except Exception as exc:
        return f"格式化输出失败: {exc!s}"

def format_srt_time(seconds: float) -> str:
    """Format an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero.

    Returns:
        The timestamp with a comma before the milliseconds.
    """
    # Round to whole milliseconds BEFORE splitting into fields. Rounding the
    # seconds field on its own turns e.g. 59.9996 into "60,000" without
    # carrying into the minutes.
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    whole_seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{millis:03d}"

def format_vtt_time(seconds: float) -> str:
    """Format an offset in seconds as a WebVTT timestamp (``HH:MM:SS.mmm``).

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero.

    Returns:
        The timestamp with a dot before the milliseconds.
    """
    # Round to whole milliseconds BEFORE splitting into fields. Rounding the
    # seconds field on its own turns e.g. 59.9996 into "60.000" without
    # carrying into the minutes.
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    whole_seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d}.{millis:03d}"

def main():
    """Initialise the services and run the MCP server.

    Exits with status 0 on a keyboard interrupt and with status 1 when
    start-up or the server raises any other exception; both cases are
    reported on stderr.
    """
    exit_status = None

    try:
        # Set up the services, then hand control to the MCP server.
        initialize_services()
        mcp.run()
    except KeyboardInterrupt:
        print("\n👋 服务器已停止", file=sys.stderr)
        exit_status = 0
    except Exception as exc:
        print(f"❌ 服务器运行失败: {exc!s}", file=sys.stderr)
        exit_status = 1

    if exit_status is not None:
        sys.exit(exit_status)

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    main()