-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtranscripts.py
More file actions
87 lines (73 loc) · 3.51 KB
/
transcripts.py
File metadata and controls
87 lines (73 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
import pathlib
import traceback
import sys
from youtube_transcript_api import YouTubeTranscriptApi
from werkzeug.utils import secure_filename
#*****************************************************************************************************
#This function writes the transcript of a video to a file
#*****************************************************************************************************
def write_transcript_to_file(transcript,video_title,video_id):
    """Write a video transcript to ``transcripts/<title>_<id>.txt``.

    The target directory is ``transcripts`` under the current working
    directory; it is created when missing. The file name is built from
    ``video_title`` and ``video_id`` and sanitized with ``secure_filename``.

    Each transcript item is written as::

        <start>,<start + duration>
        <text>
        <blank line>

    Args:
        transcript: Iterable of dict-like items read via ``.get`` for the
            keys ``'start'``, ``'duration'`` (both default to 0.0) and
            ``'text'``.
            NOTE(review): assumes items support ``.get`` -- confirm against
            the installed youtube_transcript_api version.
        video_title: Title of the video (string), used in the file name.
        video_id: Id of the video (string), used in the file name.

    Returns:
        The absolute path of the written file, or ``''`` if anything failed.
        On failure a partially written file is removed.
    """
    # Bound before the try so the error path can always inspect it, even when
    # the failure happens before the path has been computed.
    file_path = ''
    try:
        name = video_title + '_' + video_id + '.txt'
        filename = secure_filename(name)
        full_path = os.path.join(pathlib.Path().resolve(), 'transcripts')
        # Without this, the first run in a fresh checkout always fails.
        os.makedirs(full_path, exist_ok=True)
        file_path = os.path.join(full_path, filename)
        # Explicit UTF-8: captions routinely contain non-ASCII characters and
        # the platform default encoding may not be able to represent them.
        with open(file_path, 'w', encoding='utf-8') as f:
            for item in transcript:
                start = item.get('start', 0.0)
                duration = item.get('duration', 0.0)
                text = item.get('text')
                f.write("{},{}\n{}\n\n".format(str(start), str(start + duration), text))
    except Exception:
        # str() keeps the handler itself from raising on a non-string title.
        print("Error on writing transcript file for video: " + str(video_title))
        print(sys.exc_info()[0])
        traceback.print_exc()
        # Best-effort cleanup of a partially written file.
        if file_path and os.path.exists(file_path):
            try:
                os.remove(file_path)
            except OSError:
                traceback.print_exc()
        file_path = ''
    return file_path
#*****************************************************************************************************
#This function retrieves the transcript for the video id given as a parameter.
#Note that this function does not use the YouTube Data API to retrieve the transcript, but a
#different package named youtube_transcript_api (YouTubeTranscriptApi).
#Trying to use the YouTube Data API Captions options returns 403 (unauthorized user),
#even with OAuth credentials and the proper scopes.
#*****************************************************************************************************
def get_video_transcript(videoId):
    """Fetch the best available English transcript for a video.

    Selection order:
      1. a manually created English transcript,
      2. any other English transcript (labelled "Generated"),
      3. the first translatable transcript, translated to English.

    Args:
        videoId: Video id passed to ``YouTubeTranscriptApi.list_transcripts``.

    Returns:
        A dict with the keys ``"tr_type"`` ("Manual", "Generated" or
        "Translated from <language code>"), ``"language"`` and ``"data"``
        (the fetched transcript). When no transcript can be obtained, all
        three values are empty strings.
    """
    try:
        # NOTE(review): list_transcripts is the class-level entry point of
        # youtube_transcript_api used by this file -- confirm it is still
        # provided by the pinned library version.
        transcript_list = YouTubeTranscriptApi.list_transcripts(videoId)
        transcripts = list(transcript_list) if transcript_list else []

        english = [t for t in transcripts if _is_english(t.language_code)]

        # Priority to manually added transcripts.
        manual = next((t for t in english if not t.is_generated), None)
        if manual is not None:
            return _transcript_result("Manual", manual.language_code, manual.fetch())

        # Then automatically generated ones: any English transcript left at
        # this point was not manually created.
        if english:
            generated = english[0]
            return _transcript_result("Generated", generated.language_code, generated.fetch())

        # Finally fall back to translating another language to English.
        for candidate in transcripts:
            # Skip transcripts the library reports as not translatable rather
            # than failing on the first one.
            if not getattr(candidate, "is_translatable", True):
                continue
            # Translating the transcript returns another transcript object.
            return _transcript_result(
                "Translated from " + candidate.language_code,
                "English",
                candidate.translate('en').fetch(),
            )
    except Exception:
        # Any library/network failure is reported as "no transcript"; unlike a
        # bare except this lets KeyboardInterrupt/SystemExit propagate.
        print ('No transcript available.')
    return _transcript_result("", "", "")


def _is_english(language_code):
    """Return True when the primary subtag of *language_code* is ``en``."""
    # Compare the primary subtag instead of using a substring test, so codes
    # that merely contain "en" somewhere are not mistaken for English.
    primary = language_code.lower().replace('_', '-').split('-')[0]
    return primary == 'en'


def _transcript_result(tr_type, language, data):
    """Build the result dict returned by get_video_transcript."""
    return {"tr_type": tr_type, "language": language, "data": data}