-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPlaintext Transcript.py
87 lines (70 loc) · 3.19 KB
/
Plaintext Transcript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import re
import json
from pathlib import Path
def clean_vtt_text(text):
    """Convert the contents of a WebVTT subtitle file into plain transcript text.

    The function removes, in order: a leading byte-order mark, the ``WEBVTT``
    header block, cue timing lines (with any cue settings that follow them on
    the same line), and angle-bracket markup tags. Remaining lines are
    stripped, blank lines are dropped, and consecutive duplicate lines are
    collapsed into one.

    Args:
        text: Full contents of a ``.vtt`` file as a string.

    Returns:
        The remaining caption lines joined with newline characters, or an
        empty string when nothing is left.
    """
    # A UTF-8 BOM survives decoding with encoding='utf-8' and would hide the
    # WEBVTT signature from the startswith() check below.
    if text.startswith('\ufeff'):
        text = text[1:]

    # Normalize CRLF / lone CR so the blank-line header delimiter is found
    # even when the caller did not read the file with universal newlines.
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Remove VTT header: everything up to and including the first blank line.
    if text.startswith('WEBVTT'):
        _, blank_line, body = text.partition('\n\n')
        if blank_line:
            text = body
        else:
            # No blank line at all (e.g. a header-only file): drop just the
            # signature line rather than leaking "WEBVTT" into the transcript.
            text = text.partition('\n')[2]

    # Remove timestamps and positioning. The hours field is optional in
    # WebVTT ("mm:ss.ttt"), and the last timing line may lack a newline.
    timestamp = r'(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}'
    text = re.sub(
        timestamp + r'[ \t]+-->[ \t]+' + timestamp + r'[^\n]*(?:\n|\Z)',
        '',
        text,
    )

    # Remove HTML-style tags (e.g. <c>, <b>, inline <00:00:01.000> markers)
    text = re.sub(r'<[^>]+>', '', text)

    # Drop blank lines and collapse consecutive duplicates, which often occur
    # in VTT files where each cue repeats the previous cue's text.
    cleaned_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if line and (not cleaned_lines or line != cleaned_lines[-1]):
            cleaned_lines.append(line)

    # Return the cleaned lines joined with line breaks
    return '\n'.join(cleaned_lines)
def _load_transcript_metadata(json_file):
    """Read the title and description from a ``*.info.json`` metadata file.

    Args:
        json_file: Path of the metadata file, or None when none is available.

    Returns:
        A ``(title, description)`` tuple. Falls back to ``"Unknown Title"``
        and an empty description when there is no file or it cannot be read.
    """
    title = "Unknown Title"
    description = ""
    if json_file is None:
        return title, description
    try:
        print(f"Found JSON file: {json_file}")
        with json_file.open(encoding='utf-8') as f:
            metadata = json.load(f)
        # `or` covers keys that are present but null/empty, which would
        # otherwise be written to the transcript as the text "None".
        title = metadata.get('title') or 'Unknown Title'
        description = metadata.get('description', 'Unknown Description') or ''
    except Exception as e:
        # Best effort: unreadable metadata must not stop the conversion.
        print(f"Error reading JSON file {json_file}: {e}")
    return title, description


def convert_vtt_directory(directory_path):
    """Convert every ``*.en.vtt`` file under a directory into a ``.txt`` transcript.

    The given directory and all of its subdirectories are scanned. For each
    ``*.en.vtt`` file a text file with the same name but a ``.txt`` suffix is
    written next to it, containing a title, a description section and the
    transcript produced by ``clean_vtt_text``. Title and description come
    from the ``*.info.json`` file that shares the VTT file's base name; when
    there is no such file, the first ``*.info.json`` found in the same
    directory is used, and built-in placeholders when there is none.

    Progress and errors are reported with ``print``; a failure on one file
    does not stop the remaining files from being processed.

    Args:
        directory_path: Path (string or path-like) of the directory to scan.

    Returns:
        None.
    """
    dir_path = Path(directory_path)
    # is_dir() also rejects a path that exists but is a regular file, which
    # would otherwise make the scan below silently do nothing.
    if not dir_path.is_dir():
        print(f"Directory does not exist: {directory_path}")
        return
    print(f"Processing directory: {dir_path}")

    # rglob('*') yields only descendants, so the root is added explicitly;
    # otherwise VTT files placed directly in `directory_path` are skipped.
    # The list is built up front (and sorted for a stable processing order)
    # before any output files are written into the tree.
    directories = [dir_path]
    directories.extend(sorted(p for p in dir_path.rglob('*') if p.is_dir()))

    vtt_suffix = '.en.vtt'
    for subdir in directories:
        print(f"Processing subdirectory: {subdir}")

        # Directory-level metadata, used when a VTT file has no JSON of its own
        fallback_json = next(subdir.glob('*.info.json'), None)
        if fallback_json is None:
            print(f"No JSON file found in: {subdir}")

        # Process all .vtt files in the current subdirectory
        vtt_files = sorted(subdir.glob('*' + vtt_suffix))
        if not vtt_files:
            print(f"No .vtt files found in {subdir}")

        for vtt_file in vtt_files:
            # Broad catch is deliberate: this is the per-file boundary of a
            # batch job, and one bad file should only be reported, not abort.
            try:
                print(f"Processing VTT file: {vtt_file}")

                # Prefer the metadata file belonging to this very video so a
                # directory holding several videos gets the right titles.
                base_name = vtt_file.name[:-len(vtt_suffix)]
                sibling_json = vtt_file.with_name(base_name + '.info.json')
                json_file = sibling_json if sibling_json.is_file() else fallback_json
                title, description = _load_transcript_metadata(json_file)

                with open(vtt_file, 'r', encoding='utf-8') as f:
                    vtt_content = f.read()
                cleaned_text = clean_vtt_text(vtt_content)

                txt_file = vtt_file.with_suffix('.txt')
                with open(txt_file, 'w', encoding='utf-8') as f:
                    f.write(f"Title: {title}\n\n")
                    f.write(f"== Description ==\n{description}\n\n")
                    f.write(f"== Transcript ==\n{cleaned_text}\n")
                print(f"Converted {vtt_file.relative_to(dir_path)} -> {txt_file.relative_to(dir_path)}")
            except Exception as e:
                print(f"Error processing {vtt_file.relative_to(dir_path)}: {e}")
if __name__ == "__main__":
    # Script entry point: convert everything under the default transcripts folder.
    convert_vtt_directory("./Transcripts")