-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAI Summary.py
149 lines (116 loc) · 4.76 KB
/
AI Summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import logging
import os
from pathlib import Path
from typing import Optional

import requests
# Configure root logging once at import time; INFO keeps per-file progress visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Native text-generation endpoint of a locally running Ollama server.
LLAMA_ENDPOINT = "http://localhost:11434/api/generate"
# Model tag as known to the Ollama server.
MODEL_NAME = "llama3.1:8b"
# System prompt describing the summarization task and known channel-specific
# terms that machine transcription tends to mangle.
SYSTEM_PROMPT = """
You are an expert that specializes in summarizing Louis Rossmann YouTube videos.
You receive the Title, Description, and Full Transcript of a Louis Rossmann video. Louis Rossmann is a repair technician and consumer advocate.
Be as thorough and detailed as possible and make sure to include all the main points and key details from the video.
While writing the summary, keep in mind that the machine generated transcripts are dirty and will contain errors. Take great effort to make sure the summary makes sense, is readable, and is coherent.
Avoid mentioning affiliate links or any promotional content.
Common Nonstandard Terms: EULA Roofie, Louis Rossmann, FUTO, Immich, Grayjay
"""
# Required output shape: MediaWiki markup with a fixed top-level heading that
# process_transcript() checks for (and main() writes verbatim to .sum.txt).
FORMAT_PROMPT = """
Your response MUST follow this EXACT format and MUST be written in MediaWiki markup:
== AI Summary ==
[2-3 sentences describing the overall video content]
=== [Topic] ===
[Details about the first main point]
=== [Topic] ===
[Details about the second main point]
[Add more main points as needed]
"""
def get_date_from_path(file_path: Path) -> tuple:
    """Extract (year, month, day) from a transcript file path.

    Expects paths shaped like ``Transcripts/<year>/<month>/<day>/<file>``.
    Returns the sentinel ``(1900, 1, 1)`` when the path does not contain a
    parseable date, so callers (sorting and logging in ``main``) never crash
    on a malformed path.  (The original raised IndexError/ValueError here,
    so the sentinel ``main`` checks for could never actually be produced.)
    """
    # Normalize the big-solidus character (yt-dlp's stand-in for '/' in
    # titles) and Windows separators to plain '/' before splitting.
    path_str = str(file_path).replace('⧸', '/').replace('\\', '/')
    # Split path into components
    parts = path_str.split('/')
    try:
        year = int(parts[1])
        month = int(parts[2])
        day = int(parts[3])
    except (IndexError, ValueError):
        # Sentinel recognized by main() as "No valid date found".
        return (1900, 1, 1)
    return (year, month, day)
def create_prompt(transcript_text):
    """Assemble the full prompt sent to the model.

    Stacks the system instructions, the raw transcript, and the required
    output-format description into one labeled text block.
    """
    sections = (
        "System Instructions:",
        SYSTEM_PROMPT,
        "Transcript:",
        transcript_text,
        "Required Format:",
        FORMAT_PROMPT,
    )
    return "\n".join(sections)
def process_transcript(transcript_file: Path) -> Optional[str]:
    """Summarize a single transcript file via the local Ollama endpoint.

    Reads the transcript, posts it to ``LLAMA_ENDPOINT``, and returns the
    MediaWiki-formatted summary text.  Returns ``None`` on any read,
    network, or JSON-decoding failure (errors are logged, not raised).
    """
    try:
        with transcript_file.open("r", encoding="utf-8") as f:
            transcript_text = f.read()
    except (OSError, UnicodeDecodeError) as e:
        logger.error(f"Failed to read transcript file {transcript_file}: {e}")
        return None
    payload = {
        "prompt": create_prompt(transcript_text),
        "model": MODEL_NAME,
        "stream": False,
        "options": {
            "seed": 0,  # fixed seed + low temperature for repeatable summaries
            "temperature": 0.2,
            "num_ctx": 32764,
            "num_batch": 256,
            "num_keep": 256,
            # Ollama's native /api/generate endpoint caps generated tokens via
            # "num_predict"; "max_tokens" is an OpenAI-style option that this
            # endpoint ignores, so the original cap never took effect.
            "num_predict": 32764
        }
    }
    try:
        resp = requests.post(
            LLAMA_ENDPOINT,
            json=payload,
            timeout=300
        )
        resp.raise_for_status()
        data = resp.json()
        summary_text = data.get("response", "").strip()
        # The model occasionally omits the required heading; prepend it so the
        # written .sum.txt always starts with the expected MediaWiki section.
        if not summary_text.startswith("== AI Summary =="):
            logger.warning(f"Response for {transcript_file} doesn't match expected format")
            summary_text = "== AI Summary ==\n" + summary_text
        return summary_text
    except requests.exceptions.RequestException as e:
        logger.error(f"Request to LLaMA failed for {transcript_file}: {e}")
        return None
    except ValueError as ve:
        # resp.json() raises ValueError (JSONDecodeError) on malformed bodies.
        logger.error(f"Failed to parse JSON response for {transcript_file}: {ve}")
        return None
def main():
    """Summarize every ``./Transcripts/**/*.en.txt`` transcript.

    Files are processed newest-first (by the year/month/day directory
    components of their paths); files that already have a matching
    ``.sum.txt`` summary are skipped, so the script is safe to re-run.
    """
    transcripts_root = Path("./Transcripts")
    transcript_files = list(transcripts_root.rglob("*.en.txt"))
    if not transcript_files:
        # Original message said "Downloads directory", but the path scanned
        # above is ./Transcripts -- keep the message truthful.
        logger.info("No .en.txt files found in the Transcripts directory.")
        return
    # Sort files by directory date in reverse chronological order
    transcript_files.sort(key=get_date_from_path, reverse=True)
    # Announce the full processing order up front so a long run is auditable.
    logger.info("\nFiles will be processed in this order:")
    for file in transcript_files:
        date = get_date_from_path(file)
        if date != (1900, 1, 1):  # (1900, 1, 1) is the "no date" sentinel
            logger.info(f"- {file.name} ({date[0]}/{date[1]:02d}/{date[2]:02d})")
        else:
            logger.warning(f"- {file.name} (No valid date found)")
    for transcript_file in transcript_files:
        date = get_date_from_path(transcript_file)
        logger.info(f"\nProcessing: {transcript_file.name} ({date[0]}/{date[1]:02d}/{date[2]:02d})")
        # with_suffix replaces only the final ".txt", so "X.en.txt" becomes
        # "X.en.sum.txt" -- which rglob("*.en.txt") won't pick up next run.
        summary_file = transcript_file.with_suffix(".sum.txt")
        if summary_file.exists():
            logger.info(f"Summary already exists: {summary_file}. Skipping.")
            continue
        summary_text = process_transcript(transcript_file)
        if not summary_text:
            # process_transcript already logged the failure; move on.
            continue
        try:
            with summary_file.open("w", encoding="utf-8") as sf:
                sf.write(summary_text)
            logger.info(f"Summary created: {summary_file}")
        except OSError as e:
            logger.error(f"Failed to write summary for {transcript_file}: {e}")
if __name__ == "__main__":
    main()