-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAI Summary.py
149 lines (116 loc) · 4.76 KB
/
AI Summary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import logging
import os
from pathlib import Path
from typing import Optional

import requests
# Configure root logging once at import time; INFO keeps per-file progress visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Native text-generation endpoint of a locally running Ollama server.
LLAMA_ENDPOINT = "http://localhost:11434/api/generate"
# Model tag as known to the Ollama server.
MODEL_NAME = "llama3.1:8b"
# System prompt describing the summarization task and known channel-specific
# terms that machine transcription tends to mangle.
SYSTEM_PROMPT = """
You are an expert that specializes in summarizing Louis Rossmann YouTube videos.
You receive the Title, Description, and Full Transcript of a Louis Rossmann video. Louis Rossmann is a repair technician and consumer advocate.
Be as thorough and detailed as possible and make sure to include all the main points and key details from the video.
While writing the summary, keep in mind that the machine generated transcripts are dirty and will contain errors. Take great effort to make sure the summary makes sense, is readable, and is coherent.
Avoid mentioning affiliate links or any promotional content.
Common Nonstandard Terms: EULA Roofie, Louis Rossmann, FUTO, Immich, Grayjay
"""
# Required output shape: MediaWiki markup with a fixed top-level heading that
# process_transcript() checks for (and main() writes verbatim to .sum.txt).
FORMAT_PROMPT = """
Your response MUST follow this EXACT format and MUST be written in MediaWiki markup:
== AI Summary ==
[2-3 sentences describing the overall video content]
=== [Topic] ===
[Details about the first main point]
=== [Topic] ===
[Details about the second main point]
[Add more main points as needed]
"""
def get_date_from_path(file_path: Path) -> tuple:
    """Extract (year, month, day) from a transcript file path.

    Expects paths shaped like ``Transcripts/<year>/<month>/<day>/<file>``.
    Returns the sentinel ``(1900, 1, 1)`` when the path does not contain a
    parseable date, so callers (sorting and logging in ``main``) never crash
    on a malformed path.  (The original raised IndexError/ValueError here,
    so the sentinel ``main`` checks for could never actually be produced.)
    """
    # Normalize the big-solidus character (yt-dlp's stand-in for '/' in
    # titles) and Windows separators to plain '/' before splitting.
    path_str = str(file_path).replace('⧸', '/').replace('\\', '/')
    # Split path into components
    parts = path_str.split('/')
    try:
        year = int(parts[1])
        month = int(parts[2])
        day = int(parts[3])
    except (IndexError, ValueError):
        # Sentinel recognized by main() as "No valid date found".
        return (1900, 1, 1)
    return (year, month, day)
def create_prompt(transcript_text):
    """Assemble the full prompt sent to the model.

    Stacks the system instructions, the raw transcript, and the required
    output-format description into one labeled text block.
    """
    sections = (
        "System Instructions:",
        SYSTEM_PROMPT,
        "Transcript:",
        transcript_text,
        "Required Format:",
        FORMAT_PROMPT,
    )
    return "\n".join(sections)
def process_transcript(transcript_file: Path) -> Optional[str]:
    """Summarize a single transcript file via the local Ollama endpoint.

    Reads the transcript, posts it to ``LLAMA_ENDPOINT``, and returns the
    MediaWiki-formatted summary text.  Returns ``None`` on any read,
    network, or JSON-decoding failure (errors are logged, not raised).
    """
    try:
        with transcript_file.open("r", encoding="utf-8") as f:
            transcript_text = f.read()
    except (OSError, UnicodeDecodeError) as e:
        logger.error(f"Failed to read transcript file {transcript_file}: {e}")
        return None
    payload = {
        "prompt": create_prompt(transcript_text),
        "model": MODEL_NAME,
        "stream": False,
        "options": {
            "seed": 0,  # fixed seed + low temperature for repeatable summaries
            "temperature": 0.2,
            "num_ctx": 32764,
            "num_batch": 256,
            "num_keep": 256,
            # Ollama's native /api/generate endpoint caps generated tokens via
            # "num_predict"; "max_tokens" is an OpenAI-style option that this
            # endpoint ignores, so the original cap never took effect.
            "num_predict": 32764
        }
    }
    try:
        resp = requests.post(
            LLAMA_ENDPOINT,
            json=payload,
            timeout=300
        )
        resp.raise_for_status()
        data = resp.json()
        summary_text = data.get("response", "").strip()
        # The model occasionally omits the required heading; prepend it so the
        # written .sum.txt always starts with the expected MediaWiki section.
        if not summary_text.startswith("== AI Summary =="):
            logger.warning(f"Response for {transcript_file} doesn't match expected format")
            summary_text = "== AI Summary ==\n" + summary_text
        return summary_text
    except requests.exceptions.RequestException as e:
        logger.error(f"Request to LLaMA failed for {transcript_file}: {e}")
        return None
    except ValueError as ve:
        # resp.json() raises ValueError (JSONDecodeError) on malformed bodies.
        logger.error(f"Failed to parse JSON response for {transcript_file}: {ve}")
        return None
def main():
    """Summarize every ``./Transcripts/**/*.en.txt`` transcript.

    Files are processed newest-first (by the year/month/day directory
    components of their paths); files that already have a matching
    ``.sum.txt`` summary are skipped, so the script is safe to re-run.
    """
    transcripts_root = Path("./Transcripts")
    transcript_files = list(transcripts_root.rglob("*.en.txt"))
    if not transcript_files:
        # Original message said "Downloads directory", but the path scanned
        # above is ./Transcripts -- keep the message truthful.
        logger.info("No .en.txt files found in the Transcripts directory.")
        return
    # Sort files by directory date in reverse chronological order
    transcript_files.sort(key=get_date_from_path, reverse=True)
    # Announce the full processing order up front so a long run is auditable.
    logger.info("\nFiles will be processed in this order:")
    for file in transcript_files:
        date = get_date_from_path(file)
        if date != (1900, 1, 1):  # (1900, 1, 1) is the "no date" sentinel
            logger.info(f"- {file.name} ({date[0]}/{date[1]:02d}/{date[2]:02d})")
        else:
            logger.warning(f"- {file.name} (No valid date found)")
    for transcript_file in transcript_files:
        date = get_date_from_path(transcript_file)
        logger.info(f"\nProcessing: {transcript_file.name} ({date[0]}/{date[1]:02d}/{date[2]:02d})")
        # with_suffix replaces only the final ".txt", so "X.en.txt" becomes
        # "X.en.sum.txt" -- which rglob("*.en.txt") won't pick up next run.
        summary_file = transcript_file.with_suffix(".sum.txt")
        if summary_file.exists():
            logger.info(f"Summary already exists: {summary_file}. Skipping.")
            continue
        summary_text = process_transcript(transcript_file)
        if not summary_text:
            # process_transcript already logged the failure; move on.
            continue
        try:
            with summary_file.open("w", encoding="utf-8") as sf:
                sf.write(summary_text)
            logger.info(f"Summary created: {summary_file}")
        except OSError as e:
            logger.error(f"Failed to write summary for {transcript_file}: {e}")
if __name__ == "__main__":
    main()