# -*- coding: utf-8 -*-
"""10K Summarizer.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1XARaeZuvM_7ZX_03zv4okGJg2O2SAN8q
"""
!pip install langchain
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
"""## LANGCHAIN
https://python.langchain.com/docs/get_started/introduction.html
##
- Import your model from lanchain
`
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")
`
- Most important paper is scientific statement
- Create a prompt template
- Create a textsplitter
ISSUE: DOCUMENTS ARE TOO LONG
SOLUTION: SPLIT THE DOCUMENTS
- give a max token length, and it will make sure it doesn't go over that length
- if you're confused about steps here, look here: /~https://github.com/KellisLab/meeting-summarizer-v2/blob/main/summarize.py
- split the text
- turn it into "Documents"
- Create a prompt
- Put the prompt inside an LLMChain https://python.langchain.com/docs/modules/chains/foundational/llm_chain
- Run the chain over the text
GOOGLE PROMPT ENGINEERING STRATEGIES
TALK TO CHAT GPT AS MUCH AS POSSIBLE
## ONCE YOUR LANGCHAIN WORKS
TALK TO WILL BEFORE YOU RUN THROUGH ALLL THE GRANT PROPOSALS
- Find the grant proposals in dropbox
- Run it on that text, create a dictionary for the lab
"""
import os
# Never hard-code a real API key in a shared notebook; fill in your own key here.
os.environ['OPENAI_API_KEY'] = "YOUR_OPENAI_API_KEY"
!pip install openai
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500, chunk_overlap=100, separators=[" ", ",", ".", "\n"]
)
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')
!pip install python-docx

# List the contents of the MyDrive folder
folder_path = '/content/drive/MyDrive/'
contents = os.listdir(folder_path)
print(contents)
file_path = '/content/drive/MyDrive/tsla-20221231-gen.txt'
with open(file_path, 'r', encoding='latin-1') as file:
    content = file.read()
print(content)
output_path = '/content/drive/MyDrive/extracted_sections.txt'

# Read the input file (same latin-1 encoding as above)
with open(file_path, 'r', encoding='latin-1') as f:
    lines = f.readlines()

# Find the starting index of the unnecessary part
start_index = -1
for i, line in enumerate(lines):
    if 'ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK' in line:
        start_index = i
        break

# Remove the unnecessary part
if start_index != -1:
    lines = lines[:start_index]

# Write the smaller text file
with open(output_path, 'w') as f:
    f.writelines(lines)

# Read the output file back
with open(output_path, 'r') as f:
    file_contents = f.read()

# Print the contents
print(file_contents)
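# Aside (not in the original notebook): the loop above keeps everything
# *before* ITEM 7A. To pull out a single section instead (e.g. ITEM 7, the
# MD&A), simple string slicing over the full filing works; the exact header
# strings are assumptions and vary between filings.
start = content.find('ITEM 7.')
end = content.find('ITEM 7A.')
if start != -1 and end != -1:
    mdna_section = content[start:end]
    print(f"MD&A section: {len(mdna_section)} characters")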
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.mapreduce import MapReduceChain

# Set up the OpenAI chat model
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")
!pip install tiktoken
import tiktoken

# Helper: count the tokens in a string with tiktoken
def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(llm.model_name)
    return len(encoding.encode(string))

# TARGET_LEN = 300
CHUNK_SIZE = 15_000
CHUNK_OVERLAP = 200
text_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    # Count length in tokens so chunks respect the model's context window
    length_function=llm.get_num_tokens,
)
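# Quick sanity check (not in the original notebook): how many tokens is the
# trimmed filing? This shows whether CHUNK_SIZE=15_000 leaves room in a
# 16k-context model for the prompt and the completion.
print("Token count of trimmed filing:", num_tokens_from_string(file_contents))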
def most_important_terms(transcript, new_summary=False):
    # new file is the same as the old file, but with "summary" as the base name
    # new_file = os.path.join(os.path.dirname(filename), "summary.txt")
    # # skips if it's already been created
    # if os.path.exists(new_file) and not new_summary:
    #     print(f"File \"{filename}\" has already been summarized")
    #     return new_file

    map_template_string = """The text I just fed in was a company's 10k statement, help me summarize the 10k by giving me their company overview or business model, competitive advantage, key financial performance, growth opportunities, and risks. Put these all in bullet points or a child will die.
## 10K ANALYSIS:
< - Company Overview: >
< - Competitive Advantage: >
< - Financial Performance: >
< - Growth Opportunities: >
< - Risks: >
{transcript}
"""
    reduce_template_string = """The text I just fed in was a company's 10k statement, help me summarize the 10k by giving me their company overview or business model, competitive advantage, key financial performance, growth opportunities, and risks. Put these all in bullet points or a child will die.
## 10K ANALYSIS:
< - Company Overview: >
< - Competitive Advantage: >
< - Financial Performance: >
< - Growth Opportunities: >
< - Risks: >
{summaries}
"""
    MAP_PROMPT = PromptTemplate(input_variables=["transcript"], template=map_template_string)
    REDUCE_PROMPT = PromptTemplate(input_variables=["summaries"], template=reduce_template_string)

    # Map step: summarize each chunk of the filing independently
    map_llm_chain = LLMChain(llm=llm, prompt=MAP_PROMPT, verbose=False)
    # Reduce step: combine the per-chunk summaries into one final summary
    reduce_llm_chain = LLMChain(llm=llm, prompt=REDUCE_PROMPT, verbose=False)
    generative_result_reduce_chain = StuffDocumentsChain(
        llm_chain=reduce_llm_chain,
        document_variable_name="summaries",
        verbose=False,
    )
    combine_documents = MapReduceDocumentsChain(
        llm_chain=map_llm_chain,
        combine_document_chain=generative_result_reduce_chain,
        document_variable_name="transcript",
        verbose=False,
    )
    # MapReduceChain splits the input with text_splitter, then runs map/reduce
    map_reduce = MapReduceChain(
        combine_documents_chain=combine_documents,
        text_splitter=text_splitter,
        verbose=False,
    )

    # resp = chain({"input_documents": docs}, return_only_outputs=True)
    resp = map_reduce.run(input_text=transcript)

    # write this to a new file in the same parent folder, with "summary" in the name
    # print(f"Written at location {new_file}")
    # print("output:", resp)
    # with open(new_file, "w+") as f:
    #     f.write(resp)
    # return new_file
    return resp
text = most_important_terms(file_contents)
print(text)
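# Follow-up (not in the original notebook): persist the summary next to the
# source filing, mirroring the commented-out file-writing logic inside
# most_important_terms. The output path is an assumption; adjust as needed.
summary_path = '/content/drive/MyDrive/tsla-20221231-summary.txt'
with open(summary_path, 'w') as f:
    f.write(text)
print(f"Summary written to {summary_path}")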