# -*- coding: utf-8 -*-
"""10K Summarizer.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1XARaeZuvM_7ZX_03zv4okGJg2O2SAN8q
"""
!pip install langchain
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
"""## LANGCHAIN
https://python.langchain.com/docs/get_started/introduction.html
##
- Import your model from lanchain
`
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")
`
- Most important paper is scientific statement
- Create a prompt template
- Create a textsplitter
ISSUE: DOCUMENTS ARE TOO LONG
SOLUTION: SPLIT THE DOCUMENTS
- give a max token length, and it will make sure it doesn't go over that length
- if you're confused about steps here, look here: /~https://github.com/KellisLab/meeting-summarizer-v2/blob/main/summarize.py
- split the text
- turn it into "Documents"
- Create a prompt
- Put the prompt inside an LLMChain https://python.langchain.com/docs/modules/chains/foundational/llm_chain
- Run the chain over the text
GOOGLE PROMPT ENGINEERING STRATEGIES
TALK TO CHAT GPT AS MUCH AS POSSIBLE
## ONCE YOUR LANGCHAIN WORKS
TALK TO WILL BEFORE YOU RUN THROUGH ALLL THE GRANT PROPOSALS
- Find the grant proposals in dropbox
- Run it on that text, create a dictionary for the lab
"""
import os
# Never hard-code a real API key in a shared notebook; fill in your own key here.
os.environ['OPENAI_API_KEY'] = "YOUR_OPENAI_API_KEY"
!pip install openai
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500, chunk_overlap=100, separators=[" ", ",", ".", "\n"]
)
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')
!pip install python-docx

# List the contents of the MyDrive folder
folder_path = '/content/drive/MyDrive/'
contents = os.listdir(folder_path)
print(contents)
file_path = '/content/drive/MyDrive/tsla-20221231-gen.txt'
with open(file_path, 'r', encoding='latin-1') as file:
    content = file.read()
print(content)
output_path = '/content/drive/MyDrive/extracted_sections.txt'

# Read the input file (same latin-1 encoding as above)
with open(file_path, 'r', encoding='latin-1') as f:
    lines = f.readlines()

# Find the starting index of the unnecessary part
start_index = -1
for i, line in enumerate(lines):
    if 'ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK' in line:
        start_index = i
        break

# Remove the unnecessary part
if start_index != -1:
    lines = lines[:start_index]

# Write the smaller text file
with open(output_path, 'w') as f:
    f.writelines(lines)

# Read the output file back
with open(output_path, 'r') as f:
    file_contents = f.read()

# Print the contents
print(file_contents)
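# Aside (not in the original notebook): the loop above keeps everything
# *before* ITEM 7A. To pull out a single section instead (e.g. ITEM 7, the
# MD&A), simple string slicing over the full filing works; the exact header
# strings are assumptions and vary between filings.
start = content.find('ITEM 7.')
end = content.find('ITEM 7A.')
if start != -1 and end != -1:
    mdna_section = content[start:end]
    print(f"MD&A section: {len(mdna_section)} characters")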
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.mapreduce import MapReduceChain

# Set up the OpenAI chat model
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")
!pip install tiktoken
import tiktoken

# Helper: count the tokens in a string with tiktoken
def num_tokens_from_string(string: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(llm.model_name)
    return len(encoding.encode(string))

# TARGET_LEN = 300
CHUNK_SIZE = 15_000
CHUNK_OVERLAP = 200
text_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    # Count length in tokens so chunks respect the model's context window
    length_function=llm.get_num_tokens,
)
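# Quick sanity check (not in the original notebook): how many tokens is the
# trimmed filing? This shows whether CHUNK_SIZE=15_000 leaves room in a
# 16k-context model for the prompt and the completion.
print("Token count of trimmed filing:", num_tokens_from_string(file_contents))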
def most_important_terms(transcript, new_summary=False):
    # new file is the same as the old file, but with "summary" as the base name
    # new_file = os.path.join(os.path.dirname(filename), "summary.txt")
    # # skips if it's already been created
    # if os.path.exists(new_file) and not new_summary:
    #     print(f"File \"{filename}\" has already been summarized")
    #     return new_file

    map_template_string = """The text I just fed in was a company's 10k statement, help me summarize the 10k by giving me their company overview or business model, competitive advantage, key financial performance, growth opportunities, and risks. Put these all in bullet points or a child will die.
## 10K ANALYSIS:
< - Company Overview: >
< - Competitive Advantage: >
< - Financial Performance: >
< - Growth Opportunities: >
< - Risks: >
{transcript}
"""
    reduce_template_string = """The text I just fed in was a company's 10k statement, help me summarize the 10k by giving me their company overview or business model, competitive advantage, key financial performance, growth opportunities, and risks. Put these all in bullet points or a child will die.
## 10K ANALYSIS:
< - Company Overview: >
< - Competitive Advantage: >
< - Financial Performance: >
< - Growth Opportunities: >
< - Risks: >
{summaries}
"""
    MAP_PROMPT = PromptTemplate(input_variables=["transcript"], template=map_template_string)
    REDUCE_PROMPT = PromptTemplate(input_variables=["summaries"], template=reduce_template_string)

    # Map step: summarize each chunk of the filing independently
    map_llm_chain = LLMChain(llm=llm, prompt=MAP_PROMPT, verbose=False)
    # Reduce step: combine the per-chunk summaries into one final summary
    reduce_llm_chain = LLMChain(llm=llm, prompt=REDUCE_PROMPT, verbose=False)
    generative_result_reduce_chain = StuffDocumentsChain(
        llm_chain=reduce_llm_chain,
        document_variable_name="summaries",
        verbose=False,
    )
    combine_documents = MapReduceDocumentsChain(
        llm_chain=map_llm_chain,
        combine_document_chain=generative_result_reduce_chain,
        document_variable_name="transcript",
        verbose=False,
    )
    # MapReduceChain splits the input with text_splitter, then runs map/reduce
    map_reduce = MapReduceChain(
        combine_documents_chain=combine_documents,
        text_splitter=text_splitter,
        verbose=False,
    )

    # resp = chain({"input_documents": docs}, return_only_outputs=True)
    resp = map_reduce.run(input_text=transcript)

    # write this to a new file in the same parent folder, with "summary" in the name
    # print(f"Written at location {new_file}")
    # print("output:", resp)
    # with open(new_file, "w+") as f:
    #     f.write(resp)
    # return new_file
    return resp
text = most_important_terms(file_contents)
print(text)
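# Follow-up (not in the original notebook): persist the summary next to the
# source filing, mirroring the commented-out file-writing logic inside
# most_important_terms. The output path is an assumption; adjust as needed.
summary_path = '/content/drive/MyDrive/tsla-20221231-summary.txt'
with open(summary_path, 'w') as f:
    f.write(text)
print(f"Summary written to {summary_path}")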