Spaces:
Sleeping
Sleeping
th8m0z
committed on
Commit
•
f48c06b
1
Parent(s):
0a443bb
decreased chunk sizes + improved summaries
Browse files
- functions.py +7 -4
- semantic_search.py +1 -1
functions.py
CHANGED
@@ -36,7 +36,7 @@ def pdf_to_text(path, start_page=1, end_page=None):
|
|
36 |
return text_list
|
37 |
|
38 |
# converts a text into a list of chunks
|
39 |
-
def text_to_chunks(texts, word_length=
|
40 |
|
41 |
filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
|
42 |
text_toks = [t.split(' ') for t in filtered_texts]
|
@@ -106,7 +106,7 @@ def generate_text(openAI_key, prompt, model="gpt-3.5-turbo"):
|
|
106 |
def construct_prompt(question, openAI_key):
|
107 |
topn_chunks = recommender(question)
|
108 |
|
109 |
-
topn_chunks = summarize_ss_results_if_needed(openAI_key, topn_chunks, model="gpt-
|
110 |
|
111 |
prompt = 'search results:\n\n'
|
112 |
for c in topn_chunks:
|
@@ -162,11 +162,14 @@ def question_answer(chat_history, url, files, question, openAI_key, model):
|
|
162 |
return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
|
163 |
|
164 |
|
165 |
-
def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=
|
166 |
total_tokens = sum(len(chunk.split()) for chunk in chunks)
|
167 |
if total_tokens > token_limit:
|
168 |
print("has to summarize")
|
169 |
-
summary_prompt = "Summarize the following text, while keeping important information, facts and figures. It is also very important to keep the [PDF Number][Page number] notation intact!\n\n"
|
|
|
|
|
|
|
170 |
return generate_text(openAI_key, summary_prompt, model=model)
|
171 |
else:
|
172 |
return chunks
|
|
|
36 |
return text_list
|
37 |
|
38 |
# converts a text into a list of chunks
|
39 |
+
def text_to_chunks(texts, word_length=150, start_page=1, file_number=1):
|
40 |
|
41 |
filtered_texts = [''.join(char for char in text if ord(char) < 128) for text in texts]
|
42 |
text_toks = [t.split(' ') for t in filtered_texts]
|
|
|
106 |
def construct_prompt(question, openAI_key):
|
107 |
topn_chunks = recommender(question)
|
108 |
|
109 |
+
topn_chunks = summarize_ss_results_if_needed(openAI_key, topn_chunks, model="gpt-3.5-turbo")
|
110 |
|
111 |
prompt = 'search results:\n\n'
|
112 |
for c in topn_chunks:
|
|
|
162 |
return f'[ERROR]: Either you do not have access to GPT4 or you have exhausted your quota!'
|
163 |
|
164 |
|
165 |
+
def summarize_ss_results_if_needed(openAI_key, chunks, model, token_limit=8000):
|
166 |
total_tokens = sum(len(chunk.split()) for chunk in chunks)
|
167 |
if total_tokens > token_limit:
|
168 |
print("has to summarize")
|
169 |
+
summary_prompt = "Summarize the following text, while keeping important information, facts and figures. It is also very important to keep the [PDF Number][Page number] notation intact!\n\n"
|
170 |
+
for c in chunks:
|
171 |
+
summary_prompt += c + '\n\n'
|
172 |
+
print(summary_prompt)
|
173 |
return generate_text(openAI_key, summary_prompt, model=model)
|
174 |
else:
|
175 |
return chunks
|
semantic_search.py
CHANGED
@@ -10,7 +10,7 @@ class SemanticSearch:
|
|
10 |
self.fitted = False
|
11 |
|
12 |
# fits the recommender
|
13 |
-
def fit(self, data, batch=1000, n_neighbors=
|
14 |
self.data = data
|
15 |
self.embeddings = self.get_text_embedding(data, batch=batch)
|
16 |
n_neighbors = min(n_neighbors, len(self.embeddings))
|
|
|
10 |
self.fitted = False
|
11 |
|
12 |
# fits the recommender
|
13 |
+
def fit(self, data, batch=1000, n_neighbors=5):
|
14 |
self.data = data
|
15 |
self.embeddings = self.get_text_embedding(data, batch=batch)
|
16 |
n_neighbors = min(n_neighbors, len(self.embeddings))
|