Upload 2 files

- app.py (+9, -3)
- process_documents.py (+23, -23)
app.py
CHANGED

@@ -12,6 +12,8 @@ import base64
 
 st.set_page_config(layout="wide")
 os.environ["OPENAI_API_KEY"] = "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"
+os.environ["LANGCHAIN_TRACING_V2"] = "true"
+os.environ["LANGCHAIN_API_KEY"] = "ls__aca2f2f97d2f4b9caef0ef75c3c33f9d"
 
 get_references = lambda relevant_docs: " ".join(
     [f"[{ref}]" for ref in sorted([ref.metadata["chunk_id"] for ref in relevant_docs])]
@@ -46,14 +48,18 @@ def process_documents_wrapper(inputs):
     snippets = process_documents(inputs)
     st.session_state.retriever = create_retriever(snippets)
     st.session_state.source_doc_urls = inputs
-    st.session_state.index = [
+    st.session_state.index = [
+        [snip.metadata["chunk_id"], snip.metadata["header"]] for snip in snippets
+    ]
     response = f"Uploaded and processed documents {inputs}"
     st.session_state.messages.append((f"/upload {inputs}", response, ""))
     return response
 
 
 def index_documents_wrapper(inputs=None):
-    response = pd.
+    response = pd.DataFrame(
+        st.session_state.index, columns=["id", "reference"]
+    ).to_markdown()
     st.session_state.messages.append(("/index", response, ""))
     return response
 
@@ -173,4 +179,4 @@ if __name__ == "__main__":
         default_function=query_llm_wrapper,
         all_commands=all_commands,
     )
-    boot(command_center)
+    boot(command_center)
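For reference, the reworked `/index` command simply renders the stored chunk index as a markdown table. A minimal sketch of that rendering outside Streamlit, with made-up rows in the same [chunk_id, header] shape that app.py now keeps in st.session_state.index (pandas' to_markdown() relies on the tabulate package being installed):

```python
import pandas as pd

# Hypothetical index rows: one [chunk_id, header] pair per snippet,
# as built in process_documents_wrapper after this change.
index = [
    ["0_0", "Introduction"],
    ["0_1", "Related Work"],
    ["1_0", "Example Web Page Title"],
]

# Same rendering used by index_documents_wrapper: a two-column markdown table.
print(pd.DataFrame(index, columns=["id", "reference"]).to_markdown())
```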
process_documents.py
CHANGED

@@ -10,17 +10,32 @@ deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()
 
 def process_documents(urls):
     snippets = []
-    for url in urls:
+    for source_id, url in enumerate(urls):
         if url.endswith(".pdf"):
-            snippets.extend(process_pdf(url))
+            snippets.extend(process_pdf(url, source_id))
         else:
-            snippets.extend(process_web(url))
-    for e, snippet in enumerate(snippets):
-        snippet.metadata["chunk_id"] = e
+            snippets.extend(process_web(url, source_id))
     return snippets
 
 
-def process_pdf(url):
+def process_web(url, source_id):
+    data = WebBaseLoader(url).load()[0]
+    document_snippets = [
+        Document(
+            page_content=deep_strip(data.page_content),
+            metadata={
+                "header": data.metadata["title"],
+                "source_url": url,
+                "source_type": "web",
+                "chunk_id": f"{source_id}_0",
+                "source_id": source_id,
+            },
+        )
+    ]
+    return document_snippets
+
+
+def process_pdf(url, source_id):
     data = PDFMinerPDFasHTMLLoader(url).load()[0]
     content = BeautifulSoup(data.page_content, "html.parser").find_all("div")
     snippets = get_pdf_snippets(content)
@@ -36,7 +51,8 @@ def process_pdf(url):
                 "header": " ".join(snip[1]["header_text"].split()[:10]),
                 "source_url": url,
                 "source_type": "pdf",
-                "chunk_id": i,
+                "chunk_id": f"{source_id}_{i}",
+                "source_id": source_id,
             },
         )
         for i, snip in enumerate(semantic_snippets)
@@ -123,19 +139,3 @@ def get_pdf_semantic_snippets(filtered_snippets, median_font_size):
         }
         semantic_snippets.append((current_content, metadata))
     return semantic_snippets
-
-
-def process_web(url):
-    data = WebBaseLoader(url).load()[0]
-    document_snippets = [
-        Document(
-            page_content=deep_strip(data.page_content),
-            metadata={
-                "header": data.metadata["title"],
-                "source_url": url,
-                "source_type": "web",
-                "chunk_id": 0,
-            },
-        )
-    ]
-    return document_snippets
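The net effect in process_documents.py is that chunk IDs are now namespaced per source document (f"{source_id}_{i}") and every snippet carries a source_id, instead of a single running integer being assigned across all sources after the fact. A minimal sketch of the numbering scheme, with plain dicts standing in for the LangChain Document objects and invented URLs and chunk counts:

```python
# Stand-in for the per-source chunk numbering used after this change.
# Plain dicts replace Document objects; URLs and chunk counts are made up.
def fake_process(url, source_id, n_chunks):
    return [
        {"source_url": url, "source_id": source_id, "chunk_id": f"{source_id}_{i}"}
        for i in range(n_chunks)
    ]

urls = ["https://example.com/paper.pdf", "https://example.com/post"]
snippets = []
for source_id, url in enumerate(urls):
    snippets.extend(fake_process(url, source_id, n_chunks=2))

for snip in snippets:
    print(snip["chunk_id"], snip["source_url"])
# 0_0 https://example.com/paper.pdf
# 0_1 https://example.com/paper.pdf
# 1_0 https://example.com/post
# 1_1 https://example.com/post
```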