minor fixes
- app.py +10 -3
- process_documents.py +5 -0
app.py
CHANGED

@@ -2,7 +2,7 @@ import streamlit as st
 import os
 import pandas as pd
 from command_center import CommandCenter
-from process_documents import process_documents
+from process_documents import process_documents, num_tokens
 from embed_documents import create_retriever
 import json
 from langchain.callbacks import get_openai_callback

@@ -59,7 +59,12 @@ def process_documents_wrapper(inputs):
     st.session_state.retriever = create_retriever(snippets)
     st.session_state.source_doc_urls = inputs
     st.session_state.index = [
-        [
+        [
+            snip.metadata["chunk_id"],
+            snip.metadata["header"],
+            num_tokens(snip.page_content),
+        ]
+        for snip in snippets
     ]
     response = f"Uploaded and processed documents {inputs}"
     st.session_state.messages.append((f"/add-papers {inputs}", response, "identity"))

@@ -68,7 +73,9 @@ def process_documents_wrapper(inputs):


 def index_documents_wrapper(inputs=None):
-    response = pd.DataFrame(
+    response = pd.DataFrame(
+        st.session_state.index, columns=["id", "reference", "tokens"]
+    )
     st.session_state.messages.append(("/library", response, "dataframe"))
     return (response, "dataframe")
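With these changes, each processed snippet contributes a [chunk_id, header, token count] row to st.session_state.index, and the /library command renders that index as a DataFrame with id/reference/tokens columns. A minimal sketch of the resulting table, using invented snippet values purely for illustration:

import pandas as pd

# Rows in the [chunk_id, header, token-count] shape built by
# process_documents_wrapper; the values below are made up.
index = [
    ["1.0", "Introduction", 312],
    ["2.1", "Related Work", 508],
]

# Same construction as the updated index_documents_wrapper
library = pd.DataFrame(index, columns=["id", "reference", "tokens"])
print(library)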
process_documents.py
CHANGED

@@ -4,6 +4,7 @@ from statistics import median
 from bs4 import BeautifulSoup
 from langchain.docstore.document import Document
 from langchain.document_loaders import PDFMinerPDFasHTMLLoader, WebBaseLoader
+import tiktoken

 deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()

@@ -153,3 +154,7 @@ def get_pdf_semantic_snippets(filtered_snippets, median_font_size):
     }
     semantic_snippets.append((current_content, metadata))
     return semantic_snippets
+
+
+def num_tokens(string):
+    return len(tiktoken.get_encoding("cl100k_base").encode(string))