Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -90,27 +90,42 @@ def format_docs(docs):
|
|
| 90 |
return "\n\n".join(doc.page_content for doc in docs)
|
| 91 |
|
| 92 |
def process_pdf(uploaded_file):
|
| 93 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
try:
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
except Exception as e:
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
semantic_splitter = SemanticChunker(
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
)
|
| 112 |
-
|
| 113 |
-
docs = semantic_splitter.split_documents(documents)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
vector_db = Chroma.from_documents(documents=docs,
|
| 115 |
embedding=st.session_state.embeddings)
|
| 116 |
|
|
|
|
| 90 |
return "\n\n".join(doc.page_content for doc in docs)
|
| 91 |
|
| 92 |
def process_pdf(uploaded_file):
|
| 93 |
+
# with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
|
| 94 |
+
# tmp_file.write(uploaded_file.getvalue())
|
| 95 |
+
# tmp_file_path = tmp_file.name
|
| 96 |
+
|
| 97 |
+
# try:
|
| 98 |
+
# loader = PyPDFLoader(tmp_file_path)
|
| 99 |
+
# documents = loader.load()
|
| 100 |
+
# except Exception as e:
|
| 101 |
+
# st.error(f"Đọc file thất bại: {e}")
|
| 102 |
+
# return None, 0
|
| 103 |
+
|
| 104 |
+
# semantic_splitter = SemanticChunker(
|
| 105 |
+
# embeddings=st.session_state.embeddings,
|
| 106 |
+
# buffer_size=1, # number of neighbouring sentences grouped with each sentence before it is embedded
|
| 107 |
+
# breakpoint_threshold_type='percentile', # pick breakpoints by percentile of the embedding distances between adjacent sentence groups
|
| 108 |
+
# breakpoint_threshold_amount=95, # split where the distance to the next sentence group exceeds the 95th percentile of all distances
|
| 109 |
+
# min_chunk_size=500,
|
| 110 |
+
# add_start_index=True, # record each chunk's start offset within the source document in its metadata
|
| 111 |
+
# )
|
| 112 |
+
|
| 113 |
+
# docs = semantic_splitter.split_documents(documents)
|
| 114 |
+
df = pd.read_excel("chunk_metadata_template.xlsx")
|
| 115 |
+
docs = []
|
| 116 |
+
|
| 117 |
+
# Build the list of Document objects, each carrying its metadata
|
| 118 |
+
for _, row in df.iterrows():
|
| 119 |
+
chunk_with_metadata = Document(
|
| 120 |
+
page_content=row['page_content'],
|
| 121 |
+
metadata={
|
| 122 |
+
'chunk_id': row['chunk_id'],
|
| 123 |
+
'document_title': row['document_title']
|
| 124 |
+
# 'topic': row['topic'],
|
| 125 |
+
# 'stakeholder': row['stakeholder']
|
| 126 |
+
}
|
| 127 |
+
)
|
| 128 |
+
docs.append(chunk_with_metadata)
|
| 129 |
vector_db = Chroma.from_documents(documents=docs,
|
| 130 |
embedding=st.session_state.embeddings)
|
| 131 |
|