Spaces:
Runtime error
Runtime error
changes to app and sambaparse
Browse files
- utils/parsing/sambaparse.py +11 -0
utils/parsing/sambaparse.py
CHANGED
|
@@ -8,6 +8,7 @@ from dotenv import load_dotenv
|
|
| 8 |
from langchain.docstore.document import Document
|
| 9 |
import shutil
|
| 10 |
from langchain_community.document_loaders import PyMuPDFLoader
|
|
|
|
| 11 |
|
| 12 |
load_dotenv()
|
| 13 |
|
|
@@ -303,6 +304,16 @@ class SambaParse:
|
|
| 303 |
loader = PyMuPDFLoader(file_path)
|
| 304 |
docs = loader.load()
|
| 305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
for doc in docs:
|
| 307 |
text = doc.page_content
|
| 308 |
metadata = doc.metadata
|
|
|
|
| 8 |
from langchain.docstore.document import Document
|
| 9 |
import shutil
|
| 10 |
from langchain_community.document_loaders import PyMuPDFLoader
|
| 11 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 12 |
|
| 13 |
load_dotenv()
|
| 14 |
|
|
|
|
| 304 |
loader = PyMuPDFLoader(file_path)
|
| 305 |
docs = loader.load()
|
| 306 |
|
| 307 |
+
splitter = RecursiveCharacterTextSplitter(
|
| 308 |
+
chunk_size=1000,
|
| 309 |
+
chunk_overlap=200,
|
| 310 |
+
length_function=len,
|
| 311 |
+
separators=['\n\n', '\n', ' ', ''],
|
| 312 |
+
is_separator_regex=False,
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
docs = splitter.split_documents(docs)
|
| 316 |
+
|
| 317 |
for doc in docs:
|
| 318 |
text = doc.page_content
|
| 319 |
metadata = doc.metadata
|