"""Parse documents; currently PDF and XML are supported."""

import os

from langchain.document_loaders import PyMuPDFLoader
from langchain.docstore.document import Document
from langchain.text_splitter import SpacyTextSplitter


def load_pdf_as_docs(pdf_path, loader_module=None, load_kwargs=None):
    """Load and parse PDF file(s) from a file path or a directory."""
    if pdf_path.endswith(".pdf"):
        # Single PDF file.
        pdf_docs = [pdf_path]
    else:
        # Treat the path as a directory and collect every PDF inside it.
        pdf_docs = [
            os.path.join(pdf_path, f)
            for f in os.listdir(pdf_path)
            if f.endswith(".pdf")
        ]

    if load_kwargs is None:
        load_kwargs = {}

    if loader_module is None:
        loader_module = PyMuPDFLoader  # default PDF parser

    docs = []
    for pdf in pdf_docs:
        loader = loader_module(pdf, **load_kwargs)
        docs.extend(loader.load())

    return docs
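
# Usage sketch for load_pdf_as_docs (the "papers/" path is hypothetical;
# point it at your own file or directory):
#
#     docs = load_pdf_as_docs("papers/")           # every PDF in a directory
#     docs = load_pdf_as_docs("papers/intro.pdf")  # a single file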


def load_xml_as_docs(xml_path, loader_module=None, load_kwargs=None):
    """Load and parse XML file(s) from a file path or a directory."""
    from bs4 import BeautifulSoup
    from unstructured.cleaners.core import group_broken_paragraphs

    if xml_path.endswith(".xml"):
        # Single XML file.
        xml_docs = [xml_path]
    else:
        # Treat the path as a directory and collect every XML file inside it.
        xml_docs = [
            os.path.join(xml_path, f)
            for f in os.listdir(xml_path)
            if f.endswith(".xml")
        ]

    if load_kwargs is None:
        load_kwargs = {}

    docs = []
    for xml_file in xml_docs:
        with open(xml_file, encoding="utf-8") as fp:
            soup = BeautifulSoup(fp, features="xml")
        # Extract all text nodes, then rejoin paragraphs that were broken
        # across lines.
        page_text = soup.find_all(string=True)
        parsed_text = "\n".join(page_text)
        parsed_text_grouped = group_broken_paragraphs(parsed_text)

        # Derive a "source" label from the TEI header (first author plus
        # title); fall back to "unknown" if the metadata is absent.
        try:
            from lxml import etree as ET

            tree = ET.parse(xml_file)
            ns = {"tei": "http://www.tei-c.org/ns/1.0"}

            pers_name_elements = tree.xpath(
                "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/tei:persName",
                namespaces=ns,
            )
            author_info = pers_name_elements[0].text + " et al."

            title_elements = tree.xpath(
                "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title",
                namespaces=ns,
            )
            title = title_elements[0].text

            source_info = "_".join([author_info, title])
        except Exception:
            source_info = "unknown"

        docs.append(
            Document(
                page_content=parsed_text_grouped,
                metadata={"source": source_info},
            )
        )

    return docs
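
# Usage sketch for load_xml_as_docs ("tei/" is a hypothetical directory of
# TEI XML files, matching the TEI namespace queried above):
#
#     docs = load_xml_as_docs("tei/")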


def get_doc_chunks(docs, splitter=None):
    """Split documents into chunks."""
    if splitter is None:
        # Default: spaCy sentence segmentation with chunk sizes measured in
        # tiktoken tokens (512-token chunks, 128-token overlap).
        splitter = SpacyTextSplitter.from_tiktoken_encoder(
            chunk_size=512,
            chunk_overlap=128,
        )
    chunks = splitter.split_documents(docs)

    return chunks
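

if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming a hypothetical "papers/" directory
    # of PDFs; swap in load_xml_as_docs for XML input.
    docs = load_pdf_as_docs("papers/")
    chunks = get_doc_chunks(docs)
    print(f"Loaded {len(docs)} pages and produced {len(chunks)} chunks")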