LISA-demo / documents.py
"""
Parse documents; currently PDF and XML are supported.
"""
import os
from langchain.document_loaders import (
PyMuPDFLoader,
)
from langchain.docstore.document import Document
from langchain.text_splitter import (
# RecursiveCharacterTextSplitter,
SpacyTextSplitter,
)


def load_pdf_as_docs(pdf_path, loader_module=None, load_kwargs=None):
    """Load and parse PDF file(s) from a single file path or a directory."""
if pdf_path.endswith(".pdf"): # single file
pdf_docs = [pdf_path]
else: # a directory
pdf_docs = [
os.path.join(pdf_path, f)
for f in os.listdir(pdf_path)
if f.endswith(".pdf")
]
if load_kwargs is None:
load_kwargs = {}
docs = []
    if loader_module is None:  # default to PyMuPDF if no loader is given
        loader_module = PyMuPDFLoader
    for pdf in pdf_docs:
        loader = loader_module(pdf, **load_kwargs)
        doc = loader.load()  # PyMuPDFLoader returns one Document per page
        docs.extend(doc)
    return docs
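

# Usage sketch (the "papers/" path is a placeholder, not part of this module):
#
#     docs = load_pdf_as_docs("papers/")           # every .pdf in the directory
#     docs = load_pdf_as_docs("papers/paper.pdf")  # or a single file
#
# A different langchain loader class can be passed as loader_module, with its
# keyword arguments forwarded via load_kwargs.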


def load_xml_as_docs(xml_path, loader_module=None, load_kwargs=None):
    """Load and parse XML file(s) from a single file path or a directory.

    loader_module and load_kwargs are accepted for symmetry with
    load_pdf_as_docs but are currently unused.
    """
    # Imported lazily since these are only needed for XML parsing
    from bs4 import BeautifulSoup
    from unstructured.cleaners.core import group_broken_paragraphs
if xml_path.endswith(".xml"): # single file
xml_docs = [xml_path]
else: # a directory
xml_docs = [
os.path.join(xml_path, f)
for f in os.listdir(xml_path)
if f.endswith(".xml")
]
if load_kwargs is None:
load_kwargs = {}
docs = []
    for xml_file in xml_docs:
        with open(xml_file) as fp:
            # Parse the XML file into a BeautifulSoup tree
            soup = BeautifulSoup(fp, features="xml")
        page_text = soup.find_all(string=True)  # all text nodes in the document
        parsed_text = "\n".join(page_text)  # " ".join appears to work similarly
        # Merge paragraphs that were broken across lines during extraction
        parsed_text_grouped = group_broken_paragraphs(parsed_text)
        # Try to extract author and title metadata from the TEI header
        try:
            from lxml import etree as ET

            tree = ET.parse(xml_file)
            # TEI namespace
            ns = {"tei": "http://www.tei-c.org/ns/1.0"}
            # Read author personal names as an example
            pers_name_elements = tree.xpath(
                "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/tei:persName",
                namespaces=ns,
            )
            first_per = pers_name_elements[0].text
            author_info = first_per + " et al."
            title_elements = tree.xpath(
                "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title", namespaces=ns
            )
            title = title_elements[0].text
            # Combine author and title into a single source string
            source_info = "_".join([author_info, title])
        except Exception:  # missing header fields, unparsable XML, etc.
            source_info = "unknown"
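        # For reference, the TEI header shape these XPaths expect (as produced
        # by tools such as GROBID; element names taken from the queries above):
        #
        #   <TEI xmlns="http://www.tei-c.org/ns/1.0">
        #     <teiHeader><fileDesc><titleStmt>
        #       <title>Example paper title</title>
        #       <author><persName>Jane Doe</persName></author>
        #     </titleStmt></fileDesc></teiHeader>
        #   </TEI>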
        # Possibly an even better parsing approach; TODO: discuss with TUD
# first_author = soup.find("author")
# publication_year = soup.find("date", attrs={'type': 'published'})
# title = soup.find("title")
# source_info = [first_author, publication_year, title]
# source_info_str = "_".join([info.text.strip() if info is not None else "unknown" for info in source_info])
        docs.append(
            Document(page_content=parsed_text_grouped, metadata={"source": source_info})
        )
return docs
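

# Usage sketch (the "xml/" path is a placeholder; files are assumed to be TEI
# XML, e.g. from GROBID):
#
#     xml_docs = load_xml_as_docs("xml/")
#     print(xml_docs[0].metadata["source"])  # e.g. "Doe et al._Some title", or "unknown"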


def get_doc_chunks(docs, splitter=None):
    """Split docs into chunks."""
    if splitter is None:
        # splitter = RecursiveCharacterTextSplitter(  # original default
        #     # separators=["\n\n", "\n"], chunk_size=1024, chunk_overlap=256
        #     separators=["\n\n", "\n"], chunk_size=256, chunk_overlap=128
        # )
        # SpacyTextSplitter splits on sentence boundaries, which seems to give
        # cleaner chunks; sizes below are measured in tiktoken tokens
        splitter = SpacyTextSplitter.from_tiktoken_encoder(
            chunk_size=512,
            chunk_overlap=128,
        )
chunks = splitter.split_documents(docs)
return chunks
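

# Minimal end-to-end sketch (illustrative: "papers/" is a placeholder path, and
# the default SpacyTextSplitter additionally needs spaCy with its
# "en_core_web_sm" model, e.g. `python -m spacy download en_core_web_sm`):
if __name__ == "__main__":
    documents = load_pdf_as_docs("papers/")  # one Document per PDF page
    chunks = get_doc_chunks(documents)  # ~512-token chunks, 128-token overlap
    print(f"Loaded {len(documents)} page(s), produced {len(chunks)} chunk(s).")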