"""Parse documents; currently PDF and XML are supported."""

import os

from langchain.document_loaders import PyMuPDFLoader
from langchain.docstore.document import Document
from langchain.text_splitter import SpacyTextSplitter


def load_pdf_as_docs(pdf_path, loader_module=None, load_kwargs=None):
    """Load and parse PDF file(s) from a file path or a directory."""
    if pdf_path.endswith(".pdf"):
        # Single PDF file.
        pdf_docs = [pdf_path]
    else:
        # Treat the path as a directory and collect every PDF inside it.
        pdf_docs = [
            os.path.join(pdf_path, f)
            for f in os.listdir(pdf_path)
            if f.endswith(".pdf")
        ]

    if load_kwargs is None:
        load_kwargs = {}

    if loader_module is None:
        loader_module = PyMuPDFLoader  # default PDF parser

    docs = []
    for pdf in pdf_docs:
        loader = loader_module(pdf, **load_kwargs)
        docs.extend(loader.load())

    return docs
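
# Usage sketch for load_pdf_as_docs (the "papers/" path is hypothetical;
# point it at your own file or directory):
#
#     docs = load_pdf_as_docs("papers/")           # every PDF in a directory
#     docs = load_pdf_as_docs("papers/intro.pdf")  # a single file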


def load_xml_as_docs(xml_path, loader_module=None, load_kwargs=None):
    """Load and parse XML file(s) from a file path or a directory."""
    from bs4 import BeautifulSoup
    from unstructured.cleaners.core import group_broken_paragraphs

    if xml_path.endswith(".xml"):
        # Single XML file.
        xml_docs = [xml_path]
    else:
        # Treat the path as a directory and collect every XML file inside it.
        xml_docs = [
            os.path.join(xml_path, f)
            for f in os.listdir(xml_path)
            if f.endswith(".xml")
        ]

    if load_kwargs is None:
        load_kwargs = {}

    docs = []
    for xml_file in xml_docs:
        with open(xml_file, encoding="utf-8") as fp:
            soup = BeautifulSoup(fp, features="xml")
        # Extract all text nodes, then rejoin paragraphs that were broken
        # across lines.
        page_text = soup.find_all(string=True)
        parsed_text = "\n".join(page_text)
        parsed_text_grouped = group_broken_paragraphs(parsed_text)

        # Derive a "source" label from the TEI header (first author plus
        # title); fall back to "unknown" if the metadata is absent.
        try:
            from lxml import etree as ET

            tree = ET.parse(xml_file)
            ns = {"tei": "http://www.tei-c.org/ns/1.0"}

            pers_name_elements = tree.xpath(
                "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:author/tei:persName",
                namespaces=ns,
            )
            author_info = pers_name_elements[0].text + " et al."

            title_elements = tree.xpath(
                "tei:teiHeader/tei:fileDesc/tei:titleStmt/tei:title",
                namespaces=ns,
            )
            title = title_elements[0].text

            source_info = "_".join([author_info, title])
        except Exception:
            source_info = "unknown"

        docs.append(
            Document(
                page_content=parsed_text_grouped,
                metadata={"source": source_info},
            )
        )

    return docs
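
# Usage sketch for load_xml_as_docs ("tei/" is a hypothetical directory of
# TEI XML files, matching the TEI namespace queried above):
#
#     docs = load_xml_as_docs("tei/")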


def get_doc_chunks(docs, splitter=None):
    """Split documents into chunks."""
    if splitter is None:
        # Default: spaCy sentence segmentation with chunk sizes measured in
        # tiktoken tokens (512-token chunks, 128-token overlap).
        splitter = SpacyTextSplitter.from_tiktoken_encoder(
            chunk_size=512,
            chunk_overlap=128,
        )
    chunks = splitter.split_documents(docs)

    return chunks
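

if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming a hypothetical "papers/" directory
    # of PDFs; swap in load_xml_as_docs for XML input.
    docs = load_pdf_as_docs("papers/")
    chunks = get_doc_chunks(docs)
    print(f"Loaded {len(docs)} pages and produced {len(chunks)} chunks")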