Spaces:

LunaticMaestro
/

rag

Sleeping

rag / z_document_reader.py

Deepak Sahu

adding vector store

2fe32bb 11 months ago

2.03 kB

	# Step 1: create the document reader.

	import re
	from typing import List, Tuple, Union

	from bs4 import BeautifulSoup
	from langchain.docstore.document import Document as LangchainDocument


	def read_wiki_html(filename: str) -> Union[Tuple[List, List], str]:
	    """
	    Read an HTML file and split its <body> into text and figure documents.

	    All text of the <body> tag becomes a single document, and every
	    <figure> tag inside the body becomes one document holding its caption,
	    with the figure's link target stored as metadata.

	    Args:
	        filename (str): The path to the HTML file (read as UTF-8).

	    Returns:
	        On success, a tuple ``(TEXT_KB, FIG_KB)``:
	            TEXT_KB: list with one LangchainDocument whose page_content is
	                the body text, with runs of newlines collapsed to one.
	            FIG_KB: list of LangchainDocument, one per <figure>.
	                page_content is the caption ('No figcaption' if the figure
	                has none) and metadata["url"] is the href of the first <a>
	                in the figure ('No href' if there is none).
	        On failure, a string starting with "Error: " (missing file, missing
	        <body> tag, or any other exception). Nothing is raised, so callers
	        should check for a ``str`` result before unpacking.
	    """
	    try:
	        with open(filename, 'r', encoding='utf-8') as file:
	            content = file.read()

	        # Parse the HTML content
	        soup = BeautifulSoup(content, 'html.parser')

	        # Focus only on the <body> tag
	        body = soup.body
	        if body is None:
	            return "Error: No <body> tag found in the HTML file."

	        body_text = re.sub(r'\n+', '\n', body.get_text(separator="\n").strip())
	        TEXT_KB = [LangchainDocument(page_content=body_text)]

	        # One document per <figure>: caption as content, link target as metadata
	        FIG_KB = []
	        for figure in body.find_all('figure'):
	            link = figure.find('a')
	            href = link.get('href', 'No href') if link is not None else 'No href'

	            caption_tag = figure.find('figcaption')
	            if caption_tag is None:
	                figcaption = 'No figcaption'
	            else:
	                # get_text(strip=True) strips every text fragment and joins
	                # them with "", which fuses words around inline tags such as
	                # links. Take the raw text and normalise whitespace instead.
	                figcaption = ' '.join(caption_tag.get_text().split())

	            FIG_KB.append(
	                LangchainDocument(page_content=figcaption, metadata={"url": href})
	            )

	        return (TEXT_KB, FIG_KB)

	    except FileNotFoundError:
	        return f"Error: File '{filename}' not found."
	    except Exception as e:
	        # Deliberate best-effort: every other failure is reported as a string.
	        return f"Error: {str(e)}"

	if __name__ == "__main__":
	    # Manual run: parse the locally saved Wikipedia page.
	    contents = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")
	    # read_pdf()