Spaces:
Sleeping
Sleeping
# Document reader: loads a saved Wikipedia HTML page into LangChain documents.
import re
from typing import List, Tuple, Union

from bs4 import BeautifulSoup
from langchain.docstore.document import Document as LangchainDocument
def read_wiki_html(
    filename: str,
) -> Union[Tuple[List["LangchainDocument"], List["LangchainDocument"]], str]:
    """Read a saved HTML page and split it into text and figure documents.

    The file is parsed with BeautifulSoup and only the ``<body>`` element is
    considered. The body's visible text (runs of newlines collapsed to a
    single newline) becomes one document; every ``<figure>`` element becomes
    one document whose content is its ``<figcaption>`` text and whose
    ``url`` metadata is the ``href`` of the first ``<a>`` inside the figure.

    Args:
        filename: Path to the HTML file (read as UTF-8).

    Returns:
        On success, a tuple ``(TEXT_KB, FIG_KB)``:
            TEXT_KB: single-element list holding the body text document.
            FIG_KB: list with one document per ``<figure>``; the placeholders
                ``'No figcaption'`` / ``'No href'`` are used when the caption
                or link is missing.
        On failure, an error message string starting with ``"Error: "``
        (missing file, missing ``<body>``, or any other exception). Callers
        must check the result type before unpacking it.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read()

        # Parse the HTML and restrict processing to the <body> element.
        soup = BeautifulSoup(content, 'html.parser')
        body = soup.body
        if body is None:
            return "Error: No <body> tag found in the HTML file."

        # One text node per line, then squeeze blank lines left by the markup.
        body_text = re.sub(r'\n+', '\n', body.get_text(separator="\n").strip())
        text_kb = [LangchainDocument(page_content=body_text)]

        # One document per <figure>: caption as content, link as metadata.
        fig_kb = []
        for figure in body.find_all('figure'):
            # Look each child up once instead of once for the test and once
            # for the value.
            link = figure.find('a')
            caption = figure.find('figcaption')
            href = link.get('href', 'No href') if link is not None else 'No href'
            figcaption = (
                caption.get_text(strip=True)
                if caption is not None
                else 'No figcaption'
            )
            fig_kb.append(
                LangchainDocument(page_content=figcaption, metadata={"url": href})
            )

        return (text_kb, fig_kb)
    except FileNotFoundError:
        return f"Error: File '{filename}' not found."
    except Exception as e:
        # Errors are reported as strings rather than raised, so existing
        # callers that inspect the return value keep working.
        return f"Error: {str(e)}"
if __name__ == "__main__":
    # Manual smoke run against a locally saved Wikipedia page. The result is
    # either a (text documents, figure documents) tuple or an error string.
    sample_path = "_data/MS Dhoni - Wikipedia.htm"
    contents = read_wiki_html(sample_path)