Spaces:
Sleeping
Sleeping
# Document reader: loads a saved Wikipedia HTML page into LangChain documents.
import re
from typing import List, Tuple, Union

from bs4 import BeautifulSoup
from langchain.docstore.document import Document as LangchainDocument
def read_wiki_html(filename: str) -> Union[Tuple[List, List], str]:
    """
    Read an HTML file and build LangChain documents from its <body>.

    The visible text of the <body> tag becomes a single text document, and
    every <figure> tag inside the body becomes one figure document whose
    content is the <figcaption> text and whose metadata holds the href of
    the first <a> tag found in that figure.

    Args:
        filename (str): The path to the HTML file (read as UTF-8).

    Returns:
        On success, a tuple ``(text_kb, fig_kb)``:
            text_kb: one-element list holding a LangchainDocument with the
                body text, runs of newlines collapsed to a single newline.
            fig_kb: list of LangchainDocument objects, one per <figure>;
                ``page_content`` is the caption (or 'No figcaption') and
                ``metadata["url"]`` is the link target (or 'No href').
        On failure, a string starting with "Error: " describing the problem
        (missing file, missing <body> tag, or any other exception). Callers
        must check for a ``str`` result before unpacking.
    """
    # The broad try/except is deliberate: this function reports every failure
    # as an "Error: ..." string instead of raising, and callers may rely on it.
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read()

        # Parse the HTML content and focus only on the <body> tag.
        soup = BeautifulSoup(content, 'html.parser')
        body = soup.body
        if body is None:
            return "Error: No <body> tag found in the HTML file."

        # One text node per line, then squeeze consecutive blank lines.
        body_text = re.sub(r'\n+', '\n', body.get_text(separator="\n").strip())
        text_kb = [LangchainDocument(page_content=body_text)]

        # One document per <figure>: caption as content, link as metadata.
        fig_kb = []
        for figure in body.find_all('figure'):
            # Look each child up once instead of once for the test and once
            # for the value.
            link = figure.find('a')
            caption = figure.find('figcaption')

            href = link.get('href', 'No href') if link is not None else 'No href'
            figcaption = (
                caption.get_text(strip=True)
                if caption is not None
                else 'No figcaption'
            )
            fig_kb.append(
                LangchainDocument(page_content=figcaption, metadata={"url": href})
            )

        return (text_kb, fig_kb)
    except FileNotFoundError:
        return f"Error: File '{filename}' not found."
    except Exception as e:
        return f"Error: {str(e)}"
if __name__ == "__main__":
    # Manual run: parse a locally saved Wikipedia page and keep the result
    # (a (text, figures) tuple, or an "Error: ..." string) in `contents`.
    contents = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")
    # read_pdf()