Spaces:
Sleeping
Sleeping
File size: 2,033 Bytes
2fe32bb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# Document reader: parses a saved HTML page into LangChain documents.
import re
from typing import List, Tuple, Union

from bs4 import BeautifulSoup
from langchain.docstore.document import Document as LangchainDocument
def read_wiki_html(filename: str) -> Union[Tuple[List, List], str]:
    """
    Read a saved HTML page and split it into text and figure documents.

    The file is parsed with BeautifulSoup's ``html.parser``. Only the
    ``<body>`` element is considered: its text becomes a single document and
    every ``<figure>`` inside it becomes one caption document.

    Args:
        filename (str): The path to the HTML file (opened as UTF-8 text).

    Returns:
        On success, a tuple ``(TEXT_KB, FIG_KB)``:
            TEXT_KB: one-element list holding a LangchainDocument whose
                page_content is the body text with runs of newlines
                collapsed to a single newline.
            FIG_KB: list with one LangchainDocument per <figure>;
                page_content is the <figcaption> text (or 'No figcaption')
                and metadata["url"] is the href of the figure's first <a>
                (or 'No href').
        On failure, a string starting with "Error: " (file not found,
        no <body> tag, or any other Exception). Errors are reported through
        the return value, not raised to the caller.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            content = file.read()

        soup = BeautifulSoup(content, 'html.parser')

        # Everything below works on <body> only; <head> content is ignored.
        body = soup.body
        if body is None:
            return "Error: No <body> tag found in the HTML file."

        # get_text() with a newline separator yields many consecutive blank
        # lines; collapse each run into one newline.
        body_text = re.sub(r'\n+', '\n', body.get_text(separator="\n").strip())
        text_kb = [LangchainDocument(page_content=body_text)]

        fig_kb = []
        for figure in body.find_all('figure'):
            # Look each child up once and reuse the result.
            link = figure.find('a')
            caption = figure.find('figcaption')

            href = link.get('href', 'No href') if link is not None else 'No href'
            figcaption = (
                caption.get_text(strip=True)
                if caption is not None
                else 'No figcaption'
            )
            fig_kb.append(
                LangchainDocument(page_content=figcaption, metadata={"url": href})
            )

        return (text_kb, fig_kb)
    except FileNotFoundError:
        return f"Error: File '{filename}' not found."
    except Exception as e:
        # Deliberately broad: callers receive an error string instead of an
        # exception for any parsing or I/O failure.
        return f"Error: {str(e)}"
if __name__ == "__main__":
    # Manual smoke run against a sample saved Wikipedia page. The result is
    # either a (text documents, figure documents) tuple or an "Error: ..."
    # string, depending on whether the file could be read and parsed.
    contents = read_wiki_html("_data/MS Dhoni - Wikipedia.htm")
    # NOTE(review): a read_pdf() call was commented out here; no such function
    # is visible in this section of the file — confirm before re-enabling.