Spaces:
Build error
Build error
import argparse | |
import os | |
import requests | |
from bs4 import BeautifulSoup | |
from urllib.parse import urljoin | |
from html2text import html2text | |
from pathlib import Path | |
def is_writable_path(target_path): | |
""" | |
Check if a path is writable. | |
""" | |
path = Path(os.path.dirname(target_path)) | |
if path.is_dir(): | |
if os.access(path, os.W_OK): | |
return target_path | |
else: | |
raise argparse.ArgumentTypeError(f"Directory '{path}' is not writable.") | |
else: | |
raise argparse.ArgumentTypeError(f"Directory '{path}' does not exist.") | |
def main(url, markdown_path): | |
# Create a session object | |
with requests.Session() as session: | |
# Send HTTP request to the specified URL | |
response = session.get(url) | |
response.raise_for_status() # Check for HTTP issues | |
# Create a BeautifulSoup object and specify the parser | |
soup = BeautifulSoup(response.text, 'html.parser') | |
# Ensure the directory for saving images exists | |
os.makedirs("./logs", exist_ok=True) | |
# Find all image tags and save images | |
for image in soup.find_all('img'): | |
image_url = urljoin(url, image['src']) | |
try: | |
image_response = session.get(image_url, stream=True) | |
image_response.raise_for_status() | |
image_name = os.path.join("./logs", os.path.basename(image_url)) | |
with open(image_name, 'wb') as file: | |
file.write(image_response.content) | |
except requests.RequestException as e: | |
print(f"Failed to download {image_url}: {e}") | |
# Convert the HTML content to markdown | |
markdown_content = html2text(response.text) | |
# Save the markdown content to a file | |
try: | |
with open(markdown_path, "w", encoding="utf8") as file: | |
file.write(markdown_content) | |
print(f"Markdown content successfully written to {markdown_path}") | |
except Exception as e: | |
print(f"Failed to write markdown to {markdown_path}: {e}") | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser(description="Convert HTML to Markdown") | |
parser.add_argument("url", help="The URL of the webpage to convert") | |
parser.add_argument("markdown_path", help="The path to save the converted markdown file", type=is_writable_path) | |
args = parser.parse_args() | |
main(args.url, args.markdown_path) | |