Update app.py

app.py CHANGED
@@ -5,109 +5,141 @@ import re
 from secrets import token_hex
 import logging
 import os
-from markitdown import MarkItDown…
+from markitdown import MarkItDown
+from typing import Tuple, List, Optional
+import validators

 # Set up logging
-logging.basicConfig(…
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
 logger = logging.getLogger(__name__)

-# Initialize
+# Initialize MarkItDown
 md_converter = MarkItDown()

-def safe_crawl(…
-    """…
+def validate_url(url: str) -> Tuple[bool, str]:
+    """Validate URL format and accessibility."""
+    if not url:
+        return False, "URL is required"
+
+    if not url.startswith(('http://', 'https://')):
+        url = 'https://' + url
+
+    if not validators.url(url):
+        return False, "Invalid URL format"
+
+    return True, url
+
+def safe_crawl(url: str, output_file: str) -> bool:
+    """Safely perform a web crawl with timeout and error handling."""
     try:
-        adv.crawl(…
-        …
+        adv.crawl(
+            url,
+            output_file,
+            follow_links=False,
+            custom_settings={
+                'CLOSESPIDER_TIMEOUT': 30,
+                'ROBOTSTXT_OBEY': True,
+                'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
+                'USER_AGENT': 'Mozilla/5.0 (compatible; LLMContentBot/1.0)',
+                'DOWNLOAD_TIMEOUT': 10
+            }
+        )
         return True
     except Exception as e:
-        logger.error(f"Crawl error: {str(e)}")
+        logger.error(f"Crawl error for {url}: {str(e)}")
         return False

-def …
-    """Process links…
+def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
+    """Process links based on selected types."""
     try:
-        link = crawl_df[f'{col_group}_links_url'].str.split('@@').explode()
-        text = crawl_df[f'{col_group}_links_text'].str.split('@@').explode()
         all_links = []

-        …
+        if "All links" in link_types or not link_types:
+            link_df = adv.crawlytics.links(crawl_df)
+            for link, text in link_df[['link', 'text']].dropna().values:
+                if text := text.strip():
+                    text = re.sub(r'[\n\s]+', ' ', text)
+                    all_links.append(f"## {text}\n[{text}]({link})")
+        else:
+            for link_type in link_types:
+                type_match = re.findall(r"header|footer|nav", link_type.lower())
+                if type_match:
+                    col_prefix = type_match[0]
+                    urls = crawl_df[f'{col_prefix}_links_url'].iloc[0]
+                    texts = crawl_df[f'{col_prefix}_links_text'].iloc[0]
+
+                    if urls and texts:
+                        urls = urls.split('@@')
+                        texts = texts.split('@@')
+
+                        for url, text in zip(urls, texts):
+                            if text := text.strip():
+                                text = re.sub(r'[\n\s]+', ' ', text)
+                                all_links.append(f"## {text}\n[{text}]({url})")

         return "\n\n".join(all_links)
     except Exception as e:
-        logger.error(f"…
+        logger.error(f"Link processing error: {str(e)}")
         return ""

-def process_url(url, link_types):
-    """Process…
-    …
+def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
+    """Process website URL and generate markdown content."""
+    valid, result = validate_url(url)
+    if not valid:
+        return "", result
+
+    url = result
+    output_file = f"crawl_{token_hex(6)}.jsonl"

     try:
-        if not url…
-        …
-        output_file = token_hex(6)
-        jsonl_path = f"{output_file}.jsonl"
-        …
-        try:
-            if not safe_crawl(url, jsonl_path):
-                return "", "Crawl failed or timed out"
-            crawl_df = pd.read_json(jsonl_path, lines=True)
-        finally:
-            if os.path.exists(jsonl_path):
-                os.remove(jsonl_path)
-        …
+        if not safe_crawl(url, output_file):
+            return "", "Crawl failed or timed out"
+
+        crawl_df = pd.read_json(output_file, lines=True)
         if crawl_df.empty:
-            return "", "…
-        # …
-        title = "Untitled"
-        meta_desc = ""
-        …
-        if …
-        …
-            if link_content:
-                all_links.append(link_content)
-                all_links.append('\n\n')
-        else:
-            # Process all links using advertools
-            link_df = adv.crawlytics.links(crawl_df)
-            for link, text in link_df[['link', 'text']].values:
-                if text and text.strip():
-                    text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
-                    text = re.sub(r"\s{3,}", " ", text)
-                    all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))

-        links_text = "\n\n".join(all_links)
-        final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
-        return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
-        …
+            return "", "No data found for the URL"
+
+        # Extract title and description
+        title = crawl_df['title'].iloc[0] if 'title' in crawl_df.columns else "Untitled"
+        meta_desc = crawl_df['meta_desc'].iloc[0] if 'meta_desc' in crawl_df.columns else ""
+
+        # Process links
+        links_content = process_links(crawl_df, link_types)
+
+        # Generate final markdown
+        content = f"# {title}\n\n"
+        if meta_desc:
+            content += f"> {meta_desc}\n\n"
+        content += links_content
+
+        return content, f"Successfully processed {url}"

     except Exception as e:
-        logger.error(f"Error processing…
+        logger.error(f"Error processing {url}: {str(e)}")
         return "", f"Error: {str(e)}"
-    …
+    finally:
+        if os.path.exists(output_file):
+            os.remove(output_file)
+
+def process_file(file: gr.File) -> Tuple[str, str]:
+    """Convert uploaded file to markdown."""
+    if not file:
+        return "", "No file uploaded"
+
+    supported_extensions = {'.pdf', '.docx', '.pptx', '.xlsx', '.html', '.txt'}
+    file_ext = os.path.splitext(file.name)[1].lower()
+
+    if file_ext not in supported_extensions:
+        return "", f"Unsupported file type: {file_ext}"
+
     try:
         result = md_converter.convert(file.name)
-        return result.text_content, "File processed successfully…
+        return result.text_content, "File processed successfully"
     except Exception as e:
-        logger.error(f"…
+        logger.error(f"File processing error: {str(e)}")
         return "", f"Error processing file: {str(e)}"

 # Custom CSS for styling

@@ -145,52 +177,50 @@ theme = gr.themes.Soft(
 )
 )

-…
+# Create interface
+with gr.Blocks(theme=gr.themes.Soft(), css=css) as iface:
+    gr.Markdown("# LLMs.txt Generator")

-    with gr.Tab("Website URL…
+    with gr.Tab("Website URL"):
         url_input = gr.Textbox(
-            label="…
-            placeholder="example…
-            lines=1,
+            label="Website URL",
+            placeholder="example.com"
         )
         link_types = gr.Dropdown(
-            …
-            choices=["<header> links", "<nav> links", "<footer> links", "All links"],
+            choices=["All links", "<header> links", "<nav> links", "<footer> links"],
             multiselect=True,
-            value=["All links"]
+            value=["All links"],
+            label="Link Types to Extract"
         )
-        …
-            label="Generated…
+        url_button = gr.Button("Process URL", variant="primary")
+        url_output = gr.Textbox(
+            label="Generated Content",
             lines=20,
-            show_copy_button=True
-            container=True,
+            show_copy_button=True
         )
-        …
+        url_status = gr.Textbox(label="Status")

-        …
+        url_button.click(
+            process_url,
             inputs=[url_input, link_types],
-            outputs=[…
+            outputs=[url_output, url_status]
         )

-    with gr.Tab("File…
-        file_input = gr.File(label="Upload…
-        …
+    with gr.Tab("File Converter"):
+        file_input = gr.File(label="Upload Document")
+        file_button = gr.Button("Convert to Markdown", variant="primary")
         file_output = gr.Textbox(
-            label="Converted…
+            label="Converted Content",
             lines=20,
-            show_copy_button=True
-            container=True,
+            show_copy_button=True
         )
-        file_status = gr.Textbox(label="Status"…
+        file_status = gr.Textbox(label="Status")

-        …
+        file_button.click(
+            process_file,
             inputs=[file_input],
-            outputs=[file_output, file_status]
+            outputs=[file_output, file_status]
         )
-…
+
 if __name__ == "__main__":
     iface.launch()
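A quick smoke test for the new validate_url helper, a sketch only, assuming the validators package is installed and app.py is importable:

from app import validate_url  # hypothetical import path

print(validate_url("example.com"))  # (True, 'https://example.com'): scheme is prepended
print(validate_url(""))             # (False, 'URL is required')
print(validate_url("not a url"))    # (False, 'Invalid URL format')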
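The custom_settings passed to adv.crawl are standard Scrapy settings, which is what bounds the single-page crawl in safe_crawl. A hypothetical call (the URL and output path are examples only):

ok = safe_crawl('https://example.com', 'crawl_abc123.jsonl')
# follow_links=False keeps the crawl to the one page;
# CLOSESPIDER_TIMEOUT=30 stops the spider after 30 seconds, DOWNLOAD_TIMEOUT=10 caps each request.
print(ok)  # True on success, False (plus a log entry) on failure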
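For context on the split('@@') calls in process_links: advertools stores multi-valued crawl columns such as nav_links_url and nav_links_text as single '@@'-joined strings, one row per crawled page. A minimal sketch of how such a column unpacks, with made-up data (the DataFrame contents here are hypothetical):

import pandas as pd

# Hypothetical one-page crawl result; advertools joins repeated values with '@@'.
crawl_df = pd.DataFrame({
    'nav_links_url': ['https://example.com/@@https://example.com/docs'],
    'nav_links_text': ['Home@@Docs'],
})

urls = crawl_df['nav_links_url'].iloc[0].split('@@')
texts = crawl_df['nav_links_text'].iloc[0].split('@@')
for url, text in zip(urls, texts):
    print(f"[{text}]({url})")
# [Home](https://example.com/)
# [Docs](https://example.com/docs)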