cyberandy committed on
Commit 0221da4 · verified · 1 Parent(s): 2b3088a

Update app.py

Files changed (1)
  1. app.py +134 -104
app.py CHANGED
@@ -5,109 +5,141 @@ import re
  from secrets import token_hex
  import logging
  import os
- from markitdown import MarkItDown # Import MarkItDown

  # Set up logging
- logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- # Initialize the MarkItDown converter
  md_converter = MarkItDown()

- def safe_crawl(url, output_file):
-     """Safely perform a web crawl with a timeout"""
      try:
-         adv.crawl(url, output_file,
-                   follow_links=False, # Only crawl the main page
-                   custom_settings={'CLOSESPIDER_TIMEOUT': 30}) # 30-second timeout
          return True
      except Exception as e:
-         logger.error(f"Crawl error: {str(e)}")
          return False

- def explode_link_df(crawl_df, col_group):
-     """Process links from a specific column group in the crawl dataframe"""
      try:
-         link = crawl_df[f'{col_group}_links_url'].str.split('@@').explode()
-         text = crawl_df[f'{col_group}_links_text'].str.split('@@').explode()
          all_links = []

-         for link, text in zip(link.dropna(), text.dropna()):
-             if text and text.strip():
-                 text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
-                 text = re.sub(r"\s{3,}", " ", text)
-                 all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))

          return "\n\n".join(all_links)
      except Exception as e:
-         logger.error(f"Error processing {col_group} links: {str(e)}")
          return ""

- def process_url(url, link_types):
-     """Process a website URL and generate the llms.txt content"""
-     if not url:
-         return "", "Please enter a URL"

      try:
-         if not url.startswith(("http://", "https://")):
-             url = "https://" + url
-
-         # Generate a unique filename for this crawl
-         output_file = token_hex(6)
-         jsonl_path = f"{output_file}.jsonl"
-
-         try:
-             if not safe_crawl(url, jsonl_path):
-                 return "", "Crawl failed or timed out"
-             crawl_df = pd.read_json(jsonl_path, lines=True)
-         finally:
-             if os.path.exists(jsonl_path):
-                 os.remove(jsonl_path)
-
          if crawl_df.empty:
-             return "", "Crawl produced no data for the URL."
-
-         # Use default values if the expected columns are missing or empty
-         title = "Untitled"
-         meta_desc = ""
-         if 'title' in crawl_df.columns and not pd.isna(crawl_df['title'].iloc[0]):
-             title = crawl_df['title'].iloc[0]
-         if 'meta_desc' in crawl_df.columns and not pd.isna(crawl_df['meta_desc'].iloc[0]):
-             meta_desc = crawl_df['meta_desc'].iloc[0]
-
-         all_links = []
-         if link_types and "All links" not in link_types:
-             for link_type in link_types:
-                 type_match = re.findall(r"header|footer|nav", link_type)
-                 if type_match:
-                     link_content = explode_link_df(crawl_df, type_match[0])
-                     if link_content:
-                         all_links.append(link_content)
-                         all_links.append('\n\n')
-         else:
-             # Process all links using advertools
-             link_df = adv.crawlytics.links(crawl_df)
-             for link, text in link_df[['link', 'text']].values:
-                 if text and text.strip():
-                     text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
-                     text = re.sub(r"\s{3,}", " ", text)
-                     all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))

-         links_text = "\n\n".join(all_links)
-         final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
-         return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
-
      except Exception as e:
-         logger.error(f"Error processing URL {url}: {str(e)}")
          return "", f"Error: {str(e)}"
-
- def process_file(file):
-     """Convert an uploaded file into Markdown using MarkItDown"""
      try:
          result = md_converter.convert(file.name)
-         return result.text_content, "File processed successfully."
      except Exception as e:
-         logger.error(f"Error processing file {file.name}: {str(e)}")
          return "", f"Error processing file: {str(e)}"

  # Custom CSS for styling
@@ -145,52 +177,50 @@ theme = gr.themes.Soft(
      )
  )

- with gr.Blocks(theme=theme, css=css) as iface:
-     gr.Markdown("# Generate an `llms.txt` file for GenAI Search and Agents")

-     with gr.Tab("Website URL Processing"):
          url_input = gr.Textbox(
-             label="Enter the home page of a website:",
-             placeholder="example: https://example.com",
-             lines=1,
          )
          link_types = gr.Dropdown(
-             label="Select types of links to extract (leave empty to get all links)",
-             choices=["<header> links", "<nav> links", "<footer> links", "All links"],
              multiselect=True,
-             value=["All links"]
          )
-         generate_btn = gr.Button("Submit", variant="primary", elem_classes=["primary-btn"])
-         output = gr.Textbox(
-             label="Generated llms.txt Content",
              lines=20,
-             show_copy_button=True,
-             container=True,
          )
-         status = gr.Textbox(label="Status", interactive=False)

-         generate_btn.click(
-             fn=process_url,
              inputs=[url_input, link_types],
-             outputs=[output, status],
          )

-     with gr.Tab("File to Markdown Converter"):
-         file_input = gr.File(label="Upload a file (e.g., PDF, DOCX, PPTX, etc.)")
-         convert_btn = gr.Button("Convert to Markdown", variant="primary", elem_classes=["primary-btn"])
          file_output = gr.Textbox(
-             label="Converted Markdown (llms.txt content)",
              lines=20,
-             show_copy_button=True,
-             container=True,
          )
-         file_status = gr.Textbox(label="Status", interactive=False)

-         convert_btn.click(
-             fn=process_file,
              inputs=[file_input],
-             outputs=[file_output, file_status],
          )
-
  if __name__ == "__main__":
      iface.launch()
 
  from secrets import token_hex
  import logging
  import os
+ from markitdown import MarkItDown
+ from typing import Tuple, List, Optional
+ import validators

  # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
  logger = logging.getLogger(__name__)

+ # Initialize MarkItDown
  md_converter = MarkItDown()

+ def validate_url(url: str) -> Tuple[bool, str]:
+     """Validate URL format and accessibility."""
+     if not url:
+         return False, "URL is required"
+
+     if not url.startswith(('http://', 'https://')):
+         url = 'https://' + url
+
+     if not validators.url(url):
+         return False, "Invalid URL format"
+
+     return True, url
+
+ def safe_crawl(url: str, output_file: str) -> bool:
+     """Safely perform a web crawl with timeout and error handling."""
      try:
+         adv.crawl(
+             url,
+             output_file,
+             follow_links=False,
+             custom_settings={
+                 'CLOSESPIDER_TIMEOUT': 30,
+                 'ROBOTSTXT_OBEY': True,
+                 'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
+                 'USER_AGENT': 'Mozilla/5.0 (compatible; LLMContentBot/1.0)',
+                 'DOWNLOAD_TIMEOUT': 10
+             }
+         )
          return True
      except Exception as e:
+         logger.error(f"Crawl error for {url}: {str(e)}")
          return False

+ def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
+     """Process links based on selected types."""
      try:
          all_links = []

+         if "All links" in link_types or not link_types:
+             link_df = adv.crawlytics.links(crawl_df)
+             for link, text in link_df[['link', 'text']].dropna().values:
+                 if text := text.strip():
+                     text = re.sub(r'[\n\s]+', ' ', text)
+                     all_links.append(f"## {text}\n[{text}]({link})")
+         else:
+             for link_type in link_types:
+                 type_match = re.findall(r"header|footer|nav", link_type.lower())
+                 if type_match:
+                     col_prefix = type_match[0]
+                     urls = crawl_df[f'{col_prefix}_links_url'].iloc[0]
+                     texts = crawl_df[f'{col_prefix}_links_text'].iloc[0]
+
+                     if urls and texts:
+                         urls = urls.split('@@')
+                         texts = texts.split('@@')
+
+                         for url, text in zip(urls, texts):
+                             if text := text.strip():
+                                 text = re.sub(r'[\n\s]+', ' ', text)
+                                 all_links.append(f"## {text}\n[{text}]({url})")

          return "\n\n".join(all_links)
      except Exception as e:
+         logger.error(f"Link processing error: {str(e)}")
          return ""

+ def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
+     """Process website URL and generate markdown content."""
+     valid, result = validate_url(url)
+     if not valid:
+         return "", result
+
+     url = result
+     output_file = f"crawl_{token_hex(6)}.jsonl"

      try:
+         if not safe_crawl(url, output_file):
+             return "", "Crawl failed or timed out"
+
+         crawl_df = pd.read_json(output_file, lines=True)
          if crawl_df.empty:
+             return "", "No data found for the URL"
+
+         # Extract title and description
+         title = crawl_df['title'].iloc[0] if 'title' in crawl_df.columns else "Untitled"
+         meta_desc = crawl_df['meta_desc'].iloc[0] if 'meta_desc' in crawl_df.columns else ""
+
+         # Process links
+         links_content = process_links(crawl_df, link_types)
+
+         # Generate final markdown
+         content = f"# {title}\n\n"
+         if meta_desc:
+             content += f"> {meta_desc}\n\n"
+         content += links_content
+
+         return content, f"Successfully processed {url}"

      except Exception as e:
+         logger.error(f"Error processing {url}: {str(e)}")
          return "", f"Error: {str(e)}"
+     finally:
+         if os.path.exists(output_file):
+             os.remove(output_file)
+
+ def process_file(file: gr.File) -> Tuple[str, str]:
+     """Convert uploaded file to markdown."""
+     if not file:
+         return "", "No file uploaded"
+
+     supported_extensions = {'.pdf', '.docx', '.pptx', '.xlsx', '.html', '.txt'}
+     file_ext = os.path.splitext(file.name)[1].lower()
+
+     if file_ext not in supported_extensions:
+         return "", f"Unsupported file type: {file_ext}"
+
      try:
          result = md_converter.convert(file.name)
+         return result.text_content, "File processed successfully"
      except Exception as e:
+         logger.error(f"File processing error: {str(e)}")
          return "", f"Error processing file: {str(e)}"

  # Custom CSS for styling
 
      )
  )

+ # Create interface
+ with gr.Blocks(theme=gr.themes.Soft(), css=css) as iface:
+     gr.Markdown("# LLMs.txt Generator")

+     with gr.Tab("Website URL"):
          url_input = gr.Textbox(
+             label="Website URL",
+             placeholder="example.com"
          )
          link_types = gr.Dropdown(
+             choices=["All links", "<header> links", "<nav> links", "<footer> links"],
              multiselect=True,
+             value=["All links"],
+             label="Link Types to Extract"
          )
+         url_button = gr.Button("Process URL", variant="primary")
+         url_output = gr.Textbox(
+             label="Generated Content",
              lines=20,
+             show_copy_button=True
          )
+         url_status = gr.Textbox(label="Status")

+         url_button.click(
+             process_url,
              inputs=[url_input, link_types],
+             outputs=[url_output, url_status]
          )

+     with gr.Tab("File Converter"):
+         file_input = gr.File(label="Upload Document")
+         file_button = gr.Button("Convert to Markdown", variant="primary")
          file_output = gr.Textbox(
+             label="Converted Content",
              lines=20,
+             show_copy_button=True
          )
+         file_status = gr.Textbox(label="Status")

+         file_button.click(
+             process_file,
              inputs=[file_input],
+             outputs=[file_output, file_status]
          )
+
  if __name__ == "__main__":
      iface.launch()
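
A quick way to exercise the new helpers outside the Gradio UI (a minimal sketch, not part of the commit; it assumes app.py and its dependencies, including advertools, pandas, gradio, markitdown and validators, are importable, and that the test URL is reachable):

# smoke_test.py (hypothetical) -- run from the same directory as app.py
from app import validate_url, process_url

ok, normalized = validate_url("example.com")
print(ok, normalized)            # expected: True https://example.com

content, status = process_url("example.com", ["All links"])
print(status)                    # e.g. "Successfully processed https://example.com"
print(content[:300])             # beginning of the generated llms.txt content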