Add preprocessor.py
- app.py +41 -5
- modal_app.py +12 -0
- pyproject.toml +3 -0
- utils/ignore.txt +0 -0
- utils/preprocessor.py +48 -0
- uv.lock +0 -0
app.py
CHANGED
@@ -1,12 +1,48 @@
+import re
+import requests
+from markdownify import markdownify
+from requests.exceptions import RequestException
 import gradio as gr
 
-
-
+# Import the Preprocessor class
+from utils.preprocessor import Preprocessor
+
+def visit_webpage(url, max_output_length=40000):
+    """
+    Fetch the webpage, convert to markdown, and use Preprocessor methods.
+    """
+    try:
+        response = requests.get(url, timeout=20)
+        response.raise_for_status()
+        markdown_content = markdownify(response.text).strip()
+        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
+        if len(markdown_content) > max_output_length:
+            markdown_content = (
+                markdown_content[: max_output_length // 2]
+                + f"\n..._This content has been truncated to stay below {max_output_length} characters_...\n"
+                + markdown_content[-max_output_length // 2 :]
+            )
+        # Use Preprocessor class methods
+        section = Preprocessor.extract_section(markdown_content)
+        dir_paths, files = Preprocessor.extract_dirs_from_text(section)
+        # Format the result
+        result = (
+            f"paths: {dir_paths}\n\n"
+            f"files: {files}"
+        )
+        return result
+    except requests.exceptions.Timeout:
+        return "The request timed out. Please try again later or check the URL."
+    except RequestException as e:
+        return f"Error fetching the webpage: {str(e)}"
+    except Exception as e:
+        return f"An unexpected error occurred: {str(e)}"
 
 demo = gr.Interface(
-    fn=
-    inputs=
-    outputs=
+    fn=visit_webpage,
+    inputs=gr.Textbox(label="Website URL"),
+    outputs=gr.Textbox(label="Extracted Section, Directory Paths, and File Paths"),
+    title="Webpage Section and Path Extractor"
 )
 
 demo.launch()
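visit_webpage keeps oversized pages manageable by dropping the middle of the markdown and keeping the head and tail. A minimal, self-contained sketch of that truncation step, run on a synthetic string (the sample text and the tiny limit are invented for illustration):

    # Sketch of the head-and-tail truncation used in visit_webpage,
    # on invented values small enough to see the effect.
    max_output_length = 20
    markdown_content = "HEAD" + "x" * 100 + "TAIL"

    if len(markdown_content) > max_output_length:
        markdown_content = (
            markdown_content[: max_output_length // 2]     # first 10 chars
            + f"\n..._This content has been truncated to stay below {max_output_length} characters_...\n"
            + markdown_content[-max_output_length // 2 :]  # last 10 chars
        )

    print(markdown_content)  # "HEADxxxxxx" + notice + "xxxxxxTAIL"

Splitting the budget between head and tail preserves context from both ends of the page instead of cutting blindly at max_output_length.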
modal_app.py
CHANGED
@@ -0,0 +1,12 @@
+import modal
+
+app = modal.App("auto-readme-agent")
+
+@app.function()
+def square(x):
+    print("This code is running on a remote worker!")
+    return x**2
+
+@app.local_entrypoint()
+def main():
+    print("the square is", square.remote(42))
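This is Modal's stock hello-world scaffold: `modal run modal_app.py` executes main() on the local machine, and the .remote(42) call dispatches square to a Modal worker. Modal function handles also expose .local() for running the body in-process, which is handy for quick tests; a minimal sketch, assuming the modal package is installed and modal_app.py is importable:

    # Hypothetical smoke test (not part of this commit): .local() runs the
    # function body in the current process, with no remote worker involved.
    from modal_app import square

    if __name__ == "__main__":
        assert square.local(7) == 49
        print("square.local(7) =", square.local(7))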
pyproject.toml
CHANGED
@@ -6,4 +6,7 @@ readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
     "gradio>=5.33.0",
+    "markdownify>=1.1.0",
+    "modal>=1.0.3",
+    "smolagents>=1.17.0",
 ]
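Of the three new dependencies, markdownify and modal are exercised by the files above, while smolagents is pulled in but not yet used anywhere in this commit. If the intent is to hand the extraction flow to an agent later, a hedged sketch of how that wiring might look with smolagents' @tool decorator (the tool name and body are hypothetical, not part of this commit):

    # Hypothetical follow-up, not in this commit: wrapping the extraction flow
    # as a smolagents tool. @tool expects type hints and an Args docstring section.
    import re
    import requests
    from markdownify import markdownify
    from smolagents import tool

    from utils.preprocessor import Preprocessor

    @tool
    def extract_repo_paths(url: str) -> str:
        """Fetch a page and report directory and file paths from its markdown links.

        Args:
            url: The URL of the page to inspect.
        """
        md = markdownify(requests.get(url, timeout=20).text).strip()
        md = re.sub(r"\n{3,}", "\n\n", md)
        section = Preprocessor.extract_section(md)
        dirs, files = Preprocessor.extract_dirs_from_text(section)
        return f"paths: {dirs}\n\nfiles: {files}"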
utils/ignore.txt
ADDED
File without changes
utils/preprocessor.py
ADDED
@@ -0,0 +1,48 @@
+import re
+
+class Preprocessor:
+    """
+    Provides methods to preprocess markdown content:
+    - extract a section after a specific keyword
+    - extract directories from markdown links
+    - extract file paths from markdown links
+    """
+
+    @staticmethod
+    def extract_section(markdown_content, keyword="Latest commit"):
+        """
+        Extract lines starting from the line after the one containing
+        'keyword' up to the next empty line.
+        """
+        lines = markdown_content.splitlines()
+        extract = []
+        found = False
+        for i, line in enumerate(lines):
+            if not found and keyword in line:
+                found = True
+                # Start collecting from the next line
+                start_idx = i + 1
+                while start_idx < len(lines):
+                    next_line = lines[start_idx]
+                    if next_line.strip() == "":
+                        break
+                    extract.append(next_line)
+                    start_idx += 1
+                break
+        return "\n".join(extract) if extract else f"No content found after '{keyword}'."
+
+    @staticmethod
+    def extract_dirs_from_text(text):
+        """
+        Extract unique directory paths from markdown links in text.
+        Only the path part before the filename is kept.
+        """
+        pattern = r'\((/[^ )]+)\s+"([^"]+)"\)'
+        matches = re.findall(pattern, text)
+        dirs = set()
+        files = set()
+        for match in matches:
+            dirs.add(match[0])
+            if not match[1].startswith("."):
+                files.add(match[1])
+        return dirs, files
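Taken together, the two methods slice out the block that follows a "Latest commit" marker and then mine it for link targets. A runnable sketch on synthetic markdown (the sample text is invented, shaped like markdownify output for a GitHub file listing):

    # Demo of Preprocessor on invented markdown; the (path "title") link
    # format mirrors what markdownify produces for a GitHub file table.
    from utils.preprocessor import Preprocessor

    sample = """# some-repo
    Latest commit abc1234
    [app.py](/user/repo/blob/main/app.py "app.py")
    [utils](/user/repo/tree/main/utils "utils")

    Unrelated footer text
    """

    section = Preprocessor.extract_section(sample)
    print(section)
    # [app.py](/user/repo/blob/main/app.py "app.py")
    # [utils](/user/repo/tree/main/utils "utils")

    dirs, files = Preprocessor.extract_dirs_from_text(section)
    print(sorted(dirs))   # ['/user/repo/blob/main/app.py', '/user/repo/tree/main/utils']
    print(sorted(files))  # ['app.py', 'utils']

Note that, despite its docstring, extract_dirs_from_text puts the full link path (filename included) into dirs, and it skips titles that start with a dot, so hidden files like .gitignore never reach files.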
uv.lock
CHANGED
The diff for this file is too large to render.