bogeumkim committed on
Commit
b57f297
·
1 Parent(s): 87fb172

Add preprocessor.py

Browse files
Files changed (6) hide show
  1. app.py +41 -5
  2. modal_app.py +12 -0
  3. pyproject.toml +3 -0
  4. utils/ignore.txt +0 -0
  5. utils/preprocessor.py +48 -0
  6. uv.lock +0 -0
app.py CHANGED
@@ -1,12 +1,48 @@
 
 
 
 
1
  import gradio as gr
2
 
3
- def greet(name, intensity):
4
- return "Hello, " + name + "!" * int(intensity)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  demo = gr.Interface(
7
- fn=greet,
8
- inputs=["text", "slider"],
9
- outputs=["text"],
 
10
  )
11
 
12
  demo.launch()
 
1
+ import re
2
+ import requests
3
+ from markdownify import markdownify
4
+ from requests.exceptions import RequestException
5
  import gradio as gr
6
 
7
+ # Import the Preprocessor class
8
+ from utils.preprocessor import Preprocessor
9
+
10
def visit_webpage(url, max_output_length=40000):
    """
    Fetch a webpage, convert it to markdown, and summarize its paths.

    The page at *url* is downloaded, converted to markdown, and (if too
    long) truncated in the middle. The 'Latest commit' section is then
    extracted via Preprocessor and the link paths / file names found in
    it are returned as a formatted string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    max_output_length : int
        Upper bound on the total markdown kept for extraction. The
        truncation marker is counted inside this budget, so the kept
        content never exceeds the bound.

    Returns
    -------
    str
        A "paths: ...\\n\\nfiles: ..." summary on success, or a
        human-readable error message when the request fails.
    """
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        markdown_content = markdownify(response.text).strip()
        # Collapse runs of 3+ newlines so blank-line-delimited section
        # extraction behaves predictably.
        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
        if len(markdown_content) > max_output_length:
            marker = (
                f"\n..._This content has been truncated to stay below "
                f"{max_output_length} characters_...\n"
            )
            # Budget the marker too: the original code kept
            # max_output_length chars *plus* the marker, overshooting
            # the promised limit.
            half = max(0, (max_output_length - len(marker)) // 2)
            # Guard half == 0: a slice [-0:] would return the whole string.
            tail = markdown_content[-half:] if half else ""
            markdown_content = markdown_content[:half] + marker + tail
        # Use Preprocessor class methods
        section = Preprocessor.extract_section(markdown_content)
        dir_paths, files = Preprocessor.extract_dirs_from_text(section)
        # Sort the sets for deterministic display (set iteration order
        # is not stable across runs).
        return f"paths: {sorted(dir_paths)}\n\nfiles: {sorted(files)}"
    except requests.exceptions.Timeout:
        return "The request timed out. Please try again later or check the URL."
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
40
 
41
# Gradio UI: a single URL textbox in, a single textbox out, wired to
# visit_webpage. NOTE(review): the module-level name `demo` appears to be
# the conventional Hugging Face Spaces entry point — do not rename.
demo = gr.Interface(
    fn=visit_webpage,
    inputs=gr.Textbox(label="Website URL"),
    outputs=gr.Textbox(label="Extracted Section, Directory Paths, and File Paths"),
    title="Webpage Section and Path Extractor"
)

# Start the local web server serving the interface.
demo.launch()
modal_app.py CHANGED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import modal

# Modal application container; functions registered on it execute on
# Modal's remote infrastructure rather than the local machine.
app = modal.App("auto-readme-agent")

@app.function()
def square(x):
    # Runs on a remote Modal worker when invoked via .remote().
    print("This code is running on a remote worker!")
    return x**2

@app.local_entrypoint()
def main():
    # Runs locally (e.g. via `modal run`); square.remote(42) dispatches
    # the computation to the remote worker and waits for the result.
    print("the square is", square.remote(42))
pyproject.toml CHANGED
@@ -6,4 +6,7 @@ readme = "README.md"
6
  requires-python = ">=3.11"
7
  dependencies = [
8
  "gradio>=5.33.0",
 
 
 
9
  ]
 
6
  requires-python = ">=3.11"
7
  dependencies = [
8
  "gradio>=5.33.0",
9
+ "markdownify>=1.1.0",
10
+ "modal>=1.0.3",
11
+ "smolagents>=1.17.0",
12
  ]
utils/ignore.txt ADDED
File without changes
utils/preprocessor.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
class Preprocessor:
    """
    Static helpers for preprocessing markdown content:
    - pull out the section that follows a given keyword
    - collect link paths and quoted names from markdown-style links
    """

    @staticmethod
    def extract_section(markdown_content, keyword="Latest commit"):
        """
        Return the lines between the first line containing *keyword*
        and the next blank line (exclusive on both ends), joined with
        newlines. If nothing is collected, return a
        "No content found after '<keyword>'." message instead.
        """
        collected = []
        grabbing = False
        for line in markdown_content.splitlines():
            if grabbing:
                # Stop at the first blank line after the keyword.
                if not line.strip():
                    break
                collected.append(line)
            elif keyword in line:
                # Start collecting from the line *after* the keyword line.
                grabbing = True
        if collected:
            return "\n".join(collected)
        return f"No content found after '{keyword}'."

    @staticmethod
    def extract_dirs_from_text(text):
        """
        Scan *text* for markdown links of the form (/path "name") and
        return a pair of sets: all link paths, and those quoted names
        that do not start with a dot (dotfiles are excluded).
        """
        link_re = re.compile(r'\((/[^ )]+)\s+"([^"]+)"\)')
        paths, names = set(), set()
        for path, name in link_re.findall(text):
            paths.add(path)
            if not name.startswith("."):
                names.add(name)
        return paths, names
uv.lock CHANGED
The diff for this file is too large to render. See raw diff