initial_commit
Browse files- app.py +231 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
import bs4
|
4 |
+
import re
|
5 |
+
import uuid
|
6 |
+
import csv
|
7 |
+
import time
|
8 |
+
import pandas as pd
|
9 |
+
|
10 |
+
class Hits:
    """Accumulates search hits as a nested mapping: repo -> path -> line number -> line text."""

    def __init__(self):
        # Unique sentinels so <mark>...</mark> boundaries survive the
        # HTML-stripping pass in _parse_snippet.
        self.mark_start_placeholder = str(uuid.uuid4())
        self.mark_end_placeholder = str(uuid.uuid4())
        self.hits = {}

    def _parse_snippet(self, snippet):
        """Extract matched lines from a grep.app HTML snippet.

        Returns a dict mapping line numbers (as strings) to plain-text
        line content, with the original <mark> tags replaced by this
        instance's placeholder sentinels.
        """
        matches = {}
        soup = bs4.BeautifulSoup(snippet, 'lxml')
        for row in soup.select('tr'):
            number = row.select("div.lineno")[0].text.strip()
            content = row.select("pre")[0].decode_contents()
            # Only lines that actually contain a highlighted match count.
            if "<mark" not in content:
                continue
            content = re.sub(r'<mark[^<]*>', self.mark_start_placeholder, content)
            content = content.replace("</mark>", self.mark_end_placeholder)
            matches[number] = bs4.BeautifulSoup(content, 'lxml').text
        return matches

    def add_hit(self, repo, path, snippet):
        """Parse *snippet* and record its matched lines under repo/path."""
        bucket = self.hits.setdefault(repo, {}).setdefault(path, {})
        bucket.update(self._parse_snippet(snippet))

    def merge(self, hits2):
        """Fold another Hits instance into this one, line by line."""
        for repo, paths in hits2.hits.items():
            repo_bucket = self.hits.setdefault(repo, {})
            for path, lines in paths.items():
                repo_bucket.setdefault(path, {}).update(lines)
|
48 |
+
|
49 |
+
def fetch_grep_app(page, query, use_regex, whole_words, case_sensitive, repo_filter, path_filter):
    """Fetch one page of results from the grep.app search API.

    Returns a ``(next_page, hits, count)`` tuple:
      * next_page -- the next page number to request, or None when this
        was the last page (or the request failed),
      * hits      -- a Hits instance holding this page's results
        (None on HTTP failure),
      * count     -- the total number of results the API reports for the
        WHOLE query (0 on HTTP failure), not a per-page figure.
    """
    params = {
        'q': query,
        'page': page
    }
    url = "https://grep.app/api/search"

    # regexp and whole-word matching are treated as mutually exclusive;
    # regexp wins when both boxes are ticked.
    if use_regex:
        params['regexp'] = 'true'
    elif whole_words:
        params['words'] = 'true'

    if case_sensitive:
        params['case'] = 'true'
    if repo_filter:
        params['f.repo.pattern'] = repo_filter
    if path_filter:
        params['f.path.pattern'] = path_filter

    # BUG FIX: a timeout guards against the UI hanging forever on a
    # stalled connection (the original request had none).
    response = requests.get(url, params=params, timeout=30)
    if response.status_code != 200:
        return None, None, 0

    data = response.json()
    count = data['facets']['count']
    hits = Hits()
    for hit_data in data['hits']['hits']:
        hits.add_hit(hit_data['repo']['raw'],
                     hit_data['path']['raw'],
                     hit_data['content']['snippet'])

    # grep.app serves 10 hits per page; signal another page if more remain.
    if count > 10 * page:
        return page + 1, hits, count
    else:
        return None, hits, count
|
83 |
+
|
84 |
+
def extract_query_content(line, query):
    """
    Extracts everything after the query string until it encounters a `=`, backtick (`` ` ``), or closing double quote (`"`).
    """
    # Locate the query inside the line; bail out when it is absent.
    idx = line.find(query)
    if idx < 0:
        return ""

    # Everything following the query is the candidate payload.
    tail = line[idx + len(query):]

    # Capture the leading run of characters up to (but excluding) any
    # delimiter: '=', backtick, or double quote.
    captured = re.match(r'[^=`"]+', tail)
    return captured.group(0).strip() if captured else ""
|
101 |
+
|
102 |
+
def search_and_export(query, use_regex, whole_words, case_sensitive, repo_filter, path_filter, progress=gr.Progress()):
    """Run a paged grep.app search, export every hit to CSV, and return
    the file plus a small de-duplicated preview.

    Returns ``(csv_filename, preview_df, results_markdown, note_markdown)``
    matching the four Gradio output components wired to this callback.
    """
    hits = Hits()
    next_page = 1
    total_results = 0
    # grep.app caps pagination at 100 pages; stop there regardless.
    while next_page and next_page < 101:
        progress(next_page / 100, desc="Fetching data...")
        next_page, page_hits, count = fetch_grep_app(
            next_page, query, use_regex, whole_words, case_sensitive, repo_filter, path_filter)
        if page_hits:
            hits.merge(page_hits)
        # BUG FIX: `count` is the API's total for the WHOLE query, not a
        # per-page figure, so assign it instead of accumulating — the
        # original `total_results += count` inflated the reported total
        # by roughly a factor of the number of pages fetched.
        if count:
            total_results = count
        if next_page is None:
            break
        time.sleep(1)  # be polite to the API between pages

    # Export every collected hit to CSV.
    csv_filename = "search_results.csv"
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Repository", "Line Number", "Extracted Query", "Content", "Path"])  # Path moved to the end
        for repo, paths in hits.hits.items():
            for path, lines in paths.items():
                for line_num, content in lines.items():
                    extracted_query = extract_query_content(content, query)
                    writer.writerow([repo, line_num, extracted_query, content, path])  # Path moved to the end

    # Read back for display: de-duplicate on the extracted key, show top 6.
    df = pd.read_csv(csv_filename)
    df_top6_unique = df.drop_duplicates(subset=["Extracted Query"]).head(6)
    return (csv_filename,
            df_top6_unique,
            f"**Total Results: {total_results}**",
            f"Displaying top 6 unique results. Download the CSV for all {total_results} results.")
|
135 |
+
|
136 |
+
# Custom CSS for a modern look.
# Injected into the app via gr.Blocks(css=custom_css); the class names
# target the elem_classes assigned to individual components below.
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
    background: linear-gradient(135deg, #f5f7fa, #c3cfe2);
    padding: 20px;
    border-radius: 10px;
}
.gradio-header {
    text-align: center;
    font-size: 24px;
    font-weight: bold;
    color: #333;
}
.gradio-header h1 {
    text-decoration: underline;
}
.gradio-header p {
    text-decoration: underline;
}
.gradio-inputs {
    background: white;
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gradio-button {
    background: #4CAF50 !important;
    color: white !important;
    border-radius: 5px !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
}
.gradio-button:hover {
    background: #45a049 !important;
}
.gradio-outputs {
    background: white;
    padding: 20px;
    border-radius: 10px;
    margin-top: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gradio-dataframe {
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #ddd;
    border-radius: 5px;
    padding: 10px;
}
.gradio-results-count {
    font-size: 18px;
    font-weight: bold;
    color: #000; /* Black color */
    margin-bottom: 10px;
}
.gradio-download-message {
    font-size: 16px;
    color: #333;
    margin-top: 10px;
}
"""
|
198 |
+
|
199 |
+
# UI using Gradio Blocks: two-column layout — search controls on the
# left, CSV download + preview on the right.
with gr.Blocks(css=custom_css, theme="default") as demo:
    gr.Markdown("""
    <div class="gradio-header">
        <h1>GrepVault: Search Github for API Keys</h1>
        <p><a href="https://github.com/SanshruthR/GrepVault">https://github.com/SanshruthR/GrepVault</a></p>

    </div>
    """)

    with gr.Row():
        # Left column: the six inputs passed to search_and_export.
        with gr.Column(scale=2, elem_classes="gradio-inputs"):
            query = gr.Textbox(label="Search Query", placeholder="Enter your search query for example :generateContent?key=", lines=1)
            use_regex = gr.Checkbox(label="Use Regular Expression", value=False)
            whole_words = gr.Checkbox(label="Match Whole Words", value=False)
            case_sensitive = gr.Checkbox(label="Case Sensitive Search", value=False)
            repo_filter = gr.Textbox(label="Repository Filter", placeholder="e.g., user/repo", lines=1)
            path_filter = gr.Textbox(label="Path Filter", placeholder="e.g., src/", lines=1)
            search_button = gr.Button("Search and Export", elem_classes="gradio-button")

        # Right column: the four outputs returned by search_and_export,
        # in the same order as the outputs= list below.
        with gr.Column(scale=3, elem_classes="gradio-outputs"):
            results_count = gr.Markdown("**Total Results: 0**", elem_classes="gradio-results-count")
            csv_download = gr.File(label="Download CSV")
            csv_preview = gr.Dataframe(label="CSV Preview (Top 6 Unique Results)", headers=["Repository", "Line Number", "Extracted Query", "Content", "Path"], elem_classes="gradio-dataframe")
            download_message = gr.Markdown("Displaying top 6 unique results. Download the CSV for all results.", elem_classes="gradio-download-message")

    # Wire the button to the search callback.
    search_button.click(
        search_and_export,
        inputs=[query, use_regex, whole_words, case_sensitive, repo_filter, path_filter],
        outputs=[csv_download, csv_preview, results_count, download_message]
    )

demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
beautifulsoup4==4.11.1
lxml==4.9.1
gradio
requests
pandas
|