initial_commit
Browse files- app.py +231 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
import bs4
|
4 |
+
import re
|
5 |
+
import uuid
|
6 |
+
import csv
|
7 |
+
import time
|
8 |
+
import pandas as pd
|
9 |
+
|
10 |
+
class Hits:
    """Accumulates search hits as a nested mapping: repo -> path -> line number -> line text."""

    def __init__(self):
        # Unique sentinels so <mark>...</mark> boundaries survive the
        # HTML-stripping pass in _parse_snippet.
        self.mark_start_placeholder = str(uuid.uuid4())
        self.mark_end_placeholder = str(uuid.uuid4())
        self.hits = {}

    def _parse_snippet(self, snippet):
        """Extract matched lines from a grep.app HTML snippet.

        Returns a dict mapping line numbers (as strings) to plain-text
        line content, with the original <mark> tags replaced by this
        instance's placeholder sentinels.
        """
        matches = {}
        soup = bs4.BeautifulSoup(snippet, 'lxml')
        for row in soup.select('tr'):
            number = row.select("div.lineno")[0].text.strip()
            content = row.select("pre")[0].decode_contents()
            # Only lines that actually contain a highlighted match count.
            if "<mark" not in content:
                continue
            content = re.sub(r'<mark[^<]*>', self.mark_start_placeholder, content)
            content = content.replace("</mark>", self.mark_end_placeholder)
            matches[number] = bs4.BeautifulSoup(content, 'lxml').text
        return matches

    def add_hit(self, repo, path, snippet):
        """Parse *snippet* and record its matched lines under repo/path."""
        bucket = self.hits.setdefault(repo, {}).setdefault(path, {})
        bucket.update(self._parse_snippet(snippet))

    def merge(self, hits2):
        """Fold another Hits instance into this one, line by line."""
        for repo, paths in hits2.hits.items():
            repo_bucket = self.hits.setdefault(repo, {})
            for path, lines in paths.items():
                repo_bucket.setdefault(path, {}).update(lines)
|
48 |
+
|
49 |
+
def fetch_grep_app(page, query, use_regex, whole_words, case_sensitive, repo_filter, path_filter):
    """Fetch one page of results from the grep.app search API.

    Returns a ``(next_page, hits, count)`` tuple:
      * next_page -- the next page number to request, or None when this
        was the last page (or the request failed),
      * hits      -- a Hits instance holding this page's results
        (None on HTTP failure),
      * count     -- the total number of results the API reports for the
        WHOLE query (0 on HTTP failure), not a per-page figure.
    """
    params = {
        'q': query,
        'page': page
    }
    url = "https://grep.app/api/search"

    # regexp and whole-word matching are treated as mutually exclusive;
    # regexp wins when both boxes are ticked.
    if use_regex:
        params['regexp'] = 'true'
    elif whole_words:
        params['words'] = 'true'

    if case_sensitive:
        params['case'] = 'true'
    if repo_filter:
        params['f.repo.pattern'] = repo_filter
    if path_filter:
        params['f.path.pattern'] = path_filter

    # BUG FIX: a timeout guards against the UI hanging forever on a
    # stalled connection (the original request had none).
    response = requests.get(url, params=params, timeout=30)
    if response.status_code != 200:
        return None, None, 0

    data = response.json()
    count = data['facets']['count']
    hits = Hits()
    for hit_data in data['hits']['hits']:
        hits.add_hit(hit_data['repo']['raw'],
                     hit_data['path']['raw'],
                     hit_data['content']['snippet'])

    # grep.app serves 10 hits per page; signal another page if more remain.
    if count > 10 * page:
        return page + 1, hits, count
    else:
        return None, hits, count
|
83 |
+
|
84 |
+
def extract_query_content(line, query):
    """
    Extracts everything after the query string until it encounters a `=`, backtick (`` ` ``), or closing double quote (`"`).
    """
    # Locate the query inside the line; bail out when it is absent.
    idx = line.find(query)
    if idx < 0:
        return ""

    # Everything following the query is the candidate payload.
    tail = line[idx + len(query):]

    # Capture the leading run of characters up to (but excluding) any
    # delimiter: '=', backtick, or double quote.
    captured = re.match(r'[^=`"]+', tail)
    return captured.group(0).strip() if captured else ""
|
101 |
+
|
102 |
+
def search_and_export(query, use_regex, whole_words, case_sensitive, repo_filter, path_filter, progress=gr.Progress()):
    """Run a paged grep.app search, export every hit to CSV, and return
    the file plus a small de-duplicated preview.

    Returns ``(csv_filename, preview_df, results_markdown, note_markdown)``
    matching the four Gradio output components wired to this callback.
    """
    hits = Hits()
    next_page = 1
    total_results = 0
    # grep.app caps pagination at 100 pages; stop there regardless.
    while next_page and next_page < 101:
        progress(next_page / 100, desc="Fetching data...")
        next_page, page_hits, count = fetch_grep_app(
            next_page, query, use_regex, whole_words, case_sensitive, repo_filter, path_filter)
        if page_hits:
            hits.merge(page_hits)
        # BUG FIX: `count` is the API's total for the WHOLE query, not a
        # per-page figure, so assign it instead of accumulating — the
        # original `total_results += count` inflated the reported total
        # by roughly a factor of the number of pages fetched.
        if count:
            total_results = count
        if next_page is None:
            break
        time.sleep(1)  # be polite to the API between pages

    # Export every collected hit to CSV.
    csv_filename = "search_results.csv"
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Repository", "Line Number", "Extracted Query", "Content", "Path"])  # Path moved to the end
        for repo, paths in hits.hits.items():
            for path, lines in paths.items():
                for line_num, content in lines.items():
                    extracted_query = extract_query_content(content, query)
                    writer.writerow([repo, line_num, extracted_query, content, path])  # Path moved to the end

    # Read back for display: de-duplicate on the extracted key, show top 6.
    df = pd.read_csv(csv_filename)
    df_top6_unique = df.drop_duplicates(subset=["Extracted Query"]).head(6)
    return (csv_filename,
            df_top6_unique,
            f"**Total Results: {total_results}**",
            f"Displaying top 6 unique results. Download the CSV for all {total_results} results.")
|
135 |
+
|
136 |
+
# Custom CSS for a modern look.
# Injected into the app via gr.Blocks(css=custom_css); the class names
# target the elem_classes assigned to individual components below.
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
    background: linear-gradient(135deg, #f5f7fa, #c3cfe2);
    padding: 20px;
    border-radius: 10px;
}
.gradio-header {
    text-align: center;
    font-size: 24px;
    font-weight: bold;
    color: #333;
}
.gradio-header h1 {
    text-decoration: underline;
}
.gradio-header p {
    text-decoration: underline;
}
.gradio-inputs {
    background: white;
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gradio-button {
    background: #4CAF50 !important;
    color: white !important;
    border-radius: 5px !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
}
.gradio-button:hover {
    background: #45a049 !important;
}
.gradio-outputs {
    background: white;
    padding: 20px;
    border-radius: 10px;
    margin-top: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gradio-dataframe {
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #ddd;
    border-radius: 5px;
    padding: 10px;
}
.gradio-results-count {
    font-size: 18px;
    font-weight: bold;
    color: #000; /* Black color */
    margin-bottom: 10px;
}
.gradio-download-message {
    font-size: 16px;
    color: #333;
    margin-top: 10px;
}
"""
|
198 |
+
|
199 |
+
# UI using Gradio Blocks: two-column layout — search controls on the
# left, CSV download + preview on the right.
with gr.Blocks(css=custom_css, theme="default") as demo:
    gr.Markdown("""
    <div class="gradio-header">
        <h1>GrepVault: Search Github for API Keys</h1>
        <p><a href="https://github.com/SanshruthR/GrepVault">https://github.com/SanshruthR/GrepVault</a></p>

    </div>
    """)

    with gr.Row():
        # Left column: the six inputs passed to search_and_export.
        with gr.Column(scale=2, elem_classes="gradio-inputs"):
            query = gr.Textbox(label="Search Query", placeholder="Enter your search query for example :generateContent?key=", lines=1)
            use_regex = gr.Checkbox(label="Use Regular Expression", value=False)
            whole_words = gr.Checkbox(label="Match Whole Words", value=False)
            case_sensitive = gr.Checkbox(label="Case Sensitive Search", value=False)
            repo_filter = gr.Textbox(label="Repository Filter", placeholder="e.g., user/repo", lines=1)
            path_filter = gr.Textbox(label="Path Filter", placeholder="e.g., src/", lines=1)
            search_button = gr.Button("Search and Export", elem_classes="gradio-button")

        # Right column: the four outputs returned by search_and_export,
        # in the same order as the outputs= list below.
        with gr.Column(scale=3, elem_classes="gradio-outputs"):
            results_count = gr.Markdown("**Total Results: 0**", elem_classes="gradio-results-count")
            csv_download = gr.File(label="Download CSV")
            csv_preview = gr.Dataframe(label="CSV Preview (Top 6 Unique Results)", headers=["Repository", "Line Number", "Extracted Query", "Content", "Path"], elem_classes="gradio-dataframe")
            download_message = gr.Markdown("Displaying top 6 unique results. Download the CSV for all results.", elem_classes="gradio-download-message")

    # Wire the button to the search callback.
    search_button.click(
        search_and_export,
        inputs=[query, use_regex, whole_words, case_sensitive, repo_filter, path_filter],
        outputs=[csv_download, csv_preview, results_count, download_message]
    )

demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
beautifulsoup4==4.11.1
lxml==4.9.1
gradio
requests
pandas
|