Sanshruth committed on
Commit
8ba948f
·
verified ·
1 Parent(s): 52863cc

initial_commit

Browse files
Files changed (2) hide show
  1. app.py +231 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import bs4
4
+ import re
5
+ import uuid
6
+ import csv
7
+ import time
8
+ import pandas as pd
9
+
10
class Hits:
    """Accumulator for grep.app search hits, organised as repo -> path -> line."""

    def __init__(self):
        # Unique sentinel strings used to mark <mark>...</mark> boundaries so
        # they survive a round-trip through the HTML parser.
        self.mark_start_placeholder = str(uuid.uuid4())
        self.mark_end_placeholder = str(uuid.uuid4())
        # {repo: {path: {line_number(str): line_text}}}
        self.hits = {}

    def _parse_snippet(self, snippet):
        """Return {line_number: plain_text} for snippet lines containing a <mark> hit."""
        matches = {}
        soup = bs4.BeautifulSoup(snippet, 'lxml')
        for tr in soup.select('tr'):
            line_num = tr.select("div.lineno")[0].text.strip()
            line = tr.select("pre")[0].decode_contents()
            if "<mark" not in line:
                continue
            # Swap the <mark> tags for sentinels, strip all remaining markup,
            # then remove the sentinels again.  BUG FIX: the sentinels were
            # previously left embedded in the text, so the exported CSV content
            # (and extract_query_content's input) contained raw UUID noise
            # around every matched substring.
            line = re.sub(r'<mark[^<]*>', self.mark_start_placeholder, line)
            line = line.replace("</mark>", self.mark_end_placeholder)
            line = bs4.BeautifulSoup(line, 'lxml').text
            line = line.replace(self.mark_start_placeholder, "")
            line = line.replace(self.mark_end_placeholder, "")
            matches[line_num] = line
        return matches

    def add_hit(self, repo, path, snippet):
        """Record every matching line of `snippet` under repo/path."""
        file_hits = self.hits.setdefault(repo, {}).setdefault(path, {})
        for line_num, line in self._parse_snippet(snippet).items():
            file_hits[line_num] = line

    def merge(self, hits2):
        """Fold another Hits instance into this one (incoming values win per line)."""
        for hit_repo, path_data in hits2.hits.items():
            for path, lines in path_data.items():
                self.hits.setdefault(hit_repo, {}).setdefault(path, {}).update(lines)
48
+
49
def fetch_grep_app(page, query, use_regex, whole_words, case_sensitive, repo_filter, path_filter):
    """
    Fetch one page of results from the grep.app search API.

    Returns a tuple ``(next_page, hits, count)`` where ``next_page`` is the
    next page number or ``None`` when there are no more pages (or the request
    failed), ``hits`` is a ``Hits`` instance (``None`` on failure), and
    ``count`` is the total result count reported by the API (0 on failure).
    """
    params = {
        'q': query,
        'page': page
    }
    url = "https://grep.app/api/search"

    # 'regexp' and 'words' are mutually exclusive on the API side.
    if use_regex:
        params['regexp'] = 'true'
    elif whole_words:
        params['words'] = 'true'

    if case_sensitive:
        params['case'] = 'true'
    if repo_filter:
        params['f.repo.pattern'] = repo_filter
    if path_filter:
        params['f.path.pattern'] = path_filter

    try:
        # BUG FIX: a timeout prevents a stalled connection from hanging the UI
        # forever, and network errors now map onto the existing failure return
        # instead of propagating an unhandled exception.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException:
        return None, None, 0
    if response.status_code != 200:
        return None, None, 0

    data = response.json()
    count = data['facets']['count']
    hits = Hits()
    for hit_data in data['hits']['hits']:
        repo = hit_data['repo']['raw']
        path = hit_data['path']['raw']
        snippet = hit_data['content']['snippet']
        hits.add_hit(repo, path, snippet)

    # grep.app serves 10 hits per page; paginate while more remain.
    if count > 10 * page:
        return page + 1, hits, count
    else:
        return None, hits, count
83
+
84
def extract_query_content(line, query):
    """
    Return the text that immediately follows `query` in `line`, truncated at
    the first `=`, backtick, or double quote, with surrounding whitespace
    stripped.  Returns "" when `query` does not occur in `line`.
    """
    anchor = line.find(query)
    if anchor == -1:
        # The query string is absent from this line.
        return ""

    # Everything after the first occurrence of the query string.
    tail = line[anchor + len(query):]

    # Capture up to (not including) the first '=', '`', or '"' character.
    captured = re.match(r'[^=`"]+', tail)
    return captured.group(0).strip() if captured else ""
101
+
102
def search_and_export(query, use_regex, whole_words, case_sensitive, repo_filter, path_filter, progress=gr.Progress()):
    """
    Run a paged grep.app search, export every hit to a CSV file, and return
    ``(csv_path, top6_unique_dataframe, results_markdown, download_message)``
    for the Gradio outputs.
    """
    hits = Hits()
    next_page = 1
    total_results = 0
    # Stop after 100 pages (the API caps useful pagination).
    while next_page and next_page < 101:
        progress(next_page / 100, desc="Fetching data...")
        next_page, page_hits, count = fetch_grep_app(next_page, query, use_regex, whole_words, case_sensitive, repo_filter, path_filter)
        if page_hits:
            hits.merge(page_hits)
            # BUG FIX: `count` is the TOTAL hit count reported on *every*
            # page, so it must be assigned, not accumulated — the previous
            # `total_results += count` multiplied the true total by the
            # number of pages fetched.
            total_results = count
        if next_page is None:
            break
        time.sleep(1)  # throttle requests to be polite to the API

    # Export to CSV
    csv_filename = "search_results.csv"
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Repository", "Line Number", "Extracted Query", "Content", "Path"])  # Path moved to the end
        for repo, paths in hits.hits.items():
            for path, lines in paths.items():
                for line_num, content in lines.items():
                    extracted_query = extract_query_content(content, query)
                    writer.writerow([repo, line_num, extracted_query, content, path])  # Path moved to the end

    # Read CSV into a DataFrame for display; de-duplicate on the extracted
    # key and keep only the first 6 rows for the on-page preview.
    df = pd.read_csv(csv_filename)
    df_unique = df.drop_duplicates(subset=["Extracted Query"])
    df_top6_unique = df_unique.head(6)
    return csv_filename, df_top6_unique, f"**Total Results: {total_results}**", f"Displaying top 6 unique results. Download the CSV for all {total_results} results."
135
+
136
# Custom CSS for a modern look.
# NOTE: the .gradio-* class names below are attached to specific components
# further down via `elem_classes`; renaming one here requires updating the
# matching component as well.
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
    background: linear-gradient(135deg, #f5f7fa, #c3cfe2);
    padding: 20px;
    border-radius: 10px;
}
.gradio-header {
    text-align: center;
    font-size: 24px;
    font-weight: bold;
    color: #333;
}
.gradio-header h1 {
    text-decoration: underline;
}
.gradio-header p {
    text-decoration: underline;
}
.gradio-inputs {
    background: white;
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gradio-button {
    background: #4CAF50 !important;
    color: white !important;
    border-radius: 5px !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
}
.gradio-button:hover {
    background: #45a049 !important;
}
.gradio-outputs {
    background: white;
    padding: 20px;
    border-radius: 10px;
    margin-top: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gradio-dataframe {
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #ddd;
    border-radius: 5px;
    padding: 10px;
}
.gradio-results-count {
    font-size: 18px;
    font-weight: bold;
    color: #000; /* Black color */
    margin-bottom: 10px;
}
.gradio-download-message {
    font-size: 16px;
    color: #333;
    margin-top: 10px;
}
"""
198
+
199
# UI using Gradio Blocks
with gr.Blocks(css=custom_css, theme="default") as demo:
    # Page header: title and project link (styled by .gradio-header in custom_css).
    gr.Markdown("""
    <div class="gradio-header">
    <h1>GrepVault: Search Github for API Keys</h1>
    <p><a href="https://github.com/SanshruthR/GrepVault">https://github.com/SanshruthR/GrepVault</a></p>

    </div>
    """)

    with gr.Row():
        # Left column: search query plus the option flags/filters that map
        # 1:1 onto the parameters of search_and_export.
        with gr.Column(scale=2, elem_classes="gradio-inputs"):
            query = gr.Textbox(label="Search Query", placeholder="Enter your search query for example :generateContent?key=", lines=1)
            use_regex = gr.Checkbox(label="Use Regular Expression", value=False)
            whole_words = gr.Checkbox(label="Match Whole Words", value=False)
            case_sensitive = gr.Checkbox(label="Case Sensitive Search", value=False)
            repo_filter = gr.Textbox(label="Repository Filter", placeholder="e.g., user/repo", lines=1)
            path_filter = gr.Textbox(label="Path Filter", placeholder="e.g., src/", lines=1)
            search_button = gr.Button("Search and Export", elem_classes="gradio-button")

        # Right column: result count, CSV download, and a preview table.
        with gr.Column(scale=3, elem_classes="gradio-outputs"):
            results_count = gr.Markdown("**Total Results: 0**", elem_classes="gradio-results-count")
            csv_download = gr.File(label="Download CSV")
            csv_preview = gr.Dataframe(label="CSV Preview (Top 6 Unique Results)", headers=["Repository", "Line Number", "Extracted Query", "Content", "Path"], elem_classes="gradio-dataframe")
            download_message = gr.Markdown("Displaying top 6 unique results. Download the CSV for all results.", elem_classes="gradio-download-message")

    # Wire the button to the search pipeline; output order matches the
    # 4-tuple returned by search_and_export.
    search_button.click(
        search_and_export,
        inputs=[query, use_regex, whole_words, case_sensitive, repo_filter, path_filter],
        outputs=[csv_download, csv_preview, results_count, download_message]
    )

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ beautifulsoup4==4.11.1
2
+ lxml==4.9.1
3
+ gradio
4
+ requests
5
+ pandas