# app.py
import gradio as gr
import pandas as pd
import requests
import io
import dask.dataframe as dd
from datasets import load_dataset
from PIL import Image as PILImage  # PIL image type, used to detect decoded images for the gallery
from mlcroissant import Dataset as CroissantDataset
from huggingface_hub import get_token
import polars as pl
import warnings
import traceback
import json
import tempfile  # for creating the temporary CSV/XLSX files offered as downloads
from itertools import islice  # for capping streamed Croissant records

# 🀫 Let's ignore those pesky warnings, shall we?
warnings.filterwarnings("ignore")

# --- βš™οΈ Configuration & Constants ---
DATASET_CONFIG = {
    "caselaw": {
        "name": "common-pile/caselaw_access_project", "emoji": "βš–οΈ",
        "methods": ["πŸ’¨ API (requests)", "🧊 Dask", "πŸ₯ Croissant"], "is_public": True,
    },
    "prompts": {
        "name": "fka/awesome-chatgpt-prompts", "emoji": "πŸ€–",
        "methods": ["🐼 Pandas", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": True,
    },
    "finance": {
        "name": "snorkelai/agent-finance-reasoning", "emoji": "πŸ’°",
        "methods": ["🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
    },
    "medical": {
        "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
        "methods": ["🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
    },
    "inscene": {
        "name": "peteromallet/InScene-Dataset", "emoji": "πŸ–ΌοΈ",
        "methods": ["πŸ€— Datasets", "🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
    },
}
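
# To add another dataset to the explorer, register an entry with the same keys as above.
# A minimal sketch -- "someorg/some-dataset" is a placeholder, not a real repo:
#
#   DATASET_CONFIG["example"] = {
#       "name": "someorg/some-dataset", "emoji": "πŸ“š",
#       "methods": ["🐼 Pandas", "πŸ’¨ API (requests)"], "is_public": True,
#   }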

# --- πŸ”§ Helpers & Utility Functions ---

def get_auth_headers():
    token = get_token()
    return {"Authorization": f"Bearer {token}"} if token else {}

# --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
def dataframe_to_outputs(df: pd.DataFrame):
    """
    πŸ“œ Takes a DataFrame and transforms it into various formats.
    Now uses temporary files for maximum Gradio compatibility.
    """
    if df.empty:
        return "No results found. 🀷", None, None, "No results to copy."

    df_str = df.astype(str)
    markdown_output = df_str.to_markdown(index=False)
    
    # Create a temporary CSV file
    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv:
        df.to_csv(tmp_csv.name, index=False)
        csv_path = tmp_csv.name

    # Create a temporary XLSX file
    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx:
        df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl')
        xlsx_path = tmp_xlsx.name

    tab_delimited_output = df.to_csv(sep='\t', index=False)
    
    return (
        markdown_output,
        csv_path,
        xlsx_path,
        tab_delimited_output,
    )

def handle_error(e: Exception, request=None, response=None):
    """
    😱 Oh no! An error! This function now creates a detailed debug log.
    """
    error_message = f"🚨 An error occurred: {str(e)}\n"
    auth_tip = "πŸ”‘ For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."
    full_trace = traceback.format_exc()
    print(full_trace)
    if "401" in str(e) or "Gated" in str(e):
        error_message += auth_tip
    
    debug_log = f"""--- 🐞 DEBUG LOG ---\nTraceback:\n{full_trace}\n\nException Type: {type(e).__name__}\nException Details: {e}\n"""
    if request:
        debug_log += f"""\n--- REQUEST ---\nMethod: {request.method}\nURL: {request.url}\nHeaders: {json.dumps(dict(request.headers), indent=2)}\n"""
    if response is not None:
        try:
            response_text = json.dumps(response.json(), indent=2)
        except json.JSONDecodeError:
            response_text = response.text
        debug_log += f"""\n--- RESPONSE ---\nStatus Code: {response.status_code}\nHeaders: {json.dumps(dict(response.headers), indent=2)}\nContent:\n{response_text}\n"""
    
    return (
        pd.DataFrame(), gr.Gallery(None), f"### 🚨 Error\nAn error occurred. See the debug log below for details.",
        "", None, None, "", f"```python\n# 🚨 Error during execution:\n# {e}\n```",
        gr.Code(value=debug_log, visible=True)
    )

def search_dataframe(df: pd.DataFrame, query: str):
    if not query:
        return df.head(100)
    string_cols = df.select_dtypes(include=['object', 'string']).columns
    if string_cols.empty:
        return pd.DataFrame()
    mask = pd.Series(False, index=df.index)
    for col in string_cols:
        mask |= df[col].astype(str).str.contains(query, case=False, na=False)
    return df[mask]
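
# Example with a tiny, hypothetical frame:
#   search_dataframe(pd.DataFrame({"act": ["Linux Terminal", "Chef"]}), "linux")
# keeps only the "Linux Terminal" row, since matching is substring-based and case-insensitive;
# an empty query returns the first 100 rows as a sample.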

def generate_code_snippet(dataset_key: str, access_method: str, query: str):
    """
    πŸ’» Generate Python code snippet for the current operation
    """
    config = DATASET_CONFIG[dataset_key]
    repo_id = config["name"]
    
    if "API" in access_method:
        return f'''# 🌐 API Access for {repo_id}
import requests
import pandas as pd

url = "https://datasets-server.huggingface.co/rows"
params = {{
    "dataset": "{repo_id}",
    "config": "default",
    "split": "train",
    "offset": 0,
    "length": 100
}}

headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}} if needed else {{}}
response = requests.get(url, params=params, headers=headers)

if response.status_code == 200:
    data = response.json()
    rows_data = [item['row'] for item in data['rows']]
    df = pd.json_normalize(rows_data)
    
    # Search for: "{query}"
    if "{query}":
        string_cols = df.select_dtypes(include=['object', 'string']).columns
        mask = pd.Series([False] * len(df))
        for col in string_cols:
            mask |= df[col].astype(str).str.contains("{query}", case=False, na=False)
        df = df[mask]
    
    print(f"Found {{len(df)}} results")
    print(df.head())
else:
    print(f"Error: {{response.status_code}} - {{response.text}}")
'''
    
    elif "Pandas" in access_method:
        file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet"
        return f'''# 🐼 Pandas Access for {repo_id}
import pandas as pd

# You may need: huggingface-cli login
df = pd.read_{"csv" if "csv" in file_path else "parquet"}("hf://datasets/{repo_id}/{file_path}")

# Search for: "{query}"
if "{query}":
    string_cols = df.select_dtypes(include=['object', 'string']).columns
    mask = pd.Series([False] * len(df))
    for col in string_cols:
        mask |= df[col].astype(str).str.contains("{query}", case=False, na=False)
    df = df[mask]

print(f"Found {{len(df)}} results")
print(df.head())
'''
    
    elif "Datasets" in access_method:
        return f'''# πŸ€— Datasets Library Access for {repo_id}
from datasets import load_dataset
import pandas as pd

# You may need: huggingface-cli login
ds = load_dataset("{repo_id}", split="train", streaming=True)
data = list(ds.take(1000))
df = pd.DataFrame(data)

# Search for: "{query}"
if "{query}":
    string_cols = df.select_dtypes(include=['object', 'string']).columns
    mask = pd.Series([False] * len(df))
    for col in string_cols:
        mask |= df[col].astype(str).str.contains("{query}", case=False, na=False)
    df = df[mask]

print(f"Found {{len(df)}} results")
print(df.head())
'''
    
    else:
        return f"# Code generation for {access_method} not implemented yet"

# --- 🎣 Data Fetching & Processing Functions ---
def fetch_data(dataset_key: str, access_method: str, query: str):
    """
    πŸš€ Main mission control. Always yields a tuple of 9 values to match the UI components.
    """
    outputs = [pd.DataFrame(), None, "🏁 Ready.", "", None, None, "", "", gr.Code(visible=False)]
    req, res = None, None
    try:
        config = DATASET_CONFIG[dataset_key]
        repo_id = config["name"]
        
        # Generate code snippet
        code_snippet = generate_code_snippet(dataset_key, access_method, query)
        outputs[7] = code_snippet
        
        if "API" in access_method:
            all_results_df = pd.DataFrame()
            MAX_PAGES = 5
            PAGE_SIZE = 100

            if not query:
                MAX_PAGES = 1
                outputs[2] = "⏳ No search term. Fetching first 100 records as a sample..."
                yield tuple(outputs)

            for page in range(MAX_PAGES):
                if query:
                    outputs[2] = f"⏳ Searching page {page + 1}..."
                    yield tuple(outputs)
                
                offset = page * PAGE_SIZE
                url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
                headers = get_auth_headers() if not config["is_public"] else {}
                
                res = requests.get(url, headers=headers)
                req = res.request
                res.raise_for_status()
                data = res.json()

                if not data.get('rows'):
                    outputs[2] = "🏁 No more data to search."
                    yield tuple(outputs)
                    break

                # --- ✨ FIXED: JSON processing logic ---
                # Extract the actual data from the 'row' key of each item in the list
                rows_data = [item['row'] for item in data['rows']]
                page_df = pd.json_normalize(rows_data)
                
                found_in_page = search_dataframe(page_df, query)

                if not found_in_page.empty:
                    all_results_df = pd.concat([all_results_df, found_in_page]).reset_index(drop=True)
                    outputs[0] = all_results_df
                    outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
                    outputs[2] = f"βœ… Found **{len(all_results_df)}** results so far..."
                    
                    if dataset_key == 'inscene':
                        gallery_data = [(row['image'], row.get('text', '')) for _, row in all_results_df.iterrows() if isinstance(row.get('image'), PILImage.Image)]
                        outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
                    yield tuple(outputs)

            outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
            yield tuple(outputs)
            return

        outputs[2] = f"⏳ Loading data via `{access_method}`..."
        yield tuple(outputs)
        
        df = pd.DataFrame()
        
        if "Pandas" in access_method:
            file_path = f"hf://datasets/{repo_id}/"
            if repo_id == "fka/awesome-chatgpt-prompts": 
                file_path += "prompts.csv"
                df = pd.read_csv(file_path)
            else:
                # The gated datasets store their data under different file names,
                # so fall back through the most common layouts.
                try:
                    df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet")
                except Exception:
                    try:
                        df = pd.read_parquet(f"{file_path}train.parquet")
                    except Exception:
                        df = pd.read_json(f"{file_path}medical_o1_sft.json")
                         
        elif "Datasets" in access_method:
            ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
            df = pd.DataFrame(ds)
            
        elif "Polars" in access_method:
            outputs[2] = "⏳ Loading with Polars..."
            yield tuple(outputs)
            if repo_id == "fka/awesome-chatgpt-prompts":
                pl_df = pl.read_csv(f"hf://datasets/{repo_id}/prompts.csv")
            else:
                pl_df = pl.read_parquet(f"hf://datasets/{repo_id}/train.parquet")
            df = pl_df.to_pandas()
            
        elif "Dask" in access_method:
            outputs[2] = "⏳ Loading with Dask..."
            yield tuple(outputs)
            dask_df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz")
            df = dask_df.head(1000)  # Convert to pandas for processing
            
        elif "Croissant" in access_method:
            outputs[2] = "⏳ Loading with Croissant..."
            yield tuple(outputs)
            headers = get_auth_headers() if not config["is_public"] else {}
            croissant_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
            response = requests.get(croissant_url, headers=headers)
            response.raise_for_status()
            jsonld = response.json()
            ds = CroissantDataset(jsonld=jsonld)
            records = list(islice(ds.records("default"), 1000))  # take at most the first 1000 records without materializing the full set
            df = pd.DataFrame(records)
        
        outputs[2] = "πŸ” Searching loaded data..."
        yield tuple(outputs)

        final_df = search_dataframe(df, query)
        
        outputs[0] = final_df
        outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
        outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
        
        if dataset_key == 'inscene' and not final_df.empty:
            gallery_data = [(row['image'], row.get('text', '')) for _, row in final_df.iterrows() if isinstance(row.get('image'), PILImage.Image)]
            outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
        
        yield tuple(outputs)

    except Exception as e:
        yield handle_error(e, req, res)


# --- πŸ–ΌοΈ UI Generation ---
def create_dataset_tab(dataset_key: str):
    config = DATASET_CONFIG[dataset_key]
    
    with gr.Tab(f"{config['emoji']} {dataset_key.capitalize()}"):
        gr.Markdown(f"## {config['emoji']} Query the `{config['name']}` Dataset")
        if not config['is_public']:
            gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")
        
        with gr.Row():
            access_method = gr.Radio(config['methods'], label="πŸ”‘ Access Method", value=config['methods'][0])
            query = gr.Textbox(label="πŸ” Search Query", placeholder="Enter any text to search, or leave blank for samples...")
        
        fetch_button = gr.Button("πŸš€ Go Fetch!")
        status_output = gr.Markdown("🏁 Ready to search.")
        df_output = gr.DataFrame(label="πŸ“Š Results DataFrame", interactive=False, wrap=True)
        gallery_output = gr.Gallery(visible=(dataset_key == 'inscene'), label="πŸ–ΌοΈ Image Results")

        with gr.Accordion("πŸ“‚ View/Export Full Results", open=False):
            markdown_output = gr.Markdown(label="πŸ“ Markdown View")
            with gr.Row():
                csv_output = gr.File(label="⬇️ Download CSV")
                xlsx_output = gr.File(label="⬇️ Download XLSX")
            copy_output = gr.Code(label="πŸ“‹ Copy-Paste (Tab-Delimited)")
        
        code_output = gr.Code(label="πŸ’» Python Code Snippet", language="python")
        
        debug_log_output = gr.Code(label="🐞 Debug Log", visible=False)
        
        fetch_button.click(
            fn=fetch_data,
            inputs=[gr.State(dataset_key), access_method, query],
            outputs=[
                df_output, gallery_output, status_output, markdown_output,
                csv_output, xlsx_output, copy_output, code_output,
                debug_log_output
            ]
        )

# --- πŸš€ Main App ---
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
    gr.Markdown("# πŸ€— Hugging Face Dataset Explorer")
    gr.Markdown(
        "Select a dataset, choose an access method, and type a query. "
        "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
    )
    
    with gr.Accordion("πŸ”§ Quick Start Guide", open=False):
        gr.Markdown("""
        ### πŸš€ Quick Start:
        1. **πŸ€– Prompts Tab**: Try API method, search for "translator" or "linux"
        2. **βš–οΈ Caselaw Tab**: Try API method, search for "contract" or "court"
        3. **πŸ’° Finance Tab**: Requires login, try API method first
        4. **🩺 Medical Tab**: Requires login, try API method first  
        5. **πŸ–ΌοΈ InScene Tab**: Requires login, try Datasets method for images
        
        ### πŸ”‘ Authentication:
        For gated datasets, run in terminal: `huggingface-cli login`
        
        ### πŸ› οΈ Methods:
        - **πŸ’¨ API**: Fast, reliable, works without login (100 rows max)
        - **🐼 Pandas**: Full dataset access, requires login for gated datasets
        - **πŸ€— Datasets**: Good for streaming large datasets
        - **🧊 Polars/Dask**: Alternative fast data processing
        - **πŸ₯ Croissant**: Metadata-aware loading
        """)
    
    with gr.Tabs():
        for key in DATASET_CONFIG.keys():
            create_dataset_tab(key)

if __name__ == "__main__":
    demo.launch(debug=True)