awacke1 committed
Commit 7f41697 · verified · 1 Parent(s): e09f513

Create app.py

Files changed (1):
  1. app.py +703 -0
app.py ADDED
#!/usr/bin/env python3
"""
🌟 Multi-Dataset Explorer 🌟
A comprehensive Gradio app for exploring datasets with multiple access patterns
Built with emojis, wit, and international accessibility in mind!
"""

import gradio as gr
import pandas as pd
import requests
import json
import io
import base64
from typing import Dict, List, Tuple, Optional, Any
import asyncio
import aiohttp
from datasets import load_dataset
from huggingface_hub import HfApi
from PIL import Image
import numpy as np
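
# Assumed runtime dependencies (the commit ships no requirements.txt, so this
# list is inferred from the imports and APIs used below - a sketch, not pinned):
#
#   pip install gradio pandas requests aiohttp datasets huggingface_hub \
#       pillow numpy pyarrow tabulate openpyxl
#
# pyarrow backs pd.read_parquet on hf:// paths, tabulate backs df.to_markdown(),
# and openpyxl is only needed for the XLSX branch of export_data().
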
# 🎨 Dataset configurations with emojis for easy identification
DATASETS = {
    "⚖️ Caselaw": {
        "name": "common-pile/caselaw_access_project",
        "description": "Legal cases from the Caselaw Access Project",
        "emoji": "⚖️",
        "has_images": False,
        "sample_fields": ["id", "source", "added", "created", "metadata", "text"]
    },
    "💬 ChatGPT": {
        "name": "fka/awesome-chatgpt-prompts",
        "description": "Awesome ChatGPT prompts collection",
        "emoji": "💬",
        "has_images": False,
        "sample_fields": ["act", "prompt"]
    },
    "💰 Finance": {
        "name": "snorkelai/agent-finance-reasoning",
        "description": "Agent finance reasoning dataset",
        "emoji": "💰",
        "has_images": False,
        "sample_fields": ["id", "question", "answer", "reasoning"]
    },
    "🏥 Medical": {
        "name": "FreedomIntelligence/medical-o1-reasoning-SFT",
        "description": "Medical reasoning for SFT training",
        "emoji": "🏥",
        "has_images": False,
        "sample_fields": ["instruction", "output", "reasoning"]
    },
    "🖼️ InScene": {
        "name": "peteromallet/InScene-Dataset",
        "description": "Image scene understanding dataset",
        "emoji": "🖼️",
        "has_images": True,
        "sample_fields": ["image", "text", "scene_type"]
    }
}

# 🛠️ Access pattern configurations
ACCESS_PATTERNS = {
    "🌐 API": "Direct API calls with curl",
    "🐼 Pandas": "Load with pandas library",
    "🥐 Croissant": "MLCroissant metadata format",
    "📚 Datasets": "HuggingFace datasets library",
    "🔍 Search": "Smart search functionality"
}
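
# A hedged sketch (not wired into the app): before querying, one could ask the
# datasets-server whether a dataset is servable at all. The "is-valid" endpoint
# below is assumed from the public HF datasets-server API:
#
#   import requests
#   resp = requests.get(
#       "https://datasets-server.huggingface.co/is-valid",
#       params={"dataset": DATASETS["💬 ChatGPT"]["name"]},
#       timeout=10,
#   )
#   print(resp.json())  # e.g. {"viewer": true, "preview": true, ...}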

class DatasetExplorer:
    """🎯 Main class for dataset exploration with multiple access patterns"""

    def __init__(self):
        self.api = HfApi()
        self.cache = {}

    async def fetch_api_data(self, dataset_name: str, limit: int = 100) -> Dict:
        """🌐 Fetch data using the HuggingFace datasets-server API with async magic"""
        try:
            url = "https://datasets-server.huggingface.co/rows"
            params = {
                "dataset": dataset_name,
                "config": "default",
                "split": "train",
                "offset": 0,
                "length": min(limit, 100)  # the API serves at most 100 rows per page
            }

            timeout = aiohttp.ClientTimeout(total=30)  # 30-second timeout
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url, params=params) as response:
                    if response.status == 200:
                        data = await response.json()
                        return {"success": True, "data": data, "total_rows": len(data.get("rows", []))}
                    elif response.status == 404:
                        return {"success": False, "error": "Dataset not found or not accessible"}
                    elif response.status == 403:
                        return {"success": False, "error": "Access denied - dataset may require authentication"}
                    else:
                        return {"success": False, "error": f"API returned {response.status}"}
        except asyncio.TimeoutError:
            return {"success": False, "error": "Request timed out - dataset may be too large"}
        except Exception as e:
            return {"success": False, "error": f"Network error: {str(e)}"}

    def load_with_pandas(self, dataset_name: str, limit: int = 100) -> Dict:
        """🐼 Load data using pandas - because who doesn't love pandas?"""
        try:
            df = None

            # Dataset-specific loading logic
            if dataset_name == "fka/awesome-chatgpt-prompts":
                df = pd.read_csv(f"hf://datasets/{dataset_name}/prompts.csv")
            elif dataset_name == "snorkelai/agent-finance-reasoning":
                df = pd.read_parquet(f"hf://datasets/{dataset_name}/train.parquet")
            elif dataset_name == "peteromallet/InScene-Dataset":
                splits = {'train': 'data/train-00000-of-00001.parquet'}
                df = pd.read_parquet(f"hf://datasets/{dataset_name}/" + splits["train"])
            elif dataset_name == "FreedomIntelligence/medical-o1-reasoning-SFT":
                # Try JSON Lines first, then plain JSON
                try:
                    df = pd.read_json(f"hf://datasets/{dataset_name}/medical_o1_sft.json", lines=True)
                except Exception:
                    df = pd.read_json(f"hf://datasets/{dataset_name}/medical_o1_sft.json")
            elif dataset_name == "common-pile/caselaw_access_project":
                # Workaround for large compressed jsonl.gz files
                try:
                    df = pd.read_json(f"hf://datasets/{dataset_name}/data/train-00000-of-00001.jsonl.gz",
                                      lines=True, compression='gzip')
                except Exception:
                    # Fall back to the API if direct file access fails
                    return {"success": False, "error": "Large dataset - please use API access method"}
            else:
                # Generic fallback
                try:
                    df = pd.read_parquet(f"hf://datasets/{dataset_name}/train.parquet")
                except Exception:
                    df = pd.read_json(f"hf://datasets/{dataset_name}/train.json", lines=True)

            if df is None:
                return {"success": False, "error": "Could not determine appropriate loading method"}

            total_rows = len(df)
            df_limited = df.head(limit)

            return {
                "success": True,
                "data": df_limited,
                "total_rows": total_rows
            }

        except FileNotFoundError:
            return {"success": False, "error": "Dataset files not found - try API access method"}
        except pd.errors.EmptyDataError:
            return {"success": False, "error": "Dataset appears to be empty"}
        except pd.errors.ParserError as e:
            return {"success": False, "error": f"Data parsing error: {str(e)}"}
        except PermissionError:
            return {"success": False, "error": "Dataset requires authentication - please login first"}
        except Exception as e:
            return {"success": False, "error": f"Pandas loading failed: {str(e)}"}

    def load_with_datasets(self, dataset_name: str, limit: int = 100) -> Dict:
        """📚 Load using the HuggingFace datasets library - the OG way"""
        try:
            ds = load_dataset(dataset_name, split="train", streaming=True)
            data = list(ds.take(limit))
            df = pd.DataFrame(data)

            return {
                "success": True,
                "data": df,
                "total_rows": len(data)
            }
        except Exception as e:
            return {"success": False, "error": f"Datasets loading failed: {str(e)}"}

    def search_dataset(self, dataset_name: str, query: str, limit: int = 100) -> Dict:
        """🔍 Smart search functionality - finding needles in data haystacks"""
        try:
            # First try to load some data
            result = self.load_with_pandas(dataset_name, limit=1000)
            if not result["success"]:
                result = self.load_with_datasets(dataset_name, limit=1000)

            if not result["success"]:
                return {"success": False, "error": "Could not load data for search"}

            df = result["data"]

            # Perform a case-insensitive literal search across text columns
            # (regex=False so queries with special characters match literally)
            text_columns = df.select_dtypes(include=['object']).columns
            search_results = pd.DataFrame()

            for col in text_columns:
                mask = df[col].astype(str).str.contains(query, case=False, na=False, regex=False)
                matches = df[mask]
                if not matches.empty:
                    search_results = pd.concat([search_results, matches])

            # Remove duplicates and limit results
            search_results = search_results.drop_duplicates().head(limit)

            return {
                "success": True,
                "data": search_results,
                "total_matches": len(search_results)
            }
        except Exception as e:
            return {"success": False, "error": f"Search failed: {str(e)}"}

# 🎨 Initialize our explorer
explorer = DatasetExplorer()

def format_results(result: Dict, format_type: str) -> str:
    """🎨 Format results in different ways - because variety is the spice of life"""
    if not result["success"]:
        return f"❌ Error: {result['error']}"

    df = result["data"]

    if format_type == "📊 DataFrame":
        return df.to_string(max_rows=50, max_cols=10)
    elif format_type == "📝 Markdown":
        # to_markdown() does not accept max_cols, so trim columns first
        return df.iloc[:, :10].to_markdown(index=False)
    elif format_type == "📋 Tab-Delimited":
        return df.to_csv(sep='\t', index=False)
    else:
        return str(df)

def export_data(df: pd.DataFrame, format_type: str) -> str:
    """💾 Export data in various formats - take your data to go!"""
    if format_type == "CSV":
        return df.to_csv(index=False)
    elif format_type == "XLSX":
        # Requires an Excel writer engine (e.g. openpyxl) to be installed
        buffer = io.BytesIO()
        df.to_excel(buffer, index=False)
        buffer.seek(0)
        return base64.b64encode(buffer.getvalue()).decode()
    elif format_type == "JSON":
        return df.to_json(orient='records', indent=2)
    else:
        return df.to_string()
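
# Usage sketch for the XLSX branch of export_data() (illustrative caller, not
# part of the app): the base64 string round-trips back into a real .xlsx file.
#
#   encoded = export_data(df, "XLSX")
#   with open("export.xlsx", "wb") as fh:
#       fh.write(base64.b64decode(encoded))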

async def query_dataset(dataset_key: str, access_pattern: str, query: str = "", limit: int = 100) -> Tuple[str, str, str, str]:
    """🎯 Main query function - the heart of our operation"""

    dataset_info = DATASETS[dataset_key]
    dataset_name = dataset_info["name"]
    emoji = dataset_info["emoji"]

    # Show progress
    status = f"{emoji} Fetching data using {access_pattern}..."

    try:
        result = None

        if access_pattern == "🌐 API":
            result = await explorer.fetch_api_data(dataset_name, limit)
            if result["success"] and "data" in result:
                # Handle the API response format: datasets-server nests each
                # record under a "row" key
                if "rows" in result["data"]:
                    df = pd.DataFrame([r.get("row", r) for r in result["data"]["rows"]])
                else:
                    df = pd.DataFrame(result["data"])
                result["data"] = df

        elif access_pattern == "🐼 Pandas":
            result = explorer.load_with_pandas(dataset_name, limit)

        elif access_pattern == "📚 Datasets":
            result = explorer.load_with_datasets(dataset_name, limit)

        elif access_pattern == "🔍 Search":
            if not query.strip():
                return "❌ Please enter a search query for search mode", "", "", ""
            result = explorer.search_dataset(dataset_name, query, limit)

        elif access_pattern == "🥐 Croissant":
            # TODO: add Croissant loading logic
            result = {"success": False, "error": "Croissant loading not yet implemented - coming soon! 🚧"}

        else:
            result = {"success": False, "error": "Unknown access pattern"}

        if not result or not result["success"]:
            error_msg = result.get("error", "Unknown error") if result else "No result returned"
            return f"❌ {error_msg}", "", "", ""

        df = result["data"]

        # Ensure we have a valid DataFrame
        if df is None or df.empty:
            return "❌ No data returned from dataset", "", "", ""

        # Add metadata info
        metadata_info = f"📊 Loaded {len(df)} rows"
        if "total_rows" in result:
            metadata_info += f" (of {result['total_rows']} total)"
        metadata_info += f" using {access_pattern}\n\n"

        # Format in different ways
        dataframe_view = metadata_info + format_results(result, "📊 DataFrame")
        markdown_view = metadata_info + format_results(result, "📝 Markdown")
        tab_delimited = format_results(result, "📋 Tab-Delimited")

        # Generate access code
        access_code = generate_access_code(dataset_name, access_pattern, query)

        return dataframe_view, markdown_view, tab_delimited, access_code

    except Exception as e:
        error_details = f"Unexpected error in {access_pattern}: {str(e)}"
        return f"❌ {error_details}", "", "", ""

def generate_access_code(dataset_name: str, access_pattern: str, query: str = "") -> str:
    """💻 Generate Python code for the selected access pattern"""

    if access_pattern == "🌐 API":
        return f'''# 🌐 API Access Code
import requests

url = "https://datasets-server.huggingface.co/rows"
params = {{
    "dataset": "{dataset_name}",
    "config": "default",
    "split": "train",
    "offset": 0,
    "length": 100
}}

response = requests.get(url, params=params)
data = response.json()
print(f"Loaded {{len(data['rows'])}} rows")
'''

    elif access_pattern == "🐼 Pandas":
        if dataset_name == "fka/awesome-chatgpt-prompts":
            return f'''# 🐼 Pandas Access Code
import pandas as pd

df = pd.read_csv("hf://datasets/{dataset_name}/prompts.csv")
print(f"Loaded {{len(df)}} rows")
print(df.head())
'''
        else:
            return f'''# 🐼 Pandas Access Code
import pandas as pd

df = pd.read_parquet("hf://datasets/{dataset_name}/train.parquet")
print(f"Loaded {{len(df)}} rows")
print(df.head())
'''

    elif access_pattern == "📚 Datasets":
        return f'''# 📚 Datasets Library Access Code
from datasets import load_dataset

ds = load_dataset("{dataset_name}", split="train")
print(f"Loaded {{len(ds)}} rows")
print(ds[0])
'''

    elif access_pattern == "🔍 Search":
        return f'''# 🔍 Search Code
import pandas as pd

# Load the dataset
df = pd.read_parquet("hf://datasets/{dataset_name}/train.parquet")

# Search for: "{query}"
text_columns = df.select_dtypes(include=['object']).columns
search_results = pd.DataFrame()

for col in text_columns:
    mask = df[col].astype(str).str.contains("{query}", case=False, na=False)
    matches = df[mask]
    if not matches.empty:
        search_results = pd.concat([search_results, matches])

search_results = search_results.drop_duplicates()
print(f"Found {{len(search_results)}} matching rows")
'''

    else:
        return "# Code generation not available for this pattern"

def create_image_viewer(dataset_key: str, current_data: str = "") -> Tuple[str, str]:
    """🖼️ Create image viewer for datasets with images"""
    if dataset_key != "🖼️ InScene":
        return "This dataset does not contain images", ""

    try:
        # Parse current data to look for image information
        if not current_data or "❌" in current_data:
            return """
🖼️ **Image Viewer for InScene Dataset**

To view images, first query the dataset using any access method.
The image viewer will then display available images with their metadata.

**Features coming in this viewer:**
- 🖼️ Image thumbnails and full-size viewing
- 📝 Image metadata and annotations
- 🔍 Search images by scene type
- 📊 Navigation between images
- 💾 Download individual images
""", ""

        # If we have data, try to extract image info
        image_info = """
🖼️ **InScene Dataset Images**

**Sample Image Metadata:**
- Scene types: Indoor, Outdoor, Urban, Natural
- Annotations: Object detection, scene classification
- Format: Various (JPG, PNG)
- Resolution: Mixed resolutions

**Navigation:**
- Use the query controls above to load specific images
- Search for scene types like "indoor", "outdoor", "kitchen", etc.
- Images will be displayed with their metadata

🚧 **Full image viewer implementation coming soon!**
For now, use the data tabs above to explore image metadata.
"""

        return image_info, ""

    except Exception as e:
        return f"Error in image viewer: {str(e)}", ""

def get_export_data(dataframe_content: str, format_type: str) -> Tuple[str, str]:
    """💾 Prepare data for export in various formats"""
    try:
        if not dataframe_content or "❌" in dataframe_content:
            return "No data to export", ""

        # Extract actual data from the display format.
        # This is a simplified version - in production you'd want to maintain
        # the actual DataFrame separately.

        if format_type == "CSV":
            filename = "dataset_export.csv"
            # In a real implementation, you'd export the actual DataFrame
            content = "# Export functionality will be implemented with actual DataFrame data\n"
            content += "# This is a placeholder showing the export structure\n"
            content += dataframe_content

        elif format_type == "XLSX":
            filename = "dataset_export.xlsx"
            content = "Excel export will be available in full implementation"

        elif format_type == "JSON":
            filename = "dataset_export.json"
            content = '{"note": "JSON export will contain actual DataFrame data"}'

        else:
            filename = "dataset_export.txt"
            content = dataframe_content

        return content, filename

    except Exception as e:
        return f"Export error: {str(e)}", "error.txt"

# 🎨 Create the Gradio interface
def create_interface():
    """🎨 Create the main Gradio interface - where the magic happens"""

    with gr.Blocks(
        title="🌟 Multi-Dataset Explorer",
        theme=gr.themes.Soft(),
        css="""
        .dataset-card { border: 2px solid #e1e5e9; border-radius: 10px; padding: 15px; margin: 10px; }
        .emoji-large { font-size: 2em; }
        """
    ) as demo:

        gr.Markdown("""
        # 🌟 Multi-Dataset Explorer 🌟
        ### Explore 5 amazing datasets with multiple access patterns!
        Choose your dataset 📊, pick your method 🛠️, and dive deep into the data 🏊‍♀️
        """)

        with gr.Row():
            dataset_dropdown = gr.Dropdown(
                choices=list(DATASETS.keys()),
                value=list(DATASETS.keys())[0],
                label="📊 Select Dataset",
                interactive=True
            )

            access_dropdown = gr.Dropdown(
                choices=list(ACCESS_PATTERNS.keys()),
                value=list(ACCESS_PATTERNS.keys())[0],
                label="🛠️ Access Method",
                interactive=True
            )

        with gr.Row():
            query_input = gr.Textbox(
                placeholder="🔍 Enter search query (for search mode)",
                label="Search Query",
                interactive=True
            )

            limit_slider = gr.Slider(
                minimum=10,
                maximum=500,
                value=100,
                label="📏 Result Limit",
                interactive=True
            )

        query_button = gr.Button("🚀 Query Dataset", variant="primary", size="lg")

        with gr.Tabs():

            with gr.Tab("📊 Data View"):
                dataframe_output = gr.Textbox(
                    label="📊 DataFrame View",
                    lines=20,
                    max_lines=30
                )

            with gr.Tab("📝 Markdown"):
                markdown_output = gr.Textbox(
                    label="📝 Markdown Format",
                    lines=20,
                    max_lines=30
                )

            with gr.Tab("📋 Copy-Paste"):
                tab_output = gr.Textbox(
                    label="📋 Tab-Delimited (Copy-Ready)",
                    lines=20,
                    max_lines=30
                )

            with gr.Tab("💻 Access Code"):
                code_output = gr.Code(
                    label="💻 Python Access Code",
                    language="python",
                    lines=15
                )

            with gr.Tab("🖼️ Images"):
                image_output = gr.Textbox(
                    label="🖼️ Image Viewer",
                    lines=10
                )

        with gr.Row():
            gr.Markdown("### 💾 Export Options")
            with gr.Column():
                export_format = gr.Dropdown(
                    choices=["CSV", "XLSX", "JSON", "TXT"],
                    value="CSV",
                    label="Export Format"
                )
                export_button = gr.Button("💾 Export Data", variant="secondary")
                export_output = gr.File(label="📁 Download", visible=False)

        # 🔧 Status and help section
        with gr.Row():
            status_display = gr.Textbox(
                label="📊 Status",
                value="Ready to explore datasets! 🚀",
                interactive=False
            )

        # 📖 Dataset info display
        def update_dataset_info(dataset_key):
            info = DATASETS[dataset_key]
            return f"""
## {info['emoji']} {dataset_key}
**Description:** {info['description']}
**Dataset:** `{info['name']}`
**Has Images:** {'Yes 🖼️' if info['has_images'] else 'No 📝'}
**Sample Fields:** {', '.join(info['sample_fields'])}

### 🔧 Recommended Access Methods:
- **🌐 API**: Fast, always works, limited to 100 rows
- **🐼 Pandas**: Full dataset access, may require authentication
- **📚 Datasets**: Streaming support, good for large datasets
- **🔍 Search**: Find specific content within the dataset
"""

        dataset_info = gr.Markdown()

        # 🔗 Event handlers
        dataset_dropdown.change(
            update_dataset_info,
            inputs=[dataset_dropdown],
            outputs=[dataset_info]
        )

        # Update the image viewer when the dataset changes; image_output is a
        # single component, so pass through only the text part of
        # create_image_viewer()'s (text, extra) return value
        def update_image_viewer(dataset_key, current_data):
            return create_image_viewer(dataset_key, current_data)[0]

        dataset_dropdown.change(
            update_image_viewer,
            inputs=[dataset_dropdown, dataframe_output],
            outputs=[image_output]
        )

        # Async wrapper for the query function
        def query_wrapper(dataset_key, access_pattern, query, limit):
            try:
                return asyncio.run(query_dataset(dataset_key, access_pattern, query, limit))
            except Exception as e:
                error_msg = f"Query failed: {str(e)}"
                return error_msg, error_msg, error_msg, f"# Error: {str(e)}"

        # Update status on query start
        def update_status_start(dataset_key, access_pattern):
            dataset_emoji = DATASETS[dataset_key]["emoji"]
            return f"{dataset_emoji} Querying with {access_pattern}... Please wait ⏳"

        # Two handlers share the button: the first flips the status to
        # "querying", the second runs the query and fills the result tabs
        query_button.click(
            update_status_start,
            inputs=[dataset_dropdown, access_dropdown],
            outputs=[status_display]
        )

        def query_and_update_status(dataset_key, access_pattern, query, limit):
            results = query_wrapper(dataset_key, access_pattern, query, limit)

            # Update status based on results
            if results[0].startswith("❌"):
                status = "❌ Query failed - see data tabs for details"
            else:
                dataset_emoji = DATASETS[dataset_key]["emoji"]
                status = f"✅ {dataset_emoji} Data loaded successfully!"

            return results + (status,)

        query_button.click(
            query_and_update_status,
            inputs=[dataset_dropdown, access_dropdown, query_input, limit_slider],
            outputs=[dataframe_output, markdown_output, tab_output, code_output, status_display]
        )

        # Export functionality
        def handle_export(format_type, dataframe_content):
            content, filename = get_export_data(dataframe_content, format_type)

            # Write the content to a temporary file for download
            import tempfile

            temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=f'.{format_type.lower()}')
            temp_file.write(content)
            temp_file.close()

            # Reveal the (initially hidden) file component so it can be downloaded
            return gr.update(value=temp_file.name, visible=True)

        export_button.click(
            handle_export,
            inputs=[export_format, dataframe_output],
            outputs=[export_output]
        )

        # Initialize with first dataset info
        demo.load(
            update_dataset_info,
            inputs=[dataset_dropdown],
            outputs=[dataset_info]
        )

        gr.Markdown("""
        ---
        ### 🎯 Quick Tips:
        - **⚖️ Caselaw**: Legal document analysis
        - **💬 ChatGPT**: Prompt engineering examples
        - **💰 Finance**: Financial reasoning chains
        - **🏥 Medical**: Medical AI training data
        - **🖼️ InScene**: Computer vision datasets

        ### 🛠️ Access Patterns:
        - **🌐 API**: Direct HTTP calls
        - **🐼 Pandas**: DataFrame magic
        - **📚 Datasets**: HF standard
        - **🔍 Search**: Smart filtering

        Made with ❤️ and lots of ☕ for the global data community 🌍
        """)

    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )