Spaces:

awacke1
/

Gradio-Med-Law-Fin-Scene-Claude

Sleeping

App Files Files Community

awacke1 committed on Jul 19

Commit

7f41697

verified ·

1 Parent(s): e09f513

Create app.py

Browse files

Files changed (1) hide show

app.py +703 -0

app.py ADDED Viewed

	@@ -0,0 +1,703 @@

+#!/usr/bin/env python3
+"""
+🌟 Multi-Dataset Explorer 🌟
+A comprehensive Gradio app for exploring datasets with multiple access patterns
+Built with emojis, wit, and international accessibility in mind!
+"""
+import gradio as gr
+import pandas as pd
+import requests
+import json
+import io
+import base64
+from typing import Dict, List, Tuple, Optional, Any
+import asyncio
+import aiohttp
+from datasets import load_dataset
+from huggingface_hub import HfApi
+from PIL import Image
+import numpy as np
+# 🎨 Dataset configurations with emojis for easy identification
+# Keys are the labels offered in the dataset dropdown. Each value describes one dataset:
+#   name          - Hugging Face Hub repo id handed to the loader methods
+#   description   - short blurb rendered in the dataset info panel
+#   emoji         - icon reused in status messages
+#   has_images    - rendered as Yes/No in the dataset info panel
+#   sample_fields - field names listed in the info panel (display only; never checked
+#                   against the loaded data in this file)
+DATASETS = {
+    "⚖️ Caselaw": {
+        "name": "common-pile/caselaw_access_project",
+        "description": "Legal cases from Caselaw Access Project",
+        "emoji": "⚖️",
+        "has_images": False,
+        "sample_fields": ["id", "source", "added", "created", "metadata", "text"]
+    },
+    "💬 ChatGPT": {
+        "name": "fka/awesome-chatgpt-prompts",
+        "description": "Awesome ChatGPT prompts collection",
+        "emoji": "💬",
+        "has_images": False,
+        "sample_fields": ["act", "prompt"]
+    },
+    "💰 Finance": {
+        "name": "snorkelai/agent-finance-reasoning",
+        "description": "Agent finance reasoning dataset",
+        "emoji": "💰",
+        "has_images": False,
+        "sample_fields": ["id", "question", "answer", "reasoning"]
+    },
+    "🏥 Medical": {
+        "name": "FreedomIntelligence/medical-o1-reasoning-SFT",
+        "description": "Medical reasoning for SFT training",
+        "emoji": "🏥",
+        "has_images": False,
+        "sample_fields": ["instruction", "output", "reasoning"]
+    },
+    "🖼️ InScene": {
+        "name": "peteromallet/InScene-Dataset",
+        "description": "Image scene understanding dataset",
+        "emoji": "🖼️",
+        "has_images": True,
+        "sample_fields": ["image", "text", "scene_type"]
+    }
+}
+# 🛠️ Access pattern configurations
+# Keys populate the access-method dropdown and are matched literally in query_dataset;
+# the description values are not read anywhere in the visible code.
+ACCESS_PATTERNS = {
+    "🌐 API": "Direct API calls with curl",
+    "🐼 Pandas": "Load with pandas library",
+    "🥐 Croissant": "MLCroissant metadata format",
+    "📚 Datasets": "HuggingFace datasets library",
+    "🔍 Search": "Smart search functionality"
+}
+class DatasetExplorer:
+    """🎯 Main class for dataset exploration with multiple access patterns
+
+    Every public method returns a dict with a boolean ``success`` key. On
+    success the dict also carries ``data`` and a row count; on failure it
+    carries a human-readable ``error`` string instead of raising.
+    """
+    def __init__(self):
+        self.api = HfApi()
+        self.cache = {}
+    async def fetch_api_data(self, dataset_name: str, limit: int = 100) -> Dict:
+        """🌐 Fetch data using HuggingFace API with async magic
+
+        Requests at most 100 rows of the ``default`` config / ``train`` split
+        from the datasets-server ``/rows`` endpoint.
+
+        Returns:
+            ``{"success": True, "data": <decoded JSON>, "total_rows": int}``
+            where ``total_rows`` is the payload's ``num_rows_total`` when the
+            server supplies it, otherwise the number of rows returned; or
+            ``{"success": False, "error": str}``.
+        """
+        try:
+            url = "https://datasets-server.huggingface.co/rows"
+            params = {
+                "dataset": dataset_name,
+                "config": "default",
+                "split": "train",
+                "offset": 0,
+                # int() keeps a float limit from being serialised as "100.0"
+                "length": int(min(limit, 100))
+            }
+            timeout = aiohttp.ClientTimeout(total=30)  # 30 second timeout
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                async with session.get(url, params=params) as response:
+                    if response.status == 200:
+                        data = await response.json()
+                        rows = data.get("rows", [])
+                        # Prefer the dataset-wide count over the page size
+                        total_rows = data.get("num_rows_total", len(rows))
+                        return {"success": True, "data": data, "total_rows": total_rows}
+                    elif response.status == 404:
+                        return {"success": False, "error": "Dataset not found or not accessible"}
+                    elif response.status == 403:
+                        return {"success": False, "error": "Access denied - dataset may require authentication"}
+                    else:
+                        return {"success": False, "error": f"API returned {response.status}"}
+        except asyncio.TimeoutError:
+            return {"success": False, "error": "Request timed out - dataset may be too large"}
+        except Exception as e:
+            return {"success": False, "error": f"Network error: {str(e)}"}
+    def load_with_pandas(self, dataset_name: str, limit: int = 100) -> Dict:
+        """🐼 Load data using pandas - because who doesn't love pandas?
+
+        Reads a dataset-specific file through the ``hf://`` filesystem and
+        returns the first ``limit`` rows as ``data`` together with the full
+        frame length as ``total_rows``. Unknown datasets fall back to
+        ``train.parquet`` and then ``train.json``.
+        """
+        try:
+            df = None
+            # Dataset-specific loading logic
+            if dataset_name == "fka/awesome-chatgpt-prompts":
+                df = pd.read_csv(f"hf://datasets/{dataset_name}/prompts.csv")
+            elif dataset_name == "snorkelai/agent-finance-reasoning":
+                df = pd.read_parquet(f"hf://datasets/{dataset_name}/train.parquet")
+            elif dataset_name == "peteromallet/InScene-Dataset":
+                splits = {'train': 'data/train-00000-of-00001.parquet'}
+                df = pd.read_parquet(f"hf://datasets/{dataset_name}/" + splits["train"])
+            elif dataset_name == "FreedomIntelligence/medical-o1-reasoning-SFT":
+                # Try JSON Lines first, then a single JSON document
+                try:
+                    df = pd.read_json(f"hf://datasets/{dataset_name}/medical_o1_sft.json", lines=True)
+                except Exception:
+                    df = pd.read_json(f"hf://datasets/{dataset_name}/medical_o1_sft.json")
+            elif dataset_name == "common-pile/caselaw_access_project":
+                try:
+                    df = pd.read_json(f"hf://datasets/{dataset_name}/data/train-00000-of-00001.jsonl.gz",
+                                    lines=True, compression='gzip')
+                except Exception:
+                    # Point the user at the API instead of surfacing the raw failure
+                    return {"success": False, "error": "Large dataset - please use API access method"}
+            else:
+                # Generic fallback
+                try:
+                    df = pd.read_parquet(f"hf://datasets/{dataset_name}/train.parquet")
+                except Exception:
+                    df = pd.read_json(f"hf://datasets/{dataset_name}/train.json", lines=True)
+            if df is None:
+                return {"success": False, "error": "Could not determine appropriate loading method"}
+            total_rows = len(df)
+            df_limited = df.head(int(limit))
+            return {
+                "success": True,
+                "data": df_limited,
+                "total_rows": total_rows
+            }
+        except FileNotFoundError:
+            return {"success": False, "error": "Dataset files not found - try API access method"}
+        except pd.errors.EmptyDataError:
+            return {"success": False, "error": "Dataset appears to be empty"}
+        except pd.errors.ParserError as e:
+            return {"success": False, "error": f"Data parsing error: {str(e)}"}
+        except PermissionError:
+            return {"success": False, "error": "Dataset requires authentication - please login first"}
+        except Exception as e:
+            return {"success": False, "error": f"Pandas loading failed: {str(e)}"}
+    def load_with_datasets(self, dataset_name: str, limit: int = 100) -> Dict:
+        """📚 Load using HuggingFace datasets library - the OG way
+
+        Streams the ``train`` split and materialises the first ``limit``
+        records into a DataFrame; ``total_rows`` is the number of records
+        actually taken.
+        """
+        try:
+            ds = load_dataset(dataset_name, split="train", streaming=True)
+            data = list(ds.take(int(limit)))
+            df = pd.DataFrame(data)
+            return {
+                "success": True,
+                "data": df,
+                "total_rows": len(data)
+            }
+        except Exception as e:
+            return {"success": False, "error": f"Datasets loading failed: {str(e)}"}
+    def search_dataset(self, dataset_name: str, query: str, limit: int = 100) -> Dict:
+        """🔍 Smart search functionality - finding needles in data haystacks
+
+        Loads up to 1000 rows (pandas first, then the datasets library) and
+        keeps every row in which any object-dtype column contains ``query`` as
+        a case-insensitive literal substring.
+
+        Returns:
+            ``{"success": True, "data": DataFrame, "total_matches": int}``
+            where ``data`` holds at most ``limit`` rows in their original order
+            and ``total_matches`` counts every matching row among those
+            loaded; or ``{"success": False, "error": str}``.
+        """
+        try:
+            # First try to load some data
+            result = self.load_with_pandas(dataset_name, limit=1000)
+            if not result["success"]:
+                result = self.load_with_datasets(dataset_name, limit=1000)
+            if not result["success"]:
+                return {"success": False, "error": "Could not load data for search"}
+            df = result["data"]
+            text_columns = df.select_dtypes(include=['object']).columns
+            # OR the per-column hits into a single mask: each row appears once
+            # without drop_duplicates(), which raises on unhashable cells
+            # (dicts, lists) and would also merge distinct-but-equal rows.
+            hits = pd.Series(False, index=df.index)
+            for col in text_columns:
+                # regex=False: user input such as "(" must not be parsed as a pattern
+                hits |= df[col].astype(str).str.contains(query, case=False, na=False, regex=False)
+            search_results = df[hits.to_numpy()].head(int(limit))
+            return {
+                "success": True,
+                "data": search_results,
+                "total_matches": int(hits.sum())
+            }
+        except Exception as e:
+            return {"success": False, "error": f"Search failed: {str(e)}"}
+# 🎨 Initialize our explorer
+# Single module-level instance; query_dataset routes every request through it.
+explorer = DatasetExplorer()
+def format_results(result: Dict, format_type: str) -> str:
+    """🎨 Format results in different ways - because variety is the spice of life
+
+    Args:
+        result: loader result dict; must contain ``success`` and either
+            ``data`` (a DataFrame) or ``error``.
+        format_type: one of "📊 DataFrame", "📝 Markdown", "📋 Tab-Delimited";
+            anything else falls back to ``str(df)``.
+
+    Returns:
+        The rendered text, or "❌ Error: ..." when ``result`` reports failure.
+    """
+    if not result["success"]:
+        return f"❌ Error: {result['error']}"
+    df = result["data"]
+    if format_type == "📊 DataFrame":
+        return df.to_string(max_rows=50, max_cols=10)
+    if format_type == "📝 Markdown":
+        # to_markdown forwards unknown keywords to tabulate, which has no
+        # max_cols option, so cap the width by slicing columns instead.
+        return df.iloc[:, :10].to_markdown(index=False)
+    if format_type == "📋 Tab-Delimited":
+        return df.to_csv(sep='\t', index=False)
+    return str(df)
+def export_data(df: pd.DataFrame, format_type: str) -> str:
+    """💾 Serialise a DataFrame for download
+
+    "CSV" and "JSON" return text, "XLSX" returns the workbook bytes encoded as
+    a base64 string, and any other ``format_type`` returns ``df.to_string()``.
+    """
+    if format_type == "XLSX":
+        # Excel output is binary, so hand it back as base64 text
+        workbook = io.BytesIO()
+        df.to_excel(workbook, index=False)
+        workbook.seek(0)
+        encoded = base64.b64encode(workbook.getvalue())
+        return encoded.decode()
+    if format_type == "CSV":
+        return df.to_csv(index=False)
+    if format_type == "JSON":
+        return df.to_json(orient='records', indent=2)
+    return df.to_string()
+async def query_dataset(dataset_key: str, access_pattern: str, query: str = "", limit: int = 100) -> Tuple[str, str, str, str]:
+    """🎯 Main query function - the heart of our operation
+
+    Args:
+        dataset_key: a key of ``DATASETS``.
+        access_pattern: one of the ``ACCESS_PATTERNS`` labels.
+        query: search text; required only for "🔍 Search".
+        limit: maximum number of rows to load; coerced to ``int``.
+
+    Returns:
+        ``(dataframe_view, markdown_view, tab_delimited, access_code)``. On
+        any failure the first element starts with "❌" and the other three
+        are empty strings; this coroutine does not raise.
+    """
+    try:
+        dataset_name = DATASETS[dataset_key]["name"]
+        # limit may arrive as a float (e.g. from a slider widget); the loaders slice with it
+        limit = int(limit)
+        result = None
+        if access_pattern == "🌐 API":
+            result = await explorer.fetch_api_data(dataset_name, limit)
+            if result["success"] and "data" in result:
+                payload = result["data"]
+                if "rows" in payload:
+                    # The rows endpoint wraps each record as
+                    # {"row_idx": ..., "row": {...}, "truncated_cells": [...]};
+                    # unwrap so the frame's columns are the dataset's fields.
+                    records = [
+                        entry["row"] if isinstance(entry, dict) and "row" in entry else entry
+                        for entry in payload["rows"]
+                    ]
+                    df = pd.DataFrame(records)
+                else:
+                    df = pd.DataFrame(payload)
+                result["data"] = df
+        elif access_pattern == "🐼 Pandas":
+            result = explorer.load_with_pandas(dataset_name, limit)
+        elif access_pattern == "📚 Datasets":
+            result = explorer.load_with_datasets(dataset_name, limit)
+        elif access_pattern == "🔍 Search":
+            if not query.strip():
+                return "❌ Please enter a search query for search mode", "", "", ""
+            result = explorer.search_dataset(dataset_name, query, limit)
+        elif access_pattern == "🥐 Croissant":
+            # Add Croissant loading logic
+            result = {"success": False, "error": "Croissant loading not yet implemented - coming soon! 🚧"}
+        else:
+            result = {"success": False, "error": "Unknown access pattern"}
+        if not result or not result["success"]:
+            error_msg = result.get("error", "Unknown error") if result else "No result returned"
+            return f"❌ {error_msg}", "", "", ""
+        df = result["data"]
+        # Ensure we have a valid DataFrame
+        if df is None or df.empty:
+            return "❌ No data returned from dataset", "", "", ""
+        # Add metadata info
+        metadata_info = f"📊 Loaded {len(df)} rows"
+        if "total_rows" in result:
+            metadata_info += f" (of {result['total_rows']} total)"
+        metadata_info += f" using {access_pattern}\n\n"
+        # Format in different ways
+        dataframe_view = metadata_info + format_results(result, "📊 DataFrame")
+        markdown_view = metadata_info + format_results(result, "📝 Markdown")
+        tab_delimited = format_results(result, "📋 Tab-Delimited")
+        # Generate access code
+        access_code = generate_access_code(dataset_name, access_pattern, query)
+        return dataframe_view, markdown_view, tab_delimited, access_code
+    except Exception as e:
+        error_details = f"Unexpected error in {access_pattern}: {str(e)}"
+        return f"❌ {error_details}", "", "", ""
+def generate_access_code(dataset_name: str, access_pattern: str, query: str = "") -> str:
+    """💻 Generate Python code for the selected access pattern
+
+    Args:
+        dataset_name: Hub repo id substituted into the snippet.
+        access_pattern: "🌐 API", "🐼 Pandas", "📚 Datasets" or "🔍 Search";
+            any other value yields a one-line placeholder comment.
+        query: search text, used only by the "🔍 Search" snippet. It is
+            embedded via ``repr`` so quotes, backslashes and newlines in user
+            input cannot break the generated source.
+
+    Returns:
+        Python source text for display.
+    """
+    if access_pattern == "🌐 API":
+        return f'''# 🌐 API Access Code
+import requests
+url = "https://datasets-server.huggingface.co/rows"
+params = {{
+    "dataset": "{dataset_name}",
+    "config": "default",
+    "split": "train",
+    "offset": 0,
+    "length": 100
+}}
+response = requests.get(url, params=params)
+data = response.json()
+print(f"Loaded {{len(data['rows'])}} rows")
+'''
+    elif access_pattern == "🐼 Pandas":
+        if dataset_name == "fka/awesome-chatgpt-prompts":
+            return f'''# 🐼 Pandas Access Code
+import pandas as pd
+df = pd.read_csv("hf://datasets/{dataset_name}/prompts.csv")
+print(f"Loaded {{len(df)}} rows")
+print(df.head())
+'''
+        else:
+            return f'''# 🐼 Pandas Access Code
+import pandas as pd
+df = pd.read_parquet("hf://datasets/{dataset_name}/train.parquet")
+print(f"Loaded {{len(df)}} rows")
+print(df.head())
+'''
+    elif access_pattern == "📚 Datasets":
+        return f'''# 📚 Datasets Library Access Code
+from datasets import load_dataset
+ds = load_dataset("{dataset_name}", split="train")
+print(f"Loaded {{len(ds)}} rows")
+print(ds[0])
+'''
+    elif access_pattern == "🔍 Search":
+        return f'''# 🔍 Search Code
+import pandas as pd
+# Load the dataset
+df = pd.read_parquet("hf://datasets/{dataset_name}/train.parquet")
+# Search for: {query!r}
+text_columns = df.select_dtypes(include=['object']).columns
+search_results = pd.DataFrame()
+for col in text_columns:
+    mask = df[col].astype(str).str.contains({query!r}, case=False, na=False)
+    matches = df[mask]
+    if not matches.empty:
+        search_results = pd.concat([search_results, matches])
+search_results = search_results.drop_duplicates()
+print(f"Found {{len(search_results)}} matching rows")
+'''
+    else:
+        return "# Code generation not available for this pattern"
+def create_image_viewer(dataset_key: str, current_data: str = "") -> Tuple[str, str]:
+    """🖼️ Produce the descriptive text shown in the image tab
+
+    Returns a ``(text, "")`` pair. Only the "🖼️ InScene" key gets viewer
+    text: an introduction when ``current_data`` is empty or contains the
+    error marker, otherwise a static metadata summary. Nothing is rendered as
+    an actual image here.
+    """
+    if dataset_key != "🖼️ InScene":
+        return "This dataset does not contain images", ""
+    intro_text = """
+🖼️ **Image Viewer for InScene Dataset**
+To view images, first query the dataset using any access method.
+The image viewer will then display available images with their metadata.
+**Features coming in this viewer:**
+- 🖼️ Image thumbnails and full-size viewing
+- 📝 Image metadata and annotations
+- 🔍 Search images by scene type
+- 📊 Navigation between images
+- 💾 Download individual images
+"""
+    summary_text = """
+🖼️ **InScene Dataset Images**
+**Sample Image Metadata:**
+- Scene types: Indoor, Outdoor, Urban, Natural
+- Annotations: Object detection, scene classification
+- Format: Various (JPG, PNG)
+- Resolution: Mixed resolutions
+**Navigation:**
+- Use the query controls above to load specific images
+- Search for scene types like "indoor", "outdoor", "kitchen", etc.
+- Images will be displayed with their metadata
+🚧 **Full image viewer implementation coming soon!**
+For now, use the data tabs above to explore image metadata.
+"""
+    try:
+        # No usable query output yet: either nothing loaded or an error message
+        awaiting_query = (not current_data) or ("❌" in current_data)
+    except Exception as exc:
+        return f"Error in image viewer: {str(exc)}", ""
+    chosen = intro_text if awaiting_query else summary_text
+    return chosen, ""
+def get_export_data(dataframe_content: str, format_type: str) -> Tuple[str, str]:
+    """💾 Build placeholder export text plus a download filename
+
+    Returns ``(content, filename)``. Empty input, or input containing the
+    error marker, yields ``("No data to export", "")``. Any exception yields
+    an "Export error: ..." message paired with ``"error.txt"``.
+    """
+    try:
+        nothing_to_export = not dataframe_content or "❌" in dataframe_content
+        if nothing_to_export:
+            return "No data to export", ""
+        # The underlying DataFrame is not available here, so each format is
+        # a stand-in built from the rendered text.
+        if format_type == "CSV":
+            banner = (
+                "# Export functionality will be implemented with actual DataFrame data\n"
+                "# This is a placeholder showing the export structure\n"
+            )
+            return banner + dataframe_content, "dataset_export.csv"
+        if format_type == "XLSX":
+            return "Excel export will be available in full implementation", "dataset_export.xlsx"
+        if format_type == "JSON":
+            return '{"note": "JSON export will contain actual DataFrame data"}', "dataset_export.json"
+        return dataframe_content, "dataset_export.txt"
+    except Exception as exc:
+        return f"Export error: {str(exc)}", "error.txt"
+# 🎨 Create the Gradio interface
+def create_interface():
+    """🎨 Create the main Gradio interface - where the magic happens
+
+    Builds the Blocks layout (dataset/access selectors, query controls, result
+    tabs, export row, status box, dataset info) and wires the event handlers.
+
+    Returns:
+        The assembled ``gr.Blocks`` app, not yet launched.
+    """
+    with gr.Blocks(
+        title="🌟 Multi-Dataset Explorer",
+        theme=gr.themes.Soft(),
+        css="""
+        .dataset-card { border: 2px solid #e1e5e9; border-radius: 10px; padding: 15px; margin: 10px; }
+        .emoji-large { font-size: 2em; }
+        """
+    ) as demo:
+        gr.Markdown("""
+        # 🌟 Multi-Dataset Explorer 🌟
+        ### Explore 5 amazing datasets with multiple access patterns!
+        Choose your dataset 📊, pick your method 🛠️, and dive deep into the data 🏊‍♀️
+        """)
+        with gr.Row():
+            dataset_dropdown = gr.Dropdown(
+                choices=list(DATASETS.keys()),
+                value=list(DATASETS.keys())[0],
+                label="📊 Select Dataset",
+                interactive=True
+            )
+            access_dropdown = gr.Dropdown(
+                choices=list(ACCESS_PATTERNS.keys()),
+                value=list(ACCESS_PATTERNS.keys())[0],
+                label="🛠️ Access Method",
+                interactive=True
+            )
+        with gr.Row():
+            query_input = gr.Textbox(
+                placeholder="🔍 Enter search query (for search mode)",
+                label="Search Query",
+                interactive=True
+            )
+            limit_slider = gr.Slider(
+                minimum=10,
+                maximum=500,
+                value=100,
+                label="📏 Result Limit",
+                interactive=True
+            )
+        query_button = gr.Button("🚀 Query Dataset", variant="primary", size="lg")
+        with gr.Tabs():
+            with gr.Tab("📊 Data View"):
+                dataframe_output = gr.Textbox(
+                    label="📊 DataFrame View",
+                    lines=20,
+                    max_lines=30
+                )
+            with gr.Tab("📝 Markdown"):
+                markdown_output = gr.Textbox(
+                    label="📝 Markdown Format",
+                    lines=20,
+                    max_lines=30
+                )
+            with gr.Tab("📋 Copy-Paste"):
+                tab_output = gr.Textbox(
+                    label="📋 Tab-Delimited (Copy-Ready)",
+                    lines=20,
+                    max_lines=30
+                )
+            with gr.Tab("💻 Access Code"):
+                code_output = gr.Code(
+                    label="💻 Python Access Code",
+                    language="python",
+                    lines=15
+                )
+            with gr.Tab("🖼️ Images"):
+                image_output = gr.Textbox(
+                    label="🖼️ Image Viewer",
+                    lines=10
+                )
+        with gr.Row():
+            gr.Markdown("### 💾 Export Options")
+            with gr.Column():
+                export_format = gr.Dropdown(
+                    choices=["CSV", "XLSX", "JSON", "TXT"],
+                    value="CSV",
+                    label="Export Format"
+                )
+                export_button = gr.Button("💾 Export Data", variant="secondary")
+                # Starts hidden; handle_export reveals it once a file exists
+                export_output = gr.File(label="📁 Download", visible=False)
+        # 🔧 Status and help section
+        with gr.Row():
+            status_display = gr.Textbox(
+                label="📊 Status",
+                value="Ready to explore datasets! 🚀",
+                interactive=False
+            )
+        # 📖 Dataset info display
+        def update_dataset_info(dataset_key):
+            info = DATASETS[dataset_key]
+            return f"""
+## {info['emoji']} {dataset_key}
+**Description:** {info['description']}
+**Dataset:** `{info['name']}`
+**Has Images:** {'Yes 🖼️' if info['has_images'] else 'No 📝'}
+**Sample Fields:** {', '.join(info['sample_fields'])}
+### 🔧 Recommended Access Methods:
+- **🌐 API**: Fast, always works, limited to 100 rows
+- **🐼 Pandas**: Full dataset access, may require authentication
+- **📚 Datasets**: Streaming support, good for large datasets
+- **🔍 Search**: Find specific content within the dataset
+"""
+        dataset_info = gr.Markdown()
+        # 🔗 Event handlers
+        dataset_dropdown.change(
+            update_dataset_info,
+            inputs=[dataset_dropdown],
+            outputs=[dataset_info]
+        )
+        # Update image viewer when dataset changes
+        def update_image_viewer(dataset_key, current_data):
+            # create_image_viewer returns (text, extra); only one component
+            # is bound as output, so pass along the text alone.
+            viewer_text, _ = create_image_viewer(dataset_key, current_data)
+            return viewer_text
+        dataset_dropdown.change(
+            update_image_viewer,
+            inputs=[dataset_dropdown, dataframe_output],
+            outputs=[image_output]
+        )
+        # Async wrapper for the query function
+        def query_wrapper(dataset_key, access_pattern, query, limit):
+            try:
+                return asyncio.run(query_dataset(dataset_key, access_pattern, query, limit))
+            except Exception as e:
+                # Keep the ❌ prefix: the status logic below uses it to detect failure
+                error_msg = f"❌ Query failed: {str(e)}"
+                return error_msg, error_msg, error_msg, f"# Error: {str(e)}"
+        # Update status on query start
+        def update_status_start(dataset_key, access_pattern):
+            dataset_emoji = DATASETS[dataset_key]["emoji"]
+            return f"{dataset_emoji} Querying with {access_pattern}... Please wait ⏳"
+        def query_and_update_status(dataset_key, access_pattern, query, limit):
+            results = query_wrapper(dataset_key, access_pattern, query, limit)
+            # Update status based on results
+            if results[0].startswith("❌"):
+                status = "❌ Query failed - see data tabs for details"
+            else:
+                dataset_emoji = DATASETS[dataset_key]["emoji"]
+                status = f"✅ {dataset_emoji} Data loaded successfully!"
+            return results + (status,)
+        # Chain the two steps so the "please wait" status is always written
+        # before the final status instead of racing with it.
+        query_button.click(
+            update_status_start,
+            inputs=[dataset_dropdown, access_dropdown],
+            outputs=[status_display]
+        ).then(
+            query_and_update_status,
+            inputs=[dataset_dropdown, access_dropdown, query_input, limit_slider],
+            outputs=[dataframe_output, markdown_output, tab_output, code_output, status_display]
+        )
+        # Export functionality
+        def handle_export(format_type, dataframe_content):
+            import tempfile
+            content, filename = get_export_data(dataframe_content, format_type)
+            if not filename:
+                # Nothing to export: keep the download widget hidden
+                return gr.update(value=None, visible=False)
+            # delete=False because the file must outlive this handler for the
+            # download; utf-8 so emoji in the content never hit a locale codec.
+            with tempfile.NamedTemporaryFile(
+                mode='w', delete=False, suffix=f'.{format_type.lower()}', encoding='utf-8'
+            ) as temp_file:
+                temp_file.write(content)
+            return gr.update(value=temp_file.name, visible=True)
+        export_button.click(
+            handle_export,
+            inputs=[export_format, dataframe_output],
+            outputs=[export_output]
+        )
+        # Initialize with first dataset info
+        demo.load(
+            update_dataset_info,
+            inputs=[dataset_dropdown],
+            outputs=[dataset_info]
+        )
+        gr.Markdown("""
+        ---
+        ### 🎯 Quick Tips:
+        - **⚖️ Caselaw**: Legal document analysis
+        - **💬 ChatGPT**: Prompt engineering examples
+        - **💰 Finance**: Financial reasoning chains
+        - **🏥 Medical**: Medical AI training data
+        - **🖼️ InScene**: Computer vision datasets
+        ### 🛠️ Access Patterns:
+        - **🌐 API**: Direct HTTP calls
+        - **🐼 Pandas**: DataFrame magic
+        - **📚 Datasets**: HF standard
+        - **🔍 Search**: Smart filtering
+        Made with ❤️ and lots of ☕ for the global data community 🌍
+        """)
+    return demo
+if __name__ == "__main__":
+    # Script entry point: build the UI, then serve it on all interfaces at port 7860.
+    demo = create_interface()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        # NOTE(review): share=True asks Gradio for a public share link; presumably
+        # unnecessary when the app is already hosted on a Space - confirm.
+        share=True,
+        show_error=True
+    )