# Hugging Face Space: awacke1/Gradio-Med-Law-Fin-Scene-Claude (status: Running)
# File size: 22,648 Bytes

# app.py
import gradio as gr
import pandas as pd
import requests
import io
import warnings
import traceback
import json
import tempfile
import os
import logging

# 🤫 Suppress warnings and set logging levels.
# NOTE: filterwarnings("ignore") silences every Python warning raised after this
# point in the process, not only those coming from the optional libraries below.
warnings.filterwarnings("ignore")
logging.getLogger("absl").setLevel(logging.ERROR)  # Suppress MLCroissant warnings
os.environ["ABSL_LOG_LEVEL"] = "2"  # Only show errors

# Import optional dependencies with fallbacks.
# Each guarded import sets a module-level *_AVAILABLE flag so the rest of the
# file can check for the library instead of failing at import time.
# In the code visible here only DATASETS_AVAILABLE and HF_HUB_AVAILABLE are read;
# the dask, mlcroissant and polars flags are set but not consulted.
try:
    import dask.dataframe as dd
    DASK_AVAILABLE = True
except ImportError:
    DASK_AVAILABLE = False

try:
    # load_dataset is used by fetch_data for the "Datasets" access methods.
    from datasets import load_dataset, Image
    DATASETS_AVAILABLE = True
except ImportError:
    DATASETS_AVAILABLE = False

try:
    from mlcroissant import Dataset as CroissantDataset
    CROISSANT_AVAILABLE = True
except ImportError:
    CROISSANT_AVAILABLE = False

try:
    # get_token is used by get_auth_headers to build the bearer-token header.
    from huggingface_hub import get_token
    HF_HUB_AVAILABLE = True
except ImportError:
    HF_HUB_AVAILABLE = False

try:
    import polars as pl
    POLARS_AVAILABLE = True
except ImportError:
    POLARS_AVAILABLE = False

# --- ⚙️ Configuration & Constants ---
# Registry of the datasets exposed in the UI, keyed by a short dataset key.
# One tab is created per entry, in insertion order. Fields of each entry:
#   "name"      - Hugging Face repo id; used for the rows-API "dataset" query
#                 parameter, for hf:// file paths and for load_dataset().
#   "emoji"     - icon shown in the tab title and headings.
#   "methods"   - access-method labels offered in the tab's radio selector; the
#                 first one is preselected. fetch_data dispatches on the
#                 substrings "API", "Pandas", "Datasets" and "Images".
#   "is_public" - when False the tab shows a login note and API requests are
#                 sent with the auth headers from get_auth_headers().
DATASET_CONFIG = {
    "caselaw": {
        "name": "common-pile/caselaw_access_project", "emoji": "⚖️",
        "methods": ["💨 API (requests)"], "is_public": True,
    },
    "prompts": {
        "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",
        "methods": ["🐼 Pandas", "💨 API (requests)"], "is_public": True,
    },
    "finance": {
        "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",
        "methods": ["🐼 Pandas", "💨 API (requests)"], "is_public": False,
    },
    "medical": {
        "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
        "methods": ["🐼 Pandas"], "is_public": False,
    },
    # "inscene" is special-cased by key: its tab shows the image gallery and
    # fetch_data builds gallery items from its 'image'/'text' fields.
    "inscene": {
        "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",
        "methods": ["🤗 Datasets", "🖼️ Datasets with Images"], "is_public": False,
    },
}

# --- 🔧 Helpers & Utility Functions ---

def get_auth_headers():
    """
    🔑 Build the HTTP headers carrying the locally stored Hugging Face token.

    Returns:
        ``{"Authorization": "Bearer <token>"}`` when ``huggingface_hub`` is
        installed and ``get_token()`` yields a non-empty token; otherwise an
        empty dict. Any exception raised while looking the token up is
        treated as "no token" (best effort).
    """
    headers = {}
    if HF_HUB_AVAILABLE:
        try:
            hf_token = get_token()
            if hf_token:
                headers = {"Authorization": f"Bearer {hf_token}"}
        except Exception:
            # Best effort: an unreadable token is the same as no token.
            headers = {}
    return headers

# --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
def _write_temp_export(suffix: str, write) -> str:
    """
    Reserve a closed temporary file ending in *suffix* and fill it via *write*.

    The file descriptor is closed before ``write(path)`` runs, so the writer
    can reopen the path on every platform (an open ``NamedTemporaryFile``
    cannot be reopened by name on Windows). If the writer fails, the
    half-written file is removed and the exception is re-raised.

    Returns:
        The path of the written file. The caller owns the file; it is NOT
        deleted automatically.
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    try:
        write(path)
    except Exception:
        if os.path.exists(path):
            os.remove(path)
        raise
    return path


def dataframe_to_outputs(df: pd.DataFrame):
    """
    📜 Takes a DataFrame and transforms it into the export formats shown in the UI.

    Args:
        df: The result set to export.

    Returns:
        A 4-tuple ``(markdown, csv_path, xlsx_path, tab_delimited)``:
        a Markdown table of the stringified frame, the paths of freshly
        written temporary CSV and XLSX files, and the frame as tab-separated
        text. For an empty frame the paths are ``None`` and the two text
        values are placeholder messages.

    Raises:
        Whatever pandas raises while rendering or writing (for example
        ImportError when ``tabulate`` or ``openpyxl`` is missing). Temporary
        files already written by this call are removed in that case.
    """
    if df.empty:
        return "No results found. 🤷", None, None, "No results to copy."

    # Render the in-memory formats first so a failure here creates no files.
    markdown_output = df.astype(str).to_markdown(index=False)
    tab_delimited_output = df.to_csv(sep='\t', index=False)

    # Files are persisted (not auto-deleted) because Gradio serves them by
    # path after this function has returned.
    csv_path = _write_temp_export('.csv', lambda path: df.to_csv(path, index=False))
    try:
        xlsx_path = _write_temp_export(
            '.xlsx',
            lambda path: df.to_excel(path, index=False, engine='openpyxl'),
        )
    except Exception:
        # Don't leave an orphaned CSV behind when the XLSX export fails.
        if os.path.exists(csv_path):
            os.remove(csv_path)
        raise

    return (
        markdown_output,
        csv_path,
        xlsx_path,
        tab_delimited_output,
    )

# Header names whose values are credentials and must never reach the UI debug log.
_SENSITIVE_HEADER_NAMES = frozenset(
    {"authorization", "proxy-authorization", "cookie", "set-cookie"}
)


def _loggable_headers(headers):
    """Return *headers* as a plain dict with credential-bearing values masked."""
    return {
        name: "***REDACTED***" if str(name).lower() in _SENSITIVE_HEADER_NAMES else value
        for name, value in dict(headers).items()
    }


def handle_error(e: Exception, request=None, response=None):
    """
    😱 Oh no! An error! Build the 9-value UI update that reports it.

    Must be called from inside the ``except`` block handling *e*, because the
    traceback is taken from the exception currently being handled.

    Args:
        e: The exception that was caught.
        request: Optional prepared HTTP request (needs ``method``, ``url`` and
            ``headers``) to include in the debug log.
        response: Optional HTTP response (needs ``json()``, ``text``,
            ``status_code`` and ``headers``) to include in the debug log.

    Returns:
        A tuple matching the outputs wired to the fetch button: empty
        DataFrame, cleared gallery, error status Markdown, empty Markdown
        view, no CSV, no XLSX, empty copy text, an error code snippet and a
        visible debug-log component.
    """
    error_text = str(e)
    full_trace = traceback.format_exc()
    print(full_trace)

    status_message = "### 🚨 Error\nAn error occurred. See the debug log below for details."
    auth_related = (
        "401" in error_text
        or "Gated" in error_text
        or (response is not None and response.status_code in (401, 403))
    )
    if auth_related:
        # Surface the login hint in the status area; previously it was built
        # but never returned to the UI.
        status_message += (
            "\n\n🔑 For gated datasets, did you log in? "
            "Try `huggingface-cli login` in your terminal."
        )

    debug_log = f"""--- 🐞 DEBUG LOG ---\nTraceback:\n{full_trace}\n\nException Type: {type(e).__name__}\nException Details: {e}\n"""
    if request is not None:
        # The request may carry the bearer token: mask it before it is
        # displayed to whoever is using the app.
        request_headers = json.dumps(_loggable_headers(request.headers), indent=2)
        debug_log += f"""\n--- REQUEST ---\nMethod: {request.method}\nURL: {request.url}\nHeaders: {request_headers}\n"""
    if response is not None:
        try:
            response_text = json.dumps(response.json(), indent=2)
        except ValueError:
            # Covers json.JSONDecodeError as well as the simplejson / plain
            # ValueError variants that Response.json() may raise.
            response_text = response.text
        response_headers = json.dumps(_loggable_headers(response.headers), indent=2)
        debug_log += f"""\n--- RESPONSE ---\nStatus Code: {response.status_code}\nHeaders: {response_headers}\nContent:\n{response_text}\n"""

    return (
        pd.DataFrame(), gr.Gallery(None), status_message,
        "", None, None, "", f"```python\n# 🚨 Error during execution:\n# {e}\n```",
        gr.Code(value=debug_log, visible=True)
    )

def search_dataframe(df: pd.DataFrame, query: str):
    """
    🔍 Return the rows of *df* in which any text column contains *query*.

    The match is a case-insensitive, literal substring test (regex
    metacharacters in *query* have no special meaning). Only columns of
    object/string dtype are searched; their values are compared as strings.

    Args:
        df: The frame to search. Any index is supported.
        query: Text to look for. When empty, no filtering is done.

    Returns:
        ``df.head(100)`` when *query* is empty; an empty DataFrame when *df*
        has no text columns; otherwise the matching rows of *df*.
    """
    if not query:
        return df.head(100)
    string_cols = df.select_dtypes(include=['object', 'string']).columns
    if string_cols.empty:
        return pd.DataFrame()
    # Build the mask on df's own index: a fresh RangeIndex would be aligned
    # label-by-label with the column results and mismatch on any other index.
    mask = pd.Series(False, index=df.index)
    for col in string_cols:
        # regex=False: the query is user-typed text, so "(" or "*" must not
        # be interpreted (or rejected) as a regular expression.
        mask |= df[col].astype(str).str.contains(query, case=False, na=False, regex=False)
    return df[mask]

def _search_snippet(query: str, indent: str = "") -> str:
    """
    Return the source lines that filter ``df`` by *query*, each prefixed with *indent*.

    The query is embedded with ``repr()`` so quotes, backslashes and newlines
    in the user's text cannot break the generated code.
    """
    lines = [
        f"# Search for: {query!r}",
        f"query = {query!r}",
        "if query:",
        "    string_cols = df.select_dtypes(include=['object', 'string']).columns",
        "    mask = pd.Series(False, index=df.index)",
        "    for col in string_cols:",
        "        mask |= df[col].astype(str).str.contains(query, case=False, na=False)",
        "    df = df[mask]",
    ]
    return "\n".join(indent + line for line in lines)


def generate_code_snippet(dataset_key: str, access_method: str, query: str):
    """
    💻 Generate a standalone Python snippet reproducing the current operation.

    Args:
        dataset_key: Key into ``DATASET_CONFIG`` selecting the dataset.
        access_method: The access-method label chosen in the UI; dispatched
            on the substrings "API", "Pandas", "Datasets" and "Images".
        query: The search text to embed in the snippet (may be empty).

    Returns:
        Python source as a string, or a one-line comment when the access
        method is not recognised.

    Raises:
        KeyError: If *dataset_key* is not in ``DATASET_CONFIG``.
    """
    config = DATASET_CONFIG[dataset_key]
    repo_id = config["name"]

    if "API" in access_method:
        # Emit a concrete, runnable headers assignment; the earlier template
        # referenced an undefined name and raised NameError when executed.
        if config["is_public"]:
            headers_line = 'headers = {}  # Public dataset: no token required'
        else:
            headers_line = (
                'headers = {"Authorization": "Bearer YOUR_HF_TOKEN"}'
                '  # Gated dataset: replace with your token'
            )
        search_block = _search_snippet(query, indent="    ")
        return f'''# 🌐 API Access for {repo_id}
import requests
import pandas as pd

url = "https://datasets-server.huggingface.co/rows"
params = {{
    "dataset": "{repo_id}",
    "config": "default",
    "split": "train",
    "offset": 0,
    "length": 100
}}

{headers_line}
response = requests.get(url, params=params, headers=headers)

if response.status_code == 200:
    data = response.json()
    rows_data = [item['row'] for item in data['rows']]
    df = pd.json_normalize(rows_data)

{search_block}

    print(f"Found {{len(df)}} results")
    print(df.head())
else:
    print(f"Error: {{response.status_code}} - {{response.text}}")
'''

    elif "Pandas" in access_method:
        file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet"
        read_function = "read_csv" if "csv" in file_path else "read_parquet"
        search_block = _search_snippet(query)

        return f'''# 🐼 Pandas Access for {repo_id}
import pandas as pd

# You may need: huggingface-cli login
df = pd.{read_function}("hf://datasets/{repo_id}/{file_path}")

{search_block}

print(f"Found {{len(df)}} results")
print(df.head())
'''

    elif "Datasets" in access_method:
        if "Images" in access_method:
            # The image variant shows a sample and does not filter by query.
            return f'''# 🖼️ Datasets Library with Image Access for {repo_id}
from datasets import load_dataset
import pandas as pd

# You may need: huggingface-cli login
ds = load_dataset("{repo_id}", split="train", streaming=True)
data = list(ds.take(50))  # Smaller sample for images
df = pd.DataFrame(data)

# Process images
images = []
for item in data:
    if 'image' in item and item['image'] is not None:
        images.append((item['image'], item.get('text', '')))

print(f"Found {{len(df)}} records with {{len(images)}} images")
print(df.head())

# Display first image
if images:
    first_image, caption = images[0]
    first_image.show()  # If PIL Image
    print(f"Caption: {{caption}}")
'''
        else:
            search_block = _search_snippet(query)
            return f'''# 🤗 Datasets Library Access for {repo_id}
from datasets import load_dataset
import pandas as pd

# You may need: huggingface-cli login
ds = load_dataset("{repo_id}", split="train", streaming=True)
data = list(ds.take(1000))
df = pd.DataFrame(data)

{search_block}

print(f"Found {{len(df)}} results")
print(df.head())
'''

    else:
        return f"# Code generation for {access_method} not implemented yet"

# --- 🎣 Data Fetching & Processing Functions ---
# Seconds to wait for the datasets-server before giving up on a page request,
# so an unresponsive server cannot hang the UI indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _gallery_items(records, limit=None, numbered_captions=False):
    """
    Collect ``(image, caption)`` pairs for the gallery from *records*.

    Args:
        records: Iterable of mapping-like rows supporting ``.get`` (dicts or
            pandas Series). The image is read from key ``'image'`` and the
            caption from key ``'text'``.
        limit: Stop after this many pairs; ``None`` means no limit.
        numbered_captions: When True, a record without ``'text'`` is
            captioned ``"Image <1-based position>"`` instead of ``""``.

    Returns:
        A list of pairs. Only images exposing a ``save`` attribute (PIL-like
        objects) or given as strings (path or URL) are kept.
    """
    items = []
    for position, record in enumerate(records, start=1):
        image = record.get('image')
        if not (hasattr(image, 'save') or isinstance(image, str)):
            continue
        fallback_caption = f'Image {position}' if numbered_captions else ''
        items.append((image, record.get('text', fallback_caption)))
        if limit is not None and len(items) >= limit:
            break
    return items


def fetch_data(dataset_key: str, access_method: str, query: str):
    """
    🚀 Main mission control: load the chosen dataset, search it, stream UI updates.

    This is a generator. Every yielded value is a tuple of 9 items matching
    the UI components, in order: results DataFrame, gallery, status Markdown,
    Markdown view, CSV path, XLSX path, tab-delimited text, code snippet and
    debug-log component.

    Args:
        dataset_key: Key into ``DATASET_CONFIG``.
        access_method: Access-method label; dispatched on the substrings
            "API", "Pandas", "Datasets" and "Images".
        query: Search text; empty means "show a sample".

    Yields:
        Progress updates followed by the final results. Any exception is
        caught and reported as a single error update built by
        ``handle_error`` (including the last HTTP request/response, if any).
    """
    outputs = [pd.DataFrame(), None, "🏁 Ready.", "", None, None, "", "", gr.Code(visible=False)]
    req, res = None, None
    try:
        config = DATASET_CONFIG[dataset_key]
        repo_id = config["name"]

        # Generate code snippet
        outputs[7] = generate_code_snippet(dataset_key, access_method, query)

        if "API" in access_method:
            all_results_df = pd.DataFrame()
            page_size = 100
            # Without a query only one page is fetched as a sample.
            max_pages = 5 if query else 1
            headers = {} if config["is_public"] else get_auth_headers()

            if not query:
                outputs[2] = "⏳ No search term. Fetching first 100 records as a sample..."
                yield tuple(outputs)

            for page in range(max_pages):
                if query:
                    outputs[2] = f"⏳ Searching page {page + 1}..."
                    yield tuple(outputs)

                # Forget the previous page's exchange so a connection error
                # on this page is not logged next to a stale response.
                req, res = None, None
                res = requests.get(
                    "https://datasets-server.huggingface.co/rows",
                    params={
                        "dataset": repo_id,
                        "config": "default",
                        "split": "train",
                        "offset": page * page_size,
                        "length": page_size,
                    },
                    headers=headers,
                    timeout=_REQUEST_TIMEOUT_SECONDS,
                )
                req = res.request
                res.raise_for_status()
                data = res.json()

                if not data.get('rows'):
                    outputs[2] = "🏁 No more data to search."
                    yield tuple(outputs)
                    break

                # Extract the actual data from the 'row' key of each item in the list
                rows_data = [item['row'] for item in data['rows']]
                found_in_page = search_dataframe(pd.json_normalize(rows_data), query)
                if found_in_page.empty:
                    continue

                all_results_df = pd.concat([all_results_df, found_in_page]).reset_index(drop=True)
                outputs[0] = all_results_df
                outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
                outputs[2] = f"✅ Found **{len(all_results_df)}** results so far..."

                if dataset_key == 'inscene':
                    try:
                        gallery_data = _gallery_items(row for _, row in all_results_df.iterrows())
                        if gallery_data:
                            outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
                    except Exception as img_error:
                        # Image problems must not abort the search, but they
                        # are reported instead of being silently dropped.
                        outputs[2] += f"\n⚠️ Image display error: {str(img_error)}"
                yield tuple(outputs)

            outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
            yield tuple(outputs)
            return

        outputs[2] = f"⏳ Loading data via `{access_method}`..."
        yield tuple(outputs)

        df = pd.DataFrame()
        load_status = ""

        if "Pandas" in access_method:
            base_path = f"hf://datasets/{repo_id}/"
            if repo_id == "fka/awesome-chatgpt-prompts":
                df = pd.read_csv(base_path + "prompts.csv")
            else:
                # Try the known file layouts in order; if none can be read,
                # the error of the last attempt is raised.
                candidates = (
                    ("data/train-00000-of-00001.parquet", pd.read_parquet),
                    ("train.parquet", pd.read_parquet),
                    ("medical_o1_sft.json", pd.read_json),
                )
                last_error = None
                for file_name, reader in candidates:
                    try:
                        df = reader(base_path + file_name)
                        break
                    except Exception as read_error:
                        last_error = read_error
                else:
                    raise last_error

        elif "Datasets" in access_method:
            if not DATASETS_AVAILABLE:
                raise ImportError("datasets library not available. Install with: pip install datasets")

            # Special handling for image datasets
            if dataset_key == 'inscene' and "Images" in access_method:
                outputs[2] = "🖼️ Loading InScene dataset with image processing..."
                yield tuple(outputs)

                ds = load_dataset(repo_id, split='train', streaming=True)
                data_list = list(ds.take(50))  # Smaller sample for images
                df = pd.DataFrame(data_list)

                # Limit to the first 20 images for performance
                gallery_data = _gallery_items(data_list, limit=20, numbered_captions=True)
                if gallery_data:
                    outputs[1] = gr.Gallery(gallery_data, label=f"🖼️ Found {len(gallery_data)} Images", height=400, columns=4, rows=2)
                    load_status = f"🖼️ Loaded {len(df)} records with {len(gallery_data)} images"
                else:
                    load_status = "🖼️ Loaded data but no images found to display"

            else:
                # Regular datasets loading
                ds = load_dataset(repo_id, split='train', streaming=True)
                data_list = list(ds.take(1000))
                df = pd.DataFrame(data_list)
                load_status = f"📚 Loaded {len(df)} records via Datasets library"

        # Show what was loaded together with the search notice; previously
        # the load message was overwritten before it was ever displayed.
        searching_status = "🔍 Searching loaded data..."
        outputs[2] = f"{load_status}\n\n{searching_status}" if load_status else searching_status
        yield tuple(outputs)

        final_df = search_dataframe(df, query)

        outputs[0] = final_df
        outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
        outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."

        if dataset_key == 'inscene' and not final_df.empty:
            # Rebuild the gallery from the filtered rows so it matches the table.
            try:
                gallery_data = _gallery_items(row for _, row in final_df.iterrows())
                if gallery_data:
                    outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
            except Exception as img_error:
                outputs[2] += f"\n⚠️ Image display error: {str(img_error)}"

        yield tuple(outputs)

    except Exception as e:
        yield handle_error(e, req, res)


# --- 🖼️ UI Generation ---
def create_dataset_tab(dataset_key: str):
    """
    🖼️ Build the tab for one dataset and wire its fetch button to ``fetch_data``.

    Must be called inside an active ``gr.Tabs`` context. The tab contains the
    access-method selector, the query box, the results table, the image
    gallery (visible only for the 'inscene' dataset), the export accordion,
    the code snippet view and the initially hidden debug log.

    Args:
        dataset_key: Key into ``DATASET_CONFIG`` selecting the dataset.
    """
    config = DATASET_CONFIG[dataset_key]
    emoji = config['emoji']
    method_choices = config['methods']
    is_inscene = dataset_key == 'inscene'
    default_method = method_choices[0] if method_choices else "💨 API (requests)"

    with gr.Tab(f"{emoji} {dataset_key.capitalize()}"):
        gr.Markdown(f"## {emoji} Query the `{config['name']}` Dataset")
        if not config['is_public']:
            gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")

        # Tell the user how many access methods this dataset offers
        gallery_hint = " (🖼️ = Image viewer included)" if is_inscene else ""
        gr.Markdown(
            f"**Available methods:** {len(method_choices)} tested and working methods{gallery_hint}"
        )

        with gr.Row():
            method_radio = gr.Radio(
                method_choices,
                label="🔑 Access Method",
                value=default_method,
            )
            query_box = gr.Textbox(
                label="🔍 Search Query",
                placeholder="Enter any text to search, or leave blank for samples...",
            )

        go_button = gr.Button("🚀 Go Fetch!")
        status_md = gr.Markdown("🏁 Ready to search.")
        results_table = gr.DataFrame(label="📊 Results DataFrame", interactive=False, wrap=True)

        # The gallery always exists so the output wiring is uniform; it is
        # only shown for the image dataset.
        results_gallery = gr.Gallery(visible=is_inscene, label="🖼️ Image Results", height=400, columns=4, rows=2)

        with gr.Accordion("📂 View/Export Full Results", open=False):
            markdown_view = gr.Markdown(label="📝 Markdown View")
            with gr.Row():
                csv_file = gr.File(label="⬇️ Download CSV")
                xlsx_file = gr.File(label="⬇️ Download XLSX")
            copy_box = gr.Code(label="📋 Copy-Paste (Tab-Delimited)")

        snippet_view = gr.Code(label="💻 Python Code Snippet", language="python")

        debug_view = gr.Code(label="🐞 Debug Log", visible=False)

        # Order must match the 9-value tuples yielded by fetch_data.
        wired_outputs = [
            results_table, results_gallery, status_md, markdown_view,
            csv_file, xlsx_file, copy_box, snippet_view,
            debug_view,
        ]
        dataset_state = gr.State(dataset_key)
        go_button.click(
            fn=fetch_data,
            inputs=[dataset_state, method_radio, query_box],
            outputs=wired_outputs,
        )

# --- 🚀 Main App ---
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
    gr.Markdown("# 🤗 Hugging Face Dataset Explorer")
    gr.Markdown(
        "Select a dataset, choose an access method, and type a query. "
        "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
    )

    def get_dependency_status():
        """Return Markdown listing each dataset's access methods and the library availability."""
        parts = ["### 🔧 Dataset-Specific Methods (Only Working Methods Shown):\n"]
        for key, config in DATASET_CONFIG.items():
            auth_status = "✅ Public" if config['is_public'] else "🔐 Requires Auth"
            methods_str = ", ".join(config['methods'])
            parts.append(
                f"- **{config['emoji']} {key.capitalize()}**: {methods_str} ({auth_status})\n"
            )

        datasets_state = '✅ Available' if DATASETS_AVAILABLE else '❌ Not installed'
        parts.extend([
            "\n### 📚 Library Dependencies:\n",
            "- **🐼 Pandas**: ✅ Available\n",
            "- **💨 Requests**: ✅ Available\n",
            f"- **🤗 Datasets**: {datasets_state}\n",
        ])
        return "".join(parts)

    # Collapsed help panel: per-dataset methods, library status and usage guide
    with gr.Accordion("🔧 Library Status & Quick Start Guide", open=False):
        gr.Markdown(get_dependency_status())
        gr.Markdown("""
        ### 🚀 Quick Start Guide:
        1. **🤖 Prompts**: Try Pandas or API method, search for "translator", "linux", or "writer"
        2. **⚖️ Caselaw**: Try API method only, search for "contract", "court", or "appeal"
        3. **💰 Finance**: Try Pandas or API method (requires auth), search for "interest" or "market"
        4. **🩺 Medical**: Try Pandas method only (requires auth), search for "diagnosis" or "treatment"
        5. **🖼️ InScene**: Try "🖼️ Datasets with Images" to see actual images, search for "kitchen" or "outdoor"
        
        ### 🔑 Authentication:
        For gated datasets (Finance, Medical, InScene), run: `huggingface-cli login`
        
        ### 🛠️ Method Explanations:
        - **💨 API**: Fast, reliable, works without login (100 rows max)
        - **🐼 Pandas**: Full dataset access, requires login for gated datasets
        - **🤗 Datasets**: Standard HuggingFace datasets library
        - **🖼️ Datasets with Images**: Special image viewer for InScene dataset
        
        ### ⚠️ Note:
        Only working methods are shown for each dataset. Non-functional methods have been removed.
        """)

        if not DATASETS_AVAILABLE:
            gr.Markdown("**⚠️ Install datasets library for image viewing:** `pip install datasets`")

    # One tab per configured dataset, in configuration order
    with gr.Tabs():
        for key in DATASET_CONFIG:
            create_dataset_tab(key)

if __name__ == "__main__":
    demo.launch(debug=True)