from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import json
import re
import os

# Hugging Face auth token and a writable cache directory (e.g. /tmp on a Space).
hf_token = os.environ.get("HUGGINGFACE_TOKEN")

model_id = "google/gemma-3n-E4B"
cache_dir = "/tmp/hf_cache"

# Gemma 3n is multimodal, so it ships with a processor rather than a plain
# tokenizer; for text-only prompts it behaves like one.
tokenizer = AutoProcessor.from_pretrained(model_id, token=hf_token, cache_dir=cache_dir)
model = AutoModelForImageTextToText.from_pretrained(model_id, token=hf_token, cache_dir=cache_dir)
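
# Optional addition (not in the original Space): move the model to a GPU when
# one is available; generating up to 2048 tokens on CPU is extremely slow.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)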

def call_llm(prompt):
    inputs = tokenizer(text=prompt, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=2048)
    # Decode only the newly generated tokens; the echoed prompt contains a JSON
    # skeleton that would otherwise be matched by the regex in generate_cleaning_plan.
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)
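
# Quick sanity check (hypothetical prompt, commented out; handy when first
# deploying to confirm the model responds before running the full pipeline):
# print(call_llm("Reply with the single word: ready"))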

prompt_template = """
You are a highly skilled data cleaning agent. Your job is to deeply understand the structure and meaning of each column in a dataset based on its summary statistics and example values, and generate a detailed, justified cleaning plan.

The dataset may contain:
- numeric columns (age, price, income, etc.)
- categorical columns (gender, country, status, etc.)
- identifiers (id, uuid, etc.)
- text fields (comments, descriptions, etc.)
- dates or timestamps
- unexpected or noisy values
- missing data
- inconsistent formatting

Your goal is to:
1. Identify what each column most likely represents.
2. Decide if it should be cleaned, imputed, dropped, mapped, scaled, or standardized.
3. Choose appropriate cleaning methods (e.g., impute with median, map inconsistent values, detect and fill outliers).
4. Add reasoning for each step to explain **why** you made that decision.
5. At the end, summarize what you did overall in plain language for a human to understand.

Output JSON with this exact format:
Note: The key must be spelled "column" exactly. Do not use "colum" or any other variant.
{
  "plan": [
    {
      "column": "col_name",
      "action": "impute" | "drop" | "standardize" | "normalize" | "scale" | "clip_outliers" | "fill_outliers" | "convert_dtype" | "map_values" | "strip_whitespace" | "remove_duplicates",
      "method": "mean" | "median" | "mode" | "minmax" | "zscore" | "constant" | "int" | "float" | "datetime" | null,
      "params": { optional dictionary of extra parameters },
      "reason": "Detailed and logical explanation of why this cleaning step is needed."
    },
    ...
  ],
  "explanation": "A clear, human-friendly summary of the full cleaning plan."
}

Think carefully. Only propose changes that are statistically or logically justified. Be rigorous but practical.

Column Analysis:
{column_data}
"""

def generate_cleaning_plan(analysis_dict):
    column_data = json.dumps(analysis_dict["columns"], indent=2)
    # str.replace instead of str.format: the template contains literal braces
    # in its JSON example, which str.format would try to interpret as fields.
    prompt = prompt_template.replace("{column_data}", column_data)
    result = call_llm(prompt)
    # Grab the outermost JSON object from the model's reply.
    match = re.search(r"\{.*\}", result, re.DOTALL)
    if match:
        try:
            parsed = json.loads(match.group(0))
            return parsed.get("plan", []), parsed.get("explanation", "")
        except json.JSONDecodeError:
            print("Failed to parse JSON. Raw output:\n", result)
            return [], ""
    else:
        print("No valid JSON object found. Raw output:\n", result)
        return [], ""