from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import json
import re
import os

# Hugging Face auth token and a writable cache directory (e.g. /tmp on a Space).
hf_token = os.environ.get("HUGGINGFACE_TOKEN")

model_id = "google/gemma-3n-E4B"
cache_dir = "/tmp/hf_cache"

# Gemma 3n is multimodal, so it ships with a processor rather than a plain
# tokenizer; for text-only prompts it behaves like one.
tokenizer = AutoProcessor.from_pretrained(model_id, token=hf_token, cache_dir=cache_dir)
model = AutoModelForImageTextToText.from_pretrained(model_id, token=hf_token, cache_dir=cache_dir)
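
# Optional addition (not in the original Space): move the model to a GPU when
# one is available; generating up to 2048 tokens on CPU is extremely slow.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)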

def call_llm(prompt):
    inputs = tokenizer(text=prompt, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=2048)
    # Decode only the newly generated tokens; the echoed prompt contains a JSON
    # skeleton that would otherwise be matched by the regex in generate_cleaning_plan.
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)
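
# Quick sanity check (hypothetical prompt, commented out; handy when first
# deploying to confirm the model responds before running the full pipeline):
# print(call_llm("Reply with the single word: ready"))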

prompt_template = """
You are a highly skilled data cleaning agent. Your job is to deeply understand the structure and meaning of each column in a dataset based on its summary statistics and example values, and generate a detailed, justified cleaning plan.

The dataset may contain:
- numeric columns (age, price, income, etc.)
- categorical columns (gender, country, status, etc.)
- identifiers (id, uuid, etc.)
- text fields (comments, descriptions, etc.)
- dates or timestamps
- unexpected or noisy values
- missing data
- inconsistent formatting

Your goal is to:
1. Identify what each column most likely represents.
2. Decide if it should be cleaned, imputed, dropped, mapped, scaled, or standardized.
3. Choose appropriate cleaning methods (e.g., impute with median, map inconsistent values, detect and fill outliers).
4. Add reasoning for each step to explain **why** you made that decision.
5. At the end, summarize what you did overall in plain language for a human to understand.

Output JSON with this exact format:
Note: The key must be spelled "column" exactly. Do not use "colum" or any other variant.
{
  "plan": [
    {
      "column": "col_name",
      "action": "impute" | "drop" | "standardize" | "normalize" | "scale" | "clip_outliers" | "fill_outliers" | "convert_dtype" | "map_values" | "strip_whitespace" | "remove_duplicates",
      "method": "mean" | "median" | "mode" | "minmax" | "zscore" | "constant" | "int" | "float" | "datetime" | null,
      "params": { optional dictionary of extra parameters },
      "reason": "Detailed and logical explanation of why this cleaning step is needed."
    },
    ...
  ],
  "explanation": "A clear, human-friendly summary of the full cleaning plan."
}

Think carefully. Only propose changes that are statistically or logically justified. Be rigorous but practical.

Column Analysis:
{column_data}
"""

def generate_cleaning_plan(analysis_dict):
    column_data = json.dumps(analysis_dict["columns"], indent=2)
    # str.replace instead of str.format: the template contains literal braces
    # in its JSON example, which str.format would try to interpret as fields.
    prompt = prompt_template.replace("{column_data}", column_data)
    result = call_llm(prompt)
    # Grab the outermost JSON object from the model's reply.
    match = re.search(r"\{.*\}", result, re.DOTALL)
    if match:
        try:
            parsed = json.loads(match.group(0))
            return parsed.get("plan", []), parsed.get("explanation", "")
        except json.JSONDecodeError:
            print("Failed to parse JSON. Raw output:\n", result)
            return [], ""
    else:
        print("No valid JSON object found. Raw output:\n", result)
        return [], ""