"""Generate a natural-language EDA report from column statistics via a local Gemma model."""

import json
import os

import torch  # noqa: F401 — imported by the original file; kept in case callers rely on it
from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: F401 — unused here, kept for compatibility
from transformers import AutoProcessor, AutoModelForImageTextToText

hf_token = os.environ.get("HUGGINGFACE_TOKEN")  # may be None; from_pretrained accepts that
model_id = "google/gemma-3n-E4B"
cache_dir = "/tmp/hf_cache"

# NOTE: despite the name, this is an AutoProcessor, not a tokenizer. The name is
# kept because other modules may import it as `tokenizer`.
tokenizer = AutoProcessor.from_pretrained(model_id, token=hf_token, cache_dir=cache_dir)
model = AutoModelForImageTextToText.from_pretrained(model_id, token=hf_token, cache_dir=cache_dir)


def call_llm(prompt):
    """Run ``prompt`` through the local model and return the decoded text.

    The prompt is truncated to the processor's maximum length, and the
    decoded output includes the prompt itself (the full sequence is decoded,
    not just the newly generated tokens).
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, max_new_tokens=1024)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


insight_prompt = """
You are a senior data analyst. You are given a dataset summary and column statistics after cleaning.
Please perform the following:
1. Describe the structure of the data in natural language.
2. Mention any interesting patterns or distributions (e.g. most common values, ranges, anomalies).
3. Derive any basic insights you can (e.g. relationships between columns, high-cardinality features, outliers).
4. Point out anything surprising or worth further investigation.
Be specific. Don't explain generic EDA steps — interpret the data as if you're preparing a short report.
Column Summary:
{column_data}
"""


def generate_insights(column_data):
    """Format ``column_data`` (any JSON-serializable object) into the insight
    prompt and return the model's analysis as a string.
    """
    prompt = insight_prompt.format(column_data=json.dumps(column_data, indent=2))
    return call_llm(prompt)