"""Generate a natural-language EDA report from column statistics via a local Gemma model."""

import json
import os

import torch  # noqa: F401 — imported by the original file; kept in case callers rely on it
from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: F401 — unused here, kept for compatibility
from transformers import AutoProcessor, AutoModelForImageTextToText

hf_token = os.environ.get("HUGGINGFACE_TOKEN")  # may be None; from_pretrained accepts that
model_id = "google/gemma-3n-E4B"
cache_dir = "/tmp/hf_cache"

# NOTE: despite the name, this is an AutoProcessor, not a tokenizer. The name is
# kept because other modules may import it as `tokenizer`.
tokenizer = AutoProcessor.from_pretrained(model_id, token=hf_token, cache_dir=cache_dir)
model = AutoModelForImageTextToText.from_pretrained(model_id, token=hf_token, cache_dir=cache_dir)


def call_llm(prompt):
    """Run ``prompt`` through the local model and return the decoded text.

    The prompt is truncated to the processor's maximum length, and the
    decoded output includes the prompt itself (the full sequence is decoded,
    not just the newly generated tokens).
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, max_new_tokens=1024)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


insight_prompt = """
You are a senior data analyst. You are given a dataset summary and column statistics after cleaning.
Please perform the following:
1. Describe the structure of the data in natural language.
2. Mention any interesting patterns or distributions (e.g. most common values, ranges, anomalies).
3. Derive any basic insights you can (e.g. relationships between columns, high-cardinality features, outliers).
4. Point out anything surprising or worth further investigation.
Be specific. Don't explain generic EDA steps — interpret the data as if you're preparing a short report.
Column Summary:
{column_data}
"""


def generate_insights(column_data):
    """Format ``column_data`` (any JSON-serializable object) into the insight
    prompt and return the model's analysis as a string.
    """
    prompt = insight_prompt.format(column_data=json.dumps(column_data, indent=2))
    return call_llm(prompt)