Osnly committed on
Commit 49436c8 · verified · 1 Parent(s): 6a98df4

Update src/insight.py

Files changed (1)
  1. src/insight.py +37 -34
src/insight.py CHANGED
@@ -1,34 +1,37 @@
- # insight.py
- from transformers import AutoModelForCausalLM, AutoTokenizer
- import torch
- import json
-
- # Load Gemma model from Hugging Face
- model_id = "google/gemma-3n-E4B-it"
- tokenizer = AutoTokenizer.from_pretrained(model_id)
- model = AutoModelForCausalLM.from_pretrained(model_id)
-
- def call_llm(prompt):
-     inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
-     outputs = model.generate(**inputs, max_new_tokens=1024)
-     return tokenizer.decode(outputs[0], skip_special_tokens=True)
-
- insight_prompt = """
- You are a senior data analyst. You are given a dataset summary and column statistics after cleaning.
-
- Please perform the following:
- 1. Describe the structure of the data in natural language.
- 2. Mention any interesting patterns or distributions (e.g. most common values, ranges, anomalies).
- 3. Derive any basic insights you can (e.g. relationships between columns, high-cardinality features, outliers).
- 4. Point out anything surprising or worth further investigation.
-
- Be specific. Don't explain generic EDA steps interpret the data as if you're preparing a short report.
-
- Column Summary:
- {column_data}
- """
-
- def generate_insights(column_data):
-     prompt = insight_prompt.format(column_data=json.dumps(column_data, indent=2))
-     return call_llm(prompt)
-
+ # insight.py
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+ import json
+ import os
+
+
+ model_id = "google/gemma-3n-E4B-it"
+ hf_token = os.environ.get("HUGGINGFACE_TOKEN")
+ tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
+ model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token)
+
+
+ def call_llm(prompt):
+     inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
+     outputs = model.generate(**inputs, max_new_tokens=1024)
+     return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+ insight_prompt = """
+ You are a senior data analyst. You are given a dataset summary and column statistics after cleaning.
+
+ Please perform the following:
+ 1. Describe the structure of the data in natural language.
+ 2. Mention any interesting patterns or distributions (e.g. most common values, ranges, anomalies).
+ 3. Derive any basic insights you can (e.g. relationships between columns, high-cardinality features, outliers).
+ 4. Point out anything surprising or worth further investigation.
+
+ Be specific. Don't explain generic EDA steps — interpret the data as if you're preparing a short report.
+
+ Column Summary:
+ {column_data}
+ """
+
+ def generate_insights(column_data):
+     prompt = insight_prompt.format(column_data=json.dumps(column_data, indent=2))
+     return call_llm(prompt)
+
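The substantive change in this commit is that the Hugging Face token is now read from the HUGGINGFACE_TOKEN environment variable and passed to both from_pretrained calls; if the variable is unset, hf_token is None and from_pretrained falls back to its default credential resolution. For reference, here is a minimal usage sketch of the updated module. It assumes HUGGINGFACE_TOKEN is exported before import and that per-column statistics are already available from the cleaning step; the column summary shown is hypothetical and only illustrates the expected shape of the input.

# usage_example.py — illustrative sketch, not part of this commit
# Assumes HUGGINGFACE_TOKEN is set in the environment so the gated
# Gemma weights can be downloaded when insight.py is imported.
from insight import generate_insights

# Hypothetical column summary; in practice this comes from the cleaning pipeline.
column_data = {
    "age": {"dtype": "int64", "min": 18, "max": 92, "missing": 0},
    "country": {"dtype": "object", "unique": 42, "top": "US", "missing": 3},
}

report = generate_insights(column_data)
print(report)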