AnilNiraula committed commit 846211a (verified), parent: 66fead8

Create finetuned_model.py

Files changed (1):
  1. finetuned_model.py +131 -0
finetuned_model.py ADDED
@@ -0,0 +1,131 @@
import pandas as pd
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch

# Step 1: Set Up Environment
# Ensure libraries are installed: pip install transformers datasets torch accelerate pandas

# Step 2: Load and Preprocess Dataset
# Load CSV from the Spaces dataset directory
csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
try:
    df = pd.read_csv(csv_path)
except Exception as e:
    print(f"Error loading CSV: {e}")
    exit()

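# The filename suggests a datahub.io "flat-ui" export of the S&P 500 dataset;
# the column names used below (Date, SP500, Dividend, Earnings) are an
# assumption based on that layout. A cheap guard makes the assumption explicit:
required_cols = {"Date", "SP500", "Dividend", "Earnings"}
missing = required_cols - set(df.columns)
if missing:
    print(f"CSV is missing expected columns: {missing}")
    exit()
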
# Calculate annual returns if not provided
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date')
# Trailing 12-row % change, i.e. the year-over-year return assuming monthly rows
# (e.g. 100 -> 110 over twelve months gives 10.0)
df['Return'] = df['SP500'].pct_change(12) * 100
# The first twelve months have no trailing window; fill NaN so the
# formatting below does not print "nan"
df['Return'] = df['Return'].fillna(0.0)

# Create question-answer pairs and summaries
qa_pairs = []
for _, row in df.iterrows():
    date = row['Date'].strftime('%Y-%m-%d')
    year = row['Date'].year
    sp500 = row['SP500']
    dividend = row['Dividend']
    earnings = row['Earnings']
    return_val = row.get('Return', 0.0)

    # Question-answer pairs
    qa_pairs.append({
        "question": f"What was the S&P 500 return in {year}?",
        "answer": f"The S&P 500 returned approximately {return_val:.1f}% in {year}."
    })
    qa_pairs.append({
        "question": f"What was the S&P 500 index value on {date}?",
        "answer": f"The S&P 500 closed at approximately {sp500:.2f} on {date}."
    })
    if dividend > 0:
        qa_pairs.append({
            "question": f"What was the S&P 500 dividend in {year}?",
            "answer": f"The S&P 500 dividend was approximately {dividend:.2f} in {year}."
        })
    if earnings > 0:
        qa_pairs.append({
            "question": f"What were the S&P 500 earnings in {year}?",
            "answer": f"The S&P 500 earnings were approximately {earnings:.2f} in {year}."
        })

    # Summaries
    qa_pairs.append({
        "summary": f"The S&P 500 closed at {sp500:.2f} on {date}, with a {return_val:.1f}% annual return."
    })

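# For illustration, a single monthly row produces records shaped like
# (values are hypothetical, not taken from the actual CSV):
#   {"question": "What was the S&P 500 index value on 1950-01-01?",
#    "answer": "The S&P 500 closed at approximately 16.88 on 1950-01-01."}
#   {"summary": "The S&P 500 closed at 16.88 on 1950-01-01, with a 0.0% annual return."}
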
# Add a general S&P 500 growth rate question
qa_pairs.append({
    "question": "What is the S&P 500 index fund average growth rate?",
    "answer": "The S&P 500 index fund’s average annual return is approximately 10–12% over the long term (1927–2025), including dividends, based on historical data."
})

# Save to JSON for dataset loading
with open("financial_data.json", "w") as f:
    json.dump(qa_pairs, f, indent=2)

# Load dataset and carve out an 80/10/10 train/validation/test split
dataset = Dataset.from_json("financial_data.json")
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
holdout = dataset["test"].train_test_split(test_size=0.5, seed=42)
val_dataset = holdout["train"]
test_dataset = holdout["test"]

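# Optional sanity check on the split sizes before committing to training:
print(f"Train/val/test examples: {len(train_dataset)}/{len(val_dataset)}/{len(test_dataset)}")
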
# Step 3: Tokenize Data
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2-family models ship without a pad token; reuse EOS

def tokenize_function(examples):
    # Records are either QA pairs or summaries; Dataset.from_json fills the
    # missing fields with None, so build the text per example, not per column.
    inputs = []
    for q, a, s in zip(examples["question"], examples["answer"], examples["summary"]):
        if q is not None and a is not None:
            inputs.append(q + " A: " + a)
        else:
            inputs.append(s)
    tokens = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
    # Causal-LM fine-tuning needs labels for the Trainer to compute a loss:
    # copy the input ids and mask padding with -100 so it is ignored.
    tokens["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

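# Optional: decode one tokenized example to confirm the "question A: answer"
# formatting survived preprocessing (assumes the train split is non-empty).
print(tokenizer.decode(tokenized_train[0]["input_ids"], skip_special_tokens=True)[:120])
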
# Step 4: Load Pre-trained Model
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Step 5: Set Up Fine-Tuning
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    evaluation_strategy="epoch",  # renamed to eval_strategy in newer transformers releases
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Step 6: Fine-Tune the Model
trainer.train()

# Step 7: Evaluate the Model
eval_results = trainer.evaluate(tokenized_test)
print("Evaluation results:", eval_results)

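# Cross-entropy eval loss maps to perplexity via exp(); a rough quality
# signal for a causal LM (lower is better). Minimal sketch, assuming the
# standard "eval_loss" key is present in the results dict.
import math
print("Perplexity:", math.exp(eval_results["eval_loss"]))
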
# Step 8: Save the Fine-Tuned Model
trainer.save_model("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

# Test the model
model.eval()
input_text = "What was the S&P 500 return in 2020?"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)  # keep tensors on the model's device after training
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
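
# Minimal reload check, assuming the ./finetuned_model directory written above:
# confirms the saved artifacts can be loaded back for inference elsewhere.
reloaded_tokenizer = AutoTokenizer.from_pretrained("./finetuned_model")
reloaded_model = AutoModelForCausalLM.from_pretrained("./finetuned_model")
print("Reload OK:", reloaded_model.config.model_type)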