ivxxdegen commited on
Commit
7e188c1
·
1 Parent(s): 57bccaa
Files changed (1) hide show
  1. app.py +11 -9
app.py CHANGED
@@ -54,9 +54,7 @@ if not os.path.exists(dataset_path):
54
  # --- Load the dataset using pandas ---
55
  print("📥 Loading dataset using pandas...")
56
  df = pd.read_json(dataset_path, lines=True)
57
- # Ensure consistency: convert columns to string to avoid mixed types
58
- df["lore"] = df["lore"].astype(str)
59
- df["tweet"] = df["tweet"].astype(str)
60
  dataset = Dataset.from_pandas(df)
61
  print("Dataset columns:", dataset.column_names)
62
 
@@ -82,11 +80,11 @@ torch.cuda.empty_cache()
82
  model.gradient_checkpointing_enable()
83
 
84
  # --- Integrate PEFT (LoRA) ---
85
- # Update target_modules based on your model's inspection; here we use "qkv_proj"
86
  lora_config = LoraConfig(
87
  r=16,
88
  lora_alpha=32,
89
- target_modules=["qkv_proj"], # Adjust as needed after inspecting model modules
90
  lora_dropout=0.1,
91
  bias="none"
92
  )
@@ -95,11 +93,15 @@ model.print_trainable_parameters()
95
 
96
  # --- Preprocess the dataset ---
97
  def preprocess_function(examples):
 
 
98
  tweets = examples.get("tweet", [])
99
  lores = examples.get("lore", [])
100
- combined_texts = []
101
- for tweet, lore in zip(tweets, lores):
102
- combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
 
 
103
  combined_texts.append(combined_text)
104
  return tokenizer(combined_texts, truncation=True, padding=True)
105
 
@@ -120,7 +122,7 @@ tokenized_eval = eval_dataset.map(add_labels, batched=True)
120
  # --- Set training arguments ---
121
  training_args = TrainingArguments(
122
  output_dir=output_dir,
123
- evaluation_strategy="epoch",
124
  logging_dir="./logs",
125
  logging_steps=500,
126
  num_train_epochs=3,
 
54
  # --- Load the dataset using pandas ---
55
  print("📥 Loading dataset using pandas...")
56
  df = pd.read_json(dataset_path, lines=True)
57
+ # (Do not convert to string so that the dicts remain intact)
 
 
58
  dataset = Dataset.from_pandas(df)
59
  print("Dataset columns:", dataset.column_names)
60
 
 
80
  model.gradient_checkpointing_enable()
81
 
82
  # --- Integrate PEFT (LoRA) ---
83
+ # Based on your inspection, we now target "qkv_proj". Adjust if needed.
84
  lora_config = LoraConfig(
85
  r=16,
86
  lora_alpha=32,
87
+ target_modules=["qkv_proj"],
88
  lora_dropout=0.1,
89
  bias="none"
90
  )
 
93
 
94
  # --- Preprocess the dataset ---
95
def preprocess_function(examples):
    """Tokenize batched examples as "[PERSONALITY] <tweet>\n[KNOWLEDGE] <lore>" strings.

    Each entry in the "tweet"/"lore" columns may be a dict (in which case its
    "content" / "response" field is used) or any other value (stringified
    as-is). Returns the tokenizer's encoding dict for the combined texts.
    """

    def _field(obj, key):
        # Dicts contribute the named field (empty string if absent);
        # anything else falls back to plain str() conversion.
        return obj.get(key, "") if isinstance(obj, dict) else str(obj)

    pairs = zip(examples.get("tweet", []), examples.get("lore", []))
    combined_texts = [
        "[PERSONALITY] " + _field(tw, "content") + "\n[KNOWLEDGE] " + _field(lo, "response")
        for tw, lo in pairs
    ]
    # `tokenizer` is the module-level tokenizer loaded earlier in app.py.
    return tokenizer(combined_texts, truncation=True, padding=True)
107
 
 
122
  # --- Set training arguments ---
123
  training_args = TrainingArguments(
124
  output_dir=output_dir,
125
+ evaluation_strategy="epoch", # (Deprecated: use eval_strategy in future versions)
126
  logging_dir="./logs",
127
  logging_steps=500,
128
  num_train_epochs=3,