ivxxdegen commited on
Commit
7e188c1
·
1 Parent(s): 57bccaa
Files changed (1) hide show
  1. app.py +11 -9
app.py CHANGED
@@ -54,9 +54,7 @@ if not os.path.exists(dataset_path):
54
  # --- Load the dataset using pandas ---
55
  print("📥 Loading dataset using pandas...")
56
  df = pd.read_json(dataset_path, lines=True)
57
- # Ensure consistency: convert columns to string to avoid mixed types
58
- df["lore"] = df["lore"].astype(str)
59
- df["tweet"] = df["tweet"].astype(str)
60
  dataset = Dataset.from_pandas(df)
61
  print("Dataset columns:", dataset.column_names)
62
 
@@ -82,11 +80,11 @@ torch.cuda.empty_cache()
82
  model.gradient_checkpointing_enable()
83
 
84
  # --- Integrate PEFT (LoRA) ---
85
- # Update target_modules based on your model's inspection; here we use "qkv_proj"
86
  lora_config = LoraConfig(
87
  r=16,
88
  lora_alpha=32,
89
- target_modules=["qkv_proj"], # Adjust as needed after inspecting model modules
90
  lora_dropout=0.1,
91
  bias="none"
92
  )
@@ -95,11 +93,15 @@ model.print_trainable_parameters()
95
 
96
  # --- Preprocess the dataset ---
97
  def preprocess_function(examples):
 
 
98
  tweets = examples.get("tweet", [])
99
  lores = examples.get("lore", [])
100
- combined_texts = []
101
- for tweet, lore in zip(tweets, lores):
102
- combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
 
 
103
  combined_texts.append(combined_text)
104
  return tokenizer(combined_texts, truncation=True, padding=True)
105
 
@@ -120,7 +122,7 @@ tokenized_eval = eval_dataset.map(add_labels, batched=True)
120
  # --- Set training arguments ---
121
  training_args = TrainingArguments(
122
  output_dir=output_dir,
123
- evaluation_strategy="epoch",
124
  logging_dir="./logs",
125
  logging_steps=500,
126
  num_train_epochs=3,
 
54
  # --- Load the dataset using pandas ---
55
  print("📥 Loading dataset using pandas...")
56
  df = pd.read_json(dataset_path, lines=True)
57
+ # (Do not convert to string so that the dicts remain intact)
 
 
58
  dataset = Dataset.from_pandas(df)
59
  print("Dataset columns:", dataset.column_names)
60
 
 
80
  model.gradient_checkpointing_enable()
81
 
82
  # --- Integrate PEFT (LoRA) ---
83
+ # Based on your inspection, we now target "qkv_proj". Adjust if needed.
84
  lora_config = LoraConfig(
85
  r=16,
86
  lora_alpha=32,
87
+ target_modules=["qkv_proj"],
88
  lora_dropout=0.1,
89
  bias="none"
90
  )
 
93
 
94
  # --- Preprocess the dataset ---
95
def preprocess_function(examples):
    """Tokenize batched examples as "[PERSONALITY] <tweet>\n[KNOWLEDGE] <lore>" strings.

    Each entry in the "tweet"/"lore" columns may be a dict (in which case its
    "content" / "response" field is used) or any other value (stringified
    as-is). Returns the tokenizer's encoding dict for the combined texts.
    """

    def _field(obj, key):
        # Dicts contribute the named field (empty string if absent);
        # anything else falls back to plain str() conversion.
        return obj.get(key, "") if isinstance(obj, dict) else str(obj)

    pairs = zip(examples.get("tweet", []), examples.get("lore", []))
    combined_texts = [
        "[PERSONALITY] " + _field(tw, "content") + "\n[KNOWLEDGE] " + _field(lo, "response")
        for tw, lo in pairs
    ]
    # `tokenizer` is the module-level tokenizer loaded earlier in app.py.
    return tokenizer(combined_texts, truncation=True, padding=True)
107
 
 
122
  # --- Set training arguments ---
123
  training_args = TrainingArguments(
124
  output_dir=output_dir,
125
+ evaluation_strategy="epoch", # (Deprecated: use eval_strategy in future versions)
126
  logging_dir="./logs",
127
  logging_steps=500,
128
  num_train_epochs=3,