ivxxdegen committed
Commit b6e95ca · 1 Parent(s): 7e188c1
Files changed (1)
app.py +12 -11
app.py CHANGED
@@ -54,7 +54,11 @@ if not os.path.exists(dataset_path):
 # --- Load the dataset using pandas ---
 print("📥 Loading dataset using pandas...")
 df = pd.read_json(dataset_path, lines=True)
-# (Do not convert to string so that the dicts remain intact)
+# Create new columns by extracting text from the nested JSON objects
+df["tweet_text"] = df["tweet"].apply(lambda x: x.get("content", "") if isinstance(x, dict) else str(x))
+df["lore_text"] = df["lore"].apply(lambda x: x.get("response", "") if isinstance(x, dict) else str(x))
+# Optionally, drop the original columns if desired:
+# df = df.drop(columns=["tweet", "lore"])
 dataset = Dataset.from_pandas(df)
 print("Dataset columns:", dataset.column_names)
 
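Side note on why the flattening helps: `Dataset.from_pandas` converts plain string columns cleanly, while object columns holding dicts can be brittle. The new step can be exercised on its own; a minimal sketch with made-up rows (the real schema is only known from the `content`/`response` keys used above):

```python
import pandas as pd

# Made-up rows shaped like the JSONL records (hypothetical values):
# each row nests its text under a "content" / "response" key.
df = pd.DataFrame({
    "tweet": [{"content": "gm, degens"}, "already a plain string"],
    "lore":  [{"response": "the lore deepens"}, {"response": "chapter two"}],
})

# Same flattening as in the commit: unwrap the dicts, fall back to str()
# for anything that is not a dict.
df["tweet_text"] = df["tweet"].apply(lambda x: x.get("content", "") if isinstance(x, dict) else str(x))
df["lore_text"] = df["lore"].apply(lambda x: x.get("response", "") if isinstance(x, dict) else str(x))

print(df[["tweet_text", "lore_text"]])  # two flat string columns
```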
@@ -80,7 +84,7 @@ torch.cuda.empty_cache()
 model.gradient_checkpointing_enable()
 
 # --- Integrate PEFT (LoRA) ---
-# Based on your inspection, we now target "qkv_proj". Adjust if needed.
+# Based on your inspection, we target "qkv_proj" (update if needed)
 lora_config = LoraConfig(
     r=16,
     lora_alpha=32,
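The "inspection" the comment refers to is worth making concrete: whether the right `target_modules` entry is a fused `qkv_proj` (as in Phi-3-style checkpoints) or separate `q_proj`/`k_proj`/`v_proj` (Llama-style) depends on the base model. A sketch of how one might list the candidates; `model` is the base model loaded earlier in app.py, which this diff does not show:

```python
import torch.nn as nn

def linear_layer_names(model):
    """Collect the distinct leaf names of all nn.Linear submodules,
    i.e. the strings that can go into LoraConfig(target_modules=...)."""
    return sorted({name.rsplit(".", 1)[-1]
                   for name, module in model.named_modules()
                   if isinstance(module, nn.Linear)})

# print(linear_layer_names(model))
# e.g. ['down_proj', 'gate_up_proj', 'lm_head', 'o_proj', 'qkv_proj']
# on a Phi-3-style model
```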
@@ -94,14 +98,11 @@ model.print_trainable_parameters()
 # --- Preprocess the dataset ---
 def preprocess_function(examples):
     combined_texts = []
-    # For each example, extract the tweet content and lore response
-    tweets = examples.get("tweet", [])
-    lores = examples.get("lore", [])
-    for tweet_obj, lore_obj in zip(tweets, lores):
-        # Extract "content" from tweet, and "response" from lore
-        tweet_text = tweet_obj.get("content", "") if isinstance(tweet_obj, dict) else str(tweet_obj)
-        lore_text = lore_obj.get("response", "") if isinstance(lore_obj, dict) else str(lore_obj)
-        combined_text = "[PERSONALITY] " + tweet_text + "\n[KNOWLEDGE] " + lore_text
+    # Use the new flattened columns "tweet_text" and "lore_text"
+    tweets = examples.get("tweet_text", [])
+    lores = examples.get("lore_text", [])
+    for tweet, lore in zip(tweets, lores):
+        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
         combined_texts.append(combined_text)
     return tokenizer(combined_texts, truncation=True, padding=True)
 
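For completeness: this tokenized output feeds the `map` calls visible in the next hunk's context line, `tokenized_eval = eval_dataset.map(add_labels, batched=True)`. The diff never shows `add_labels` itself; a plausible sketch, assuming the usual causal-LM convention of labels mirroring `input_ids`:

```python
# Assumed implementation, not shown in this diff. For causal-LM fine-tuning
# the labels are typically a copy of the input ids; the trainer shifts them
# internally when computing the loss.
def add_labels(examples):
    # Batched map passes a list of token-id lists; copy each one.
    examples["labels"] = [ids.copy() for ids in examples["input_ids"]]
    return examples

# Wiring, matching the map calls referenced in the diff context:
# tokenized_train = train_dataset.map(preprocess_function, batched=True)
# tokenized_train = tokenized_train.map(add_labels, batched=True)
```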
@@ -122,7 +123,7 @@ tokenized_eval = eval_dataset.map(add_labels, batched=True)
 # --- Set training arguments ---
 training_args = TrainingArguments(
     output_dir=output_dir,
-    evaluation_strategy="epoch",  # (Deprecated: use eval_strategy in future versions)
+    evaluation_strategy="epoch",
     logging_dir="./logs",
     logging_steps=500,
     num_train_epochs=3,
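The dropped deprecation note is still worth acting on: newer transformers releases rename this argument to `eval_strategy`. One way to stay compatible with both names, sketched as an assumption about how one might handle it rather than anything in this commit:

```python
import inspect
from transformers import TrainingArguments

# Pick whichever keyword this transformers version accepts
# ("eval_strategy" replaced "evaluation_strategy" in newer releases).
params = inspect.signature(TrainingArguments.__init__).parameters
eval_kwarg = "eval_strategy" if "eval_strategy" in params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="./results",   # hypothetical; app.py uses its own output_dir
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    **{eval_kwarg: "epoch"},
)
```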
 