Spaces:

ivxxdegen
/

new_mibera_train

Runtime error

App Files Community

ivxxdegen committed on Feb 3

Commit

4ba1c51

1 Parent(s): b6e95ca

upgrade

Browse files

Files changed (1) hide show

app.py +13 -8

app.py CHANGED Viewed

@@ -54,13 +54,18 @@ if not os.path.exists(dataset_path):
 # --- Load the dataset using pandas ---
 print("📥 Loading dataset using pandas...")
 df = pd.read_json(dataset_path, lines=True)
-# Create new columns by extracting text from the nested JSON objects
 df["tweet_text"] = df["tweet"].apply(lambda x: x.get("content", "") if isinstance(x, dict) else str(x))
 df["lore_text"] = df["lore"].apply(lambda x: x.get("response", "") if isinstance(x, dict) else str(x))
-# Optionally, drop the original columns if desired:
-# df = df.drop(columns=["tweet", "lore"])
 dataset = Dataset.from_pandas(df)
 print("Dataset columns:", dataset.column_names)
 # --- Split the dataset into train and evaluation subsets ---
 split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
@@ -76,7 +81,7 @@ model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True,
     device_map="auto",
     max_memory=max_memory,
-    offload_folder=offload_folder,
     low_cpu_mem_usage=True,
     offload_state_dict=True
 )
@@ -84,7 +89,7 @@ torch.cuda.empty_cache()
 model.gradient_checkpointing_enable()
 # --- Integrate PEFT (LoRA) ---
-# Based on your inspection, we target "qkv_proj" (update if needed)
 lora_config = LoraConfig(
     r=16,
     lora_alpha=32,
@@ -98,7 +103,7 @@ model.print_trainable_parameters()
 # --- Preprocess the dataset ---
 def preprocess_function(examples):
     combined_texts = []
-    # Use the new flattened columns "tweet_text" and "lore_text"
     tweets = examples.get("tweet_text", [])
     lores = examples.get("lore_text", [])
     for tweet, lore in zip(tweets, lores):
@@ -118,12 +123,12 @@ def add_labels(batch):
 print("🛠 Adding labels to train dataset...")
 tokenized_train = tokenized_train.map(add_labels, batched=True)
 print("🛠 Adding labels to eval dataset...")
-tokenized_eval = eval_dataset.map(add_labels, batched=True)
 # --- Set training arguments ---
 training_args = TrainingArguments(
     output_dir=output_dir,
-    evaluation_strategy="epoch",
     logging_dir="./logs",
     logging_steps=500,
     num_train_epochs=3,

 # --- Load the dataset using pandas ---
 print("📥 Loading dataset using pandas...")
 df = pd.read_json(dataset_path, lines=True)
+# Flatten nested JSON columns: extract "content" from tweet and "response" from lore.
 df["tweet_text"] = df["tweet"].apply(lambda x: x.get("content", "") if isinstance(x, dict) else str(x))
 df["lore_text"] = df["lore"].apply(lambda x: x.get("response", "") if isinstance(x, dict) else str(x))
+# Optionally drop the original nested columns:
+df = df.drop(columns=["tweet", "lore"])
+# Now convert the flattened DataFrame into a Hugging Face Dataset.
 dataset = Dataset.from_pandas(df)
 print("Dataset columns:", dataset.column_names)
+# Expected columns are now: ['tweet_text', 'lore_text'] plus any others
 # --- Split the dataset into train and evaluation subsets ---
 split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
     trust_remote_code=True,
     device_map="auto",
     max_memory=max_memory,
+    offload_folder="./offload",
     low_cpu_mem_usage=True,
     offload_state_dict=True
 )
 model.gradient_checkpointing_enable()
 # --- Integrate PEFT (LoRA) ---
+# Based on your inspection, we target "qkv_proj". Update if necessary.
 lora_config = LoraConfig(
     r=16,
     lora_alpha=32,
 # --- Preprocess the dataset ---
 def preprocess_function(examples):
     combined_texts = []
+    # Use the new flattened columns: "tweet_text" and "lore_text"
     tweets = examples.get("tweet_text", [])
     lores = examples.get("lore_text", [])
     for tweet, lore in zip(tweets, lores):
 print("🛠 Adding labels to train dataset...")
 tokenized_train = tokenized_train.map(add_labels, batched=True)
 print("🛠 Adding labels to eval dataset...")
+tokenized_eval = tokenized_eval.map(add_labels, batched=True)
 # --- Set training arguments ---
 training_args = TrainingArguments(
     output_dir=output_dir,
+    evaluation_strategy="epoch",  # (Deprecated: use eval_strategy in future versions)
     logging_dir="./logs",
     logging_steps=500,
     num_train_epochs=3,