ivxxdegen committed
Commit b6e95ca · 1 Parent(s): 7e188c1
Files changed (1)
app.py +12 -11
app.py CHANGED
@@ -54,7 +54,11 @@ if not os.path.exists(dataset_path):
 # --- Load the dataset using pandas ---
 print("📥 Loading dataset using pandas...")
 df = pd.read_json(dataset_path, lines=True)
-# (Do not convert to string so that the dicts remain intact)
+# Create new columns by extracting text from the nested JSON objects
+df["tweet_text"] = df["tweet"].apply(lambda x: x.get("content", "") if isinstance(x, dict) else str(x))
+df["lore_text"] = df["lore"].apply(lambda x: x.get("response", "") if isinstance(x, dict) else str(x))
+# Optionally, drop the original columns if desired:
+# df = df.drop(columns=["tweet", "lore"])
 dataset = Dataset.from_pandas(df)
 print("Dataset columns:", dataset.column_names)
 
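Side note on why the flattening helps: `Dataset.from_pandas` converts plain string columns cleanly, while object columns holding dicts can be brittle. The new step can be exercised on its own; a minimal sketch with made-up rows (the real schema is only known from the `content`/`response` keys used above):

```python
import pandas as pd

# Made-up rows shaped like the JSONL records (hypothetical values):
# each row nests its text under a "content" / "response" key.
df = pd.DataFrame({
    "tweet": [{"content": "gm, degens"}, "already a plain string"],
    "lore":  [{"response": "the lore deepens"}, {"response": "chapter two"}],
})

# Same flattening as in the commit: unwrap the dicts, fall back to str()
# for anything that is not a dict.
df["tweet_text"] = df["tweet"].apply(lambda x: x.get("content", "") if isinstance(x, dict) else str(x))
df["lore_text"] = df["lore"].apply(lambda x: x.get("response", "") if isinstance(x, dict) else str(x))

print(df[["tweet_text", "lore_text"]])  # two flat string columns
```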
@@ -80,7 +84,7 @@ torch.cuda.empty_cache()
 model.gradient_checkpointing_enable()
 
 # --- Integrate PEFT (LoRA) ---
-# Based on your inspection, we now target "qkv_proj". Adjust if needed.
+# Based on your inspection, we target "qkv_proj" (update if needed)
 lora_config = LoraConfig(
     r=16,
     lora_alpha=32,
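The "inspection" the comment refers to is worth making concrete: whether the right `target_modules` entry is a fused `qkv_proj` (as in Phi-3-style checkpoints) or separate `q_proj`/`k_proj`/`v_proj` (Llama-style) depends on the base model. A sketch of how one might list the candidates; `model` is the base model loaded earlier in app.py, which this diff does not show:

```python
import torch.nn as nn

def linear_layer_names(model):
    """Collect the distinct leaf names of all nn.Linear submodules,
    i.e. the strings that can go into LoraConfig(target_modules=...)."""
    return sorted({name.rsplit(".", 1)[-1]
                   for name, module in model.named_modules()
                   if isinstance(module, nn.Linear)})

# print(linear_layer_names(model))
# e.g. ['down_proj', 'gate_up_proj', 'lm_head', 'o_proj', 'qkv_proj']
# on a Phi-3-style model
```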
@@ -94,14 +98,11 @@ model.print_trainable_parameters()
 # --- Preprocess the dataset ---
 def preprocess_function(examples):
     combined_texts = []
-    # For each example, extract the tweet content and lore response
-    tweets = examples.get("tweet", [])
-    lores = examples.get("lore", [])
-    for tweet_obj, lore_obj in zip(tweets, lores):
-        # Extract "content" from tweet, and "response" from lore
-        tweet_text = tweet_obj.get("content", "") if isinstance(tweet_obj, dict) else str(tweet_obj)
-        lore_text = lore_obj.get("response", "") if isinstance(lore_obj, dict) else str(lore_obj)
-        combined_text = "[PERSONALITY] " + tweet_text + "\n[KNOWLEDGE] " + lore_text
+    # Use the new flattened columns "tweet_text" and "lore_text"
+    tweets = examples.get("tweet_text", [])
+    lores = examples.get("lore_text", [])
+    for tweet, lore in zip(tweets, lores):
+        combined_text = "[PERSONALITY] " + tweet + "\n[KNOWLEDGE] " + lore
         combined_texts.append(combined_text)
     return tokenizer(combined_texts, truncation=True, padding=True)
 
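For completeness: this tokenized output feeds the `map` calls visible in the next hunk's context line, `tokenized_eval = eval_dataset.map(add_labels, batched=True)`. The diff never shows `add_labels` itself; a plausible sketch, assuming the usual causal-LM convention of labels mirroring `input_ids`:

```python
# Assumed implementation, not shown in this diff. For causal-LM fine-tuning
# the labels are typically a copy of the input ids; the trainer shifts them
# internally when computing the loss.
def add_labels(examples):
    # Batched map passes a list of token-id lists; copy each one.
    examples["labels"] = [ids.copy() for ids in examples["input_ids"]]
    return examples

# Wiring, matching the map calls referenced in the diff context:
# tokenized_train = train_dataset.map(preprocess_function, batched=True)
# tokenized_train = tokenized_train.map(add_labels, batched=True)
```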
@@ -122,7 +123,7 @@ tokenized_eval = eval_dataset.map(add_labels, batched=True)
 # --- Set training arguments ---
 training_args = TrainingArguments(
     output_dir=output_dir,
-    evaluation_strategy="epoch",  # (Deprecated: use eval_strategy in future versions)
+    evaluation_strategy="epoch",
     logging_dir="./logs",
     logging_steps=500,
     num_train_epochs=3,
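The dropped deprecation note is still worth acting on: newer transformers releases rename this argument to `eval_strategy`. One way to stay compatible with both names, sketched as an assumption about how one might handle it rather than anything in this commit:

```python
import inspect
from transformers import TrainingArguments

# Pick whichever keyword this transformers version accepts
# ("eval_strategy" replaced "evaluation_strategy" in newer releases).
params = inspect.signature(TrainingArguments.__init__).parameters
eval_kwarg = "eval_strategy" if "eval_strategy" in params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="./results",   # hypothetical; app.py uses its own output_dir
    logging_dir="./logs",
    logging_steps=500,
    num_train_epochs=3,
    **{eval_kwarg: "epoch"},
)
```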
 