File size: 3,637 Bytes
9fb4d52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Script overview
# ---------------
# This script first loads and prepares the review data (Block 2), then uses a
# classic machine learning approach -- a Random Forest model with progress
# display -- for rating prediction (Block 4).
# Block 2: Load and Prepare Data
# ------------------------------
# Loads the review data from the Excel file, replaces its header with known
# column names, cleans the rows, and builds the DataFrame used by the model.

import pandas as pd

print("--- Loading and Preparing Data ---")

# Column names assigned to the sheet, in order.
correct_column_names = ['Id', 'Review', 'Rating']

# Bounds used to rescale 'Rating' to the 0-1 range.
# NOTE(review): these assume ratings run from 1 to 5 -- confirm against the
# data. Any rating above RATING_MAX normalizes to a value greater than 1.
RATING_MIN = 1
RATING_MAX = 5

# 1. Load the Excel file, skipping the bad header row.
#    header=None tells pandas not to treat any remaining row as a header.
df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)

# 2. Manually assign our column names so later lookups by name cannot raise
#    a 'KeyError'. Check the column count first so an unexpected sheet layout
#    fails with a message that names the problem.
if df.shape[1] != len(correct_column_names):
    raise ValueError(
        f"Expected {len(correct_column_names)} columns "
        f"{correct_column_names} in 'train_best.xlsx', found {df.shape[1]}."
    )
df.columns = correct_column_names

# 3. Clean the data:
#    - Convert 'Rating' to a number. If a value can't be converted, it becomes 'NaN'.
#    - Drop any rows where 'Rating' or 'Review' is missing.
#    - Force the remaining 'Review' cells to text: a cell holding only digits
#      is read from Excel as a number, which a text vectorizer cannot process.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df.dropna(subset=['Rating', 'Review'], inplace=True)
df['Review'] = df['Review'].astype(str)

# 4. Normalize 'Rating' so that RATING_MIN maps to 0 and RATING_MAX maps to 1.
df['normalized_rating'] = (df['Rating'] - RATING_MIN) / (RATING_MAX - RATING_MIN)

# 5. Create our final, clean DataFrame for the model (an independent copy).
df_regression = df[['Review', 'normalized_rating']].copy()

print("✅ Data loaded and prepared successfully!")
print("\nHere's a sample of the prepared data:")
print(df_regression.head())
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import numpy as np

# Block 4: Random Forest Model with Progress Display
# --------------------------------------------------
# Expects 'df_regression' to already exist with 'Review' and
# 'normalized_rating' columns.
print("--- Pivoting to Random Forest ---")

# --- 4.1. Prepare Data and Split ---
# Features are the raw review texts; the target is the normalized rating.
X, y = df_regression['Review'], df_regression['normalized_rating']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

# --- 4.2. Vectorize Text Data using TF-IDF ---
print("Vectorizing text with TF-IDF...")
tfidf_options = {
    'max_features': 5000,
    'ngram_range': (1, 2),
    'stop_words': 'english',
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training texts only, then reuse that fitted vectorizer for the
# validation texts.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
print("Vectorization complete.")
print(f"Shape of TF-IDF matrix: {X_train_tfidf.shape}")


# --- 4.3. Train the Random Forest Model ---
print("⚙️ Training Random Forest Regressor...")
forest_options = {
    'n_estimators': 200,
    'max_depth': 50,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 1,  # print progress updates during training
}
rf_model = RandomForestRegressor(**forest_options)

rf_model.fit(X_train_tfidf, y_train)
print("✅ Model training finished!")


# --- 4.4. Evaluate the Model ---
print("Evaluating model performance...")
predictions = rf_model.predict(X_val_tfidf)

mse, r2 = mean_squared_error(y_val, predictions), r2_score(y_val, predictions)

# Emit the report one line per print call.
for report_line in (
    f"\n--- Evaluation Results ---",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"R-squared (R²): {r2:.4f}",
    "--------------------------",
):
    print(report_line)


# --- 4.5. Save the Model and Vectorizer ---
# Both objects are persisted: the model and the vectorizer fitted above.
for artifact, artifact_path in (
    (rf_model, 'random_forest_model.joblib'),
    (vectorizer, 'tfidf_vectorizer.joblib'),
):
    joblib.dump(artifact, artifact_path)

print("\nModel and TF-IDF vectorizer saved successfully.")