# Block 4: Random Forest Model with Progress Display
# --------------------------------------------------
# Use a classic machine learning approach (TF-IDF + Random Forest) for rating
# prediction, with progress tracking during training.

# Block 2: Load and Prepare Data
# ------------------------------
# This block loads the data from your Excel file, fixes the header,
# and prepares it for the model.
import pandas as pd

print("--- Loading and Preparing Data ---")

# Define the correct column names we want to use.
correct_column_names = ['Id', 'Review', 'Rating']

# 1. Load the Excel file, skipping the bad header row.
#    We explicitly tell pandas there is no header to read.
df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)

# 2. Manually assign our correct column names. This is the key step
#    that prevents the KeyError.
df.columns = correct_column_names

# 3. Clean the data:
#    - Convert 'Rating' to a number. Any value that can't be converted becomes NaN.
#    - Drop any rows where 'Rating' or 'Review' is missing.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df.dropna(subset=['Rating', 'Review'], inplace=True)

# 4. Normalize the 'Rating' from a 1-5 scale to a 0-1 scale.
#    (If your ratings actually run 1-10, divide by 9.0 instead.)
#    This helps the model train more effectively.
df['normalized_rating'] = (df['Rating'] - 1) / 4.0

# 5. Create our final, clean DataFrame for the model.
df_regression = df[['Review', 'normalized_rating']].copy()

print("✅ Data loaded and prepared successfully!")
print("\nHere's a sample of the prepared data:")
print(df_regression.head())


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import numpy as np

print("--- Pivoting to Random Forest ---")

# 'df_regression' is the DataFrame prepared above, with 'Review' and
# 'normalized_rating' columns.

# --- 4.1. Prepare Data and Split ---
X = df_regression['Review']
y = df_regression['normalized_rating']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

# --- 4.2. Vectorize Text Data using TF-IDF ---
print("Vectorizing text with TF-IDF...")
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english'
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
print("Vectorization complete.")
print(f"Shape of TF-IDF matrix: {X_train_tfidf.shape}")

# --- 4.3. Train the Random Forest Model ---
print("⚙️ Training Random Forest Regressor...")
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=50,
    random_state=42,
    n_jobs=-1,
    verbose=1  # Prints progress updates while the trees are being built.
)
rf_model.fit(X_train_tfidf, y_train)
print("✅ Model training finished!")

# --- 4.4. Evaluate the Model ---
print("Evaluating model performance...")
predictions = rf_model.predict(X_val_tfidf)
mse = mean_squared_error(y_val, predictions)
r2 = r2_score(y_val, predictions)

print("\n--- Evaluation Results ---")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print("--------------------------")

# --- 4.5. Save the Model and Vectorizer ---
joblib.dump(rf_model, 'random_forest_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
print("\nModel and TF-IDF vectorizer saved successfully.")
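
# --- 4.6. (Optional) Sanity-Check Inference with the Saved Artifacts ---
# A minimal usage sketch: it assumes the 'random_forest_model.joblib' and
# 'tfidf_vectorizer.joblib' files written in step 4.5, and the
# (rating - 1) / 4.0 normalization from step 4 (i.e. a 1-5 scale).
# The sample review text is a made-up placeholder.
import joblib

loaded_model = joblib.load('random_forest_model.joblib')
loaded_vectorizer = joblib.load('tfidf_vectorizer.joblib')

sample_reviews = ["The room was spotless and the staff were wonderful."]

# Transform the raw text with the already-fitted vectorizer, then predict.
sample_tfidf = loaded_vectorizer.transform(sample_reviews)
normalized_pred = loaded_model.predict(sample_tfidf)[0]

# Map the 0-1 prediction back to the original rating scale.
predicted_rating = normalized_pred * 4.0 + 1
print(f"Predicted rating: {predicted_rating:.2f} (normalized: {normalized_pred:.3f})")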
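
# --- 4.7. (Optional) Peek at Influential TF-IDF Features ---
# A quick sketch for inspecting which terms the forest relies on most; it uses
# only the rf_model and vectorizer objects trained above. Importances over
# high-dimensional TF-IDF features are rough indicators, not explanations.
import numpy as np

# get_feature_names_out() requires scikit-learn >= 1.0; older versions use
# get_feature_names().
feature_names = vectorizer.get_feature_names_out()
importances = rf_model.feature_importances_

top_idx = np.argsort(importances)[::-1][:15]
print("Top 15 features by importance:")
for i in top_idx:
    print(f"  {feature_names[i]:<25} {importances[i]:.4f}")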