Spaces:

Agrannya
/

Product_review_model

Sleeping

App Files Files Community

Product_review_model / model_save.py

Agrannya

adding the files for the model

9fb4d52 verified about 1 month ago

raw

history blame contribute delete

3.64 kB

	# Block 4 (code further below): Random Forest Model with Progress Display
	# ------------------------------------------------------------------------
	# Uses a classic machine-learning approach for rating prediction, with progress tracking.
	# Block 2: Load and Prepare Data
	# ------------------------------
	# This block loads the data from your Excel file, fixes the header,
	# and prepares it for the model.

	import pandas as pd

	print("--- Loading and Preparing Data ---")

	# Column names assigned to the sheet by position; the file's own header row
	# is discarded because it is unusable.
	correct_column_names = ['Id', 'Review', 'Rating']

	# Bounds of the rating scale assumed by the normalization step below.
	# NOTE(review): an earlier comment described a 1-10 scale, but the code has
	# always divided by 4, which maps 1-5 (not 1-10) onto 0-1. The arithmetic is
	# kept unchanged; confirm the dataset's real range and adjust RATING_MAX if
	# the ratings actually go up to 10.
	RATING_MIN = 1
	RATING_MAX = 5

	# 1. Load the Excel file. `skiprows=1` drops the bad header row and
	#    `header=None` stops pandas from promoting the next row to a header.
	df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)

	# 2. Name the columns ourselves so later lookups such as df['Rating'] cannot
	#    fail with a KeyError. This raises ValueError if the sheet does not have
	#    exactly three columns.
	df.columns = correct_column_names

	# 3. Clean the data:
	#    - Coerce 'Rating' to a number; anything unparseable becomes NaN.
	#    - Drop every row where 'Rating' or 'Review' is missing.
	df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
	df.dropna(subset=['Rating', 'Review'], inplace=True)

	# 4. Rescale 'Rating' so that RATING_MIN maps to 0 and RATING_MAX maps to 1.
	df['normalized_rating'] = (df['Rating'] - RATING_MIN) / float(RATING_MAX - RATING_MIN)

	# 5. Build the final, independent DataFrame handed to the model.
	df_regression = df[['Review', 'normalized_rating']].copy()

	# Excel cells holding only digits or dates are read back as non-string
	# objects, and the TF-IDF step later in this script requires text. Casting
	# after the dropna above means missing values never turn into the string 'nan'.
	df_regression['Review'] = df_regression['Review'].astype(str)

	print("✅ Data loaded and prepared successfully!")
	print("\nHere's a sample of the prepared data:")
	print(df_regression.head())
	import pandas as pd
	from sklearn.model_selection import train_test_split
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import mean_squared_error, r2_score
	import joblib
	import numpy as np

	print("--- Pivoting to Random Forest ---")

	# --- 4.1. Prepare Data and Split ---
	# Features and target both come from 'df_regression', which the
	# data-preparation step earlier in this script builds with the columns
	# 'Review' and 'normalized_rating'.
	X, y = df_regression['Review'], df_regression['normalized_rating']

	# Hold out 20% of the rows for validation; the fixed seed keeps the split
	# reproducible from run to run.
	X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

	print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

	# --- 4.2. Vectorize Text Data using TF-IDF ---
	print("Vectorizing text with TF-IDF...")

	# Unigrams and bigrams, English stop words removed, vocabulary capped at
	# 5000 features.
	vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)

	# The vocabulary and IDF weights are learned from the training texts only;
	# the validation texts are merely projected onto that fitted vocabulary.
	X_train_tfidf = vectorizer.fit_transform(X_train)
	X_val_tfidf = vectorizer.transform(X_val)

	print("Vectorization complete.")
	print(f"Shape of TF-IDF matrix: {X_train_tfidf.shape}")


	# --- 4.3. Train the Random Forest Model ---
	print("⚙️ Training Random Forest Regressor...")

	# 200 trees of depth at most 50, built on all available cores (n_jobs=-1).
	# verbose=1 makes scikit-learn print progress while the forest is fitted.
	# fit() returns the estimator itself, so construction and training are chained.
	rf_model = RandomForestRegressor(
		verbose=1,
		n_jobs=-1,
		random_state=42,
		max_depth=50,
		n_estimators=200,
	).fit(X_train_tfidf, y_train)

	print("✅ Model training finished!")


	# --- 4.4. Evaluate the Model ---
	print("Evaluating model performance...")

	# Score the held-out validation split. The target is the normalized rating,
	# so the MSE is expressed in normalized units rather than raw rating points.
	predictions = rf_model.predict(X_val_tfidf)
	mse, r2 = mean_squared_error(y_val, predictions), r2_score(y_val, predictions)

	print(f"\n--- Evaluation Results ---")
	print(f"Mean Squared Error (MSE): {mse:.4f}")
	print(f"R-squared (R²): {r2:.4f}")
	print("--------------------------")


	# --- 4.5. Save the Model and Vectorizer ---
	# The fitted vectorizer is persisted next to the model: new text has to be
	# transformed with the same vocabulary before the forest can score it.
	for artifact, filename in (
		(rf_model, 'random_forest_model.joblib'),
		(vectorizer, 'tfidf_vectorizer.joblib'),
	):
		joblib.dump(artifact, filename)

	print("\nModel and TF-IDF vectorizer saved successfully.")