Canstralian committed on
Commit
33dc395
·
verified ·
1 Parent(s): d58fb21

Synced repo using 'sync_with_huggingface' Github Action

Browse files
Files changed (1) hide show
  1. data/data_processing.py +47 -0
data/data_processing.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.preprocessing import StandardScaler, PolynomialFeatures
3
+
4
def load_data(file_path):
    """Read the CSV given by ``file_path`` with ``pandas.read_csv`` and return the result."""
    dataset = pd.read_csv(file_path)
    return dataset
7
+
8
def scale_features(df):
    """Scale numerical features using StandardScaler.

    Args:
        df: Input DataFrame. It is left untouched; all work happens on a copy.

    Returns:
        A new DataFrame in which every ``float64``/``int64`` column has been
        replaced by the output of ``StandardScaler.fit_transform``. Other
        columns are carried over unchanged. When the frame has no such
        columns, or has no rows, the copy is returned as is.
    """
    # Work on a copy so the caller's DataFrame is not silently modified.
    scaled = df.copy()
    numerical_cols = scaled.select_dtypes(include=['float64', 'int64']).columns
    # StandardScaler rejects input with zero features or zero samples, so
    # there is nothing to scale in either case.
    if len(numerical_cols) == 0 or scaled.empty:
        return scaled
    scaler = StandardScaler()
    scaled[numerical_cols] = scaler.fit_transform(scaled[numerical_cols])
    return scaled
14
+
15
def create_polynomial_features(df, degree=2, selected_columns=None):
    """Create polynomial features.

    Args:
        df: Input DataFrame. It is not modified.
        degree: Degree of polynomial features (default: 2)
        selected_columns: List of column names to use for polynomial features.
            Names that are not columns of ``df`` are ignored. If None, uses
            all ``float64``/``int64`` columns (default: None)

    Returns:
        A new DataFrame holding the columns of ``df`` followed by the
        generated polynomial columns whose names are not already present in
        ``df``. When there is nothing to expand (no numerical columns, or no
        rows) a copy of ``df`` is returned.

    Raises:
        ValueError: If ``selected_columns`` is given but none of its names
            are columns of ``df``.
    """
    if selected_columns is not None:
        # dict.fromkeys drops repeated names while keeping their order, so a
        # duplicated selection does not yield duplicated feature columns.
        numerical_cols = [
            col for col in dict.fromkeys(selected_columns) if col in df.columns
        ]
        if not numerical_cols:
            raise ValueError("None of the selected columns found in DataFrame")
    else:
        numerical_cols = list(
            df.select_dtypes(include=['float64', 'int64']).columns
        )

    # PolynomialFeatures cannot be fitted on zero features or zero samples.
    if not numerical_cols or df.empty:
        return df.copy()

    poly = PolynomialFeatures(degree=degree, include_bias=False)
    poly_features = poly.fit_transform(df[numerical_cols])
    poly_feature_names = poly.get_feature_names_out(numerical_cols)
    # Reuse df's index so the generated rows line up with the source rows
    # even when df does not carry a default RangeIndex.
    poly_df = pd.DataFrame(
        poly_features, columns=poly_feature_names, index=df.index
    )
    # The degree-1 terms are named exactly like the input columns; adding
    # them again would collide with the columns already in df, so only the
    # genuinely new names are appended.
    existing = set(df.columns)
    new_cols = [name for name in poly_df.columns if name not in existing]
    return pd.concat([df, poly_df[new_cols]], axis=1)
36
+
37
def process_data(file_path):
    """Run the whole pipeline for one CSV file and return the resulting frame.

    The file is read with ``load_data``, passed through ``scale_features``,
    and finally through ``create_polynomial_features`` with its defaults.
    """
    raw = load_data(file_path)
    scaled = scale_features(raw)
    return create_polynomial_features(scaled)
43
+
44
if __name__ == "__main__":
    import sys

    # Optional command-line overrides:
    #     python data_processing.py [INPUT_CSV [OUTPUT_CSV]]
    # Without arguments the original placeholder paths below are used.
    cli_args = sys.argv[1:]
    file_path = cli_args[0] if cli_args else 'path_to_your_data.csv'  # Replace with your actual file path
    output_path = (
        cli_args[1] if len(cli_args) > 1 else 'processed_data_with_features.csv'
    )
    processed_data = process_data(file_path)
    # index=False keeps the DataFrame index out of the written CSV.
    processed_data.to_csv(output_path, index=False)