baakaani committed
Commit 57826b8 · 1 Parent(s): d9b2ca0

new changes

Files changed (3)
  1. app.py +256 -1
  2. config_layout.json +49 -0
  3. requirements.txt +3 -0
app.py CHANGED
@@ -1,3 +1,258 @@
+ from copy import deepcopy
+ from importlib import reload
+ from itertools import product as cproduct
+ from itertools import combinations
+ from pylab import *
+ import itertools
+ import json
+ import math
+ import os
+ import pandas as pd
+ import pm4py
+ import random
  import streamlit as st
+ import subprocess
 
- st.title("Hello")
+ st.set_page_config(layout='wide')
+ INPUT_XES="output/inputlog_temp.xes"
+
+ """
+ # Configuration File fabric for
+ ## GEDI: **G**enerating **E**vent **D**ata with **I**ntentional Features for Benchmarking Process Mining
+ """
+ def double_switch(label_left, label_right, third_label=None, fourth_label=None):
+     if third_label==None and fourth_label==None:
+         # Create two columns for the labels and toggle switch
+         col0, col1, col2, col3, col4 = st.columns([2,1,1,1,2])
+     else:
+         # Create two columns for the labels and toggle switch
+         col0, col1, col2, col3, col4, col5, col6, col7, col8 = st.columns([1,1,1,1,1,1,1,1,1])
+
+     # Add labels to the columns
+     with col1:
+         st.write(label_left)
+
+     with col2:
+         # Create the toggle switch
+         toggle_option = st.toggle(" ",value=False,
+                                   key="toggle_switch_"+label_left,
+                                   )
+
+     with col3:
+         st.write(label_right)
+     if third_label is None and fourth_label is None: return toggle_option
+     else:
+         with col5:
+             st.write(third_label)
+
+         with col6:
+             # Create the toggle switch
+             toggle_option_2 = st.toggle(" ",value=False,
+                                         key="toggle_switch_"+third_label,
+                                         )
+
+         with col7:
+             st.write(fourth_label)
+         return toggle_option, toggle_option_2
+
+ def multi_button(labels):
+     cols = st.columns(len(labels))
+     activations = []
+     for col, label in zip(cols, labels):
+         activations.append(col.button(label))
+     return activations
+
+ def input_multicolumn(labels, default_values, n_cols=5):
+     result = {}
+     cols = st.columns(n_cols)
+     factor = math.ceil(len(labels)/n_cols)
+     extended = cols.copy()
+     for _ in range(factor):
+         extended.extend(cols)
+     for label, default_value, col in zip(labels, default_values, extended):
+         with col:
+             result[label] = col.text_input(label, default_value, key=f"input_"+label+'_'+str(default_value))
+     return result.values()
+
+ def split_list(input_list, n):
+     # Calculate the size of each chunk
+     k, m = divmod(len(input_list), n)
+     # Use list comprehension to create n sublists
+     return [input_list[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]
+
+ def get_ranges_from_stats(stats, tuple_values):
+     col_for_row = ", ".join([f"x[\'{i}\'].astype(float)" for i in tuple_values])
+     stats['range'] = stats.apply(lambda x: tuple([eval(col_for_row)]), axis=1)
+     #tasks = eval(f"list(itertools.product({(parameters*n_para_obj)[:-2]}))")
+     result = [f"np.around({x}, 2)" for x in stats['range']]
+     result = ", ".join(result)
+     return result
+
+ def create_objectives_grid(df, objectives, n_para_obj=2, method="combinatorial"):
+     if method=="combinatorial":
+         sel_features = df.index.to_list()
+         parameters_o = "objectives, "
+         parameters = get_ranges_from_stats(df, sorted(objectives))
+         objectives = sorted(sel_features)
+         tasks = f"list(cproduct({parameters}))[0]"
+
+     elif method=="range-from-csv":
+         tasks = ""
+         for objective in objectives:
+             min_col, max_col, step_col = st.columns(3)
+             with min_col:
+                 selcted_min = st.slider(objective+': min', min_value=float(df[objective].min()), max_value=float(df[objective].max()), value=df[objective].quantile(0.1), step=0.1, key=objective+"min")
+             with max_col:
+                 selcted_max = st.slider('max', min_value=selcted_min, max_value=float(df[objective].max()), value=df[objective].quantile(0.9), step=0.1, key=objective+"max")
+             with step_col:
+                 step_value = st.slider('step', min_value=float(df[objective].min()), max_value=float(df[objective].quantile(0.9)), value=df[objective].median()/(df[objective].min()+0.0001), step=0.01, key=objective+"step")
+             tasks += f"np.around(np.arange({selcted_min}, {selcted_max}+{step_value}, {step_value}),2), "
+     else: #method=="range-manual":
+         experitments = []
+         tasks=""
+         if objectives != None:
+             cross_labels = [feature[0]+': '+feature[1] for feature in list(cproduct(objectives,['min', 'max', 'step']))]
+             cross_values = [round(eval(str(combination[0])+combination[1]), 2) for combination in list(cproduct(list(df.values()), ['*1', '*2', '/3']))]
+             ranges = zip(objectives, split_list(list(input_multicolumn(cross_labels, cross_values, n_cols=3)), n_para_obj))
+             for objective, range_value in ranges:
+                 selcted_min, selcted_max, step_value = range_value
+                 tasks += f"np.around(np.arange({selcted_min}, {selcted_max}+{step_value}, {step_value}),2), "
+
+     #import pdb; pdb.set_trace()
+     cartesian_product = list(cproduct(*eval(tasks)))
+     experiments = [{key: value[idx] for idx, key in enumerate(objectives)} for value in cartesian_product]
+     return experiments
+
+ def set_generator_experiments(generator_params):
+     def handle_csv_file(grid_option):
+         uploaded_file = st.file_uploader("Pick a csv-file containing feature values for features:", type="csv")
+         if uploaded_file is not None:
+             df = pd.read_csv(uploaded_file)
+             sel_features = st.multiselect("Selected features", list(df.columns))
+             if sel_features:
+                 df = df[sel_features]
+                 return df, sel_features
+         return None, None
+
+     def handle_combinatorial(sel_features, stats, tuple_values):
+         triangular_option = double_switch("Square", "Triangular")
+         if triangular_option:
+             experiments = []
+             elements = sel_features
+             # List to store all combinations
+             all_combinations = [combinations(sel_features, r) for r in range(1, len(sel_features) + 1)]
+             all_combinations = [comb for sublist in all_combinations for comb in sublist]
+
+             # Print or use the result as needed
+             for comb in all_combinations:
+                 sel_stats = stats.loc[sorted(list(comb))]
+                 experiments += create_objectives_grid(sel_stats, tuple_values, n_para_obj=len(tuple_values), method="combinatorial")
+         else:
+             experiments = create_objectives_grid(stats, tuple_values, n_para_obj=len(tuple_values))
+         return experiments
+
+     def handle_grid_option(grid_option, df, sel_features):
+         if grid_option:
+             combinatorial = double_switch("Range", "Combinatorial")
+             if combinatorial:
+                 add_quantile = st.slider('Add %-quantile', min_value=0.0, max_value=100.0, value=50.0, step=5.0)
+                 stats = df.describe().transpose().sort_index()
+                 stats[f"{int(add_quantile)}%"] = df.quantile(q=add_quantile / 100)
+                 st.write(stats)
+                 tuple_values = st.multiselect("Tuples including", list(stats.columns)[3:], default=['min', 'max'])
+                 return handle_combinatorial(sel_features, stats, tuple_values)
+             else: # Range
+                 return create_objectives_grid(df, sel_features, n_para_obj=len(sel_features), method="range-from-csv")
+         else: # Point
+             st.write(df)
+             return df.to_dict(orient='records')
+
+     def handle_manual_option(sel_features, grid_option):
+         if sel_features:
+             if grid_option:
+                 return create_objectives_grid(generator_params['experiment'], sel_features, n_para_obj=len(sel_features), method="range-manual")
+             else:
+                 experiment = {sel_feature: float(st.text_input(sel_feature, generator_params['experiment'][sel_feature])) for sel_feature in sel_features}
+                 return [experiment]
+         return []
+
+     grid_option, csv_option = double_switch("Point-", "Grid-based", third_label="Manual", fourth_label="From CSV")
+
+     if csv_option:
+         df, sel_features = handle_csv_file(grid_option)
+         if df is not None and sel_features is not None:
+             experiments = handle_grid_option(grid_option, df, sel_features)
+         else:
+             experiments = []
+     else: # Manual
+         sel_features = st.multiselect("Selected features", list(generator_params['experiment'].keys()))
+         experiments = handle_manual_option(sel_features, grid_option)
+
+     generator_params['experiment'] = experiments
+     st.write(f"...result in {len(generator_params['experiment'])} experiment(s)")
+
+     """
+     #### Configuration space
+     """
+     updated_values = input_multicolumn(generator_params['config_space'].keys(), generator_params['config_space'].values())
+     for key, new_value in zip(generator_params['config_space'].keys(), updated_values):
+         generator_params['config_space'][key] = eval(new_value)
+     generator_params['n_trials'] = int(st.text_input('n_trials', generator_params['n_trials']))
+
+     return generator_params
+
+ if __name__ == '__main__':
+     config_layout = json.load(open("config_layout.json"))
+     type(config_layout)
+     step_candidates = ["instance_augmentation","event_logs_generation","feature_extraction","benchmark_test"]
+     pipeline_steps = st.multiselect(
+         "Choose pipeline step",
+         step_candidates,
+         []
+     )
+     step_configs = []
+     set_col, view_col = st.columns([3, 2])
+     for pipeline_step in pipeline_steps:
+         step_config = [d for d in config_layout if d['pipeline_step'] == pipeline_step][0]
+         with set_col:
+             st.header(pipeline_step)
+             for step_key in step_config.keys():
+                 if step_key == "generator_params":
+                     st.subheader("Set-up experiments")
+                     step_config[step_key] = set_generator_experiments(step_config[step_key])
+                 elif step_key == "feature_params":
+                     layout_features = list(step_config[step_key]['feature_set'])
+                     step_config[step_key]["feature_set"] = st.multiselect(
+                         "features to extract",
+                         layout_features)
+                 elif step_key != "pipeline_step":
+                     step_config[step_key] = st.text_input(step_key, step_config[step_key])
+         with view_col:
+             st.write(step_config)
+         step_configs.append(step_config)
+     config_file = json.dumps(step_configs, indent=4)
+     output_path = st.text_input("Output file path", "config_files/experiment_config.json")
+     os.makedirs(os.path.dirname(output_path), exist_ok=True)
+     save_labels = ["Save config file", "Save and run config_file"]
+     save_labels = ["Save configuration file"]
+     #create_button, create_run_button = multi_button(save_labels)
+     create_button = multi_button(save_labels)
+     # ToDo: Bug: automatically updates the experiment_config.json file even without pressing the save button
+     if create_button: # or create_run_button:
+         with open(output_path, "w") as f:
+             f.write(config_file)
+         st.write("Saved configuration in ", output_path, ". Run command:")
+         #if create_run_button:
+         if True:
+             var = f"python -W ignore main.py -a {output_path}"
+             st.code(var, language='bash')
+             if False: #FIXME: Command fails when using multiprocessing
+                 command = var.split()
+
+                 # Run the command
+                 result = subprocess.run(command, capture_output=True, text=True)
+
+                 if len(result.stderr)==0:
+                     st.write(result.stdout)
+                 else:
+                     st.write("ERROR: ", result.stderr)
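Note on the grid logic above: `create_objectives_grid` ultimately turns one `np.arange` range per selected feature into a cartesian product, with one experiment dict per grid point. A minimal, self-contained sketch of that expansion (the two feature names are taken from config_layout.json below; the ranges are made up for illustration and are not the app's defaults):

    from itertools import product
    import numpy as np

    # Hypothetical per-feature ranges, analogous to the np.arange(...) strings the app builds.
    ranges = {
        "ratio_top_20_variants": np.around(np.arange(0.2, 0.61, 0.2), 2),
        "epa_normalized_sequence_entropy_linear_forgetting": np.around(np.arange(0.1, 0.31, 0.1), 2),
    }
    # One dict per point of the cartesian product, mirroring the list comprehension in the app.
    experiments = [dict(zip(ranges, values)) for values in product(*ranges.values())]
    print(len(experiments))  # 3 x 3 grid -> 9 experiment configurations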
config_layout.json ADDED
@@ -0,0 +1,49 @@
+ [
+     {
+         "pipeline_step": "instance_augmentation",
+         "augmentation_params":{"method":"SMOTE", "no_samples":2,
+             "feature_selection": ["ratio_top_20_variants", "epa_normalized_sequence_entropy_linear_forgetting"]},
+         "input_path": "data/test/bpic_features.csv",
+         "output_path": "output"
+     },
+     {
+         "pipeline_step": "event_logs_generation",
+         "output_path": "output/features/2_bpic_features/2_ense_rmcv_feat.csv",
+         "output_path": "data/test",
+         "generator_params": {
+             "experiment": "data/grid_objectives.csv",
+             "experiment": {"input_path": "data/2_bpic_features.csv",
+                 "objectives": ["ratio_top_20_variants", "epa_normalized_sequence_entropy_linear_forgetting"]},
+ "experiment": {"n_traces":832, "n_unique_traces":828, "ratio_variants_per_number_of_traces":0.99, "trace_len_min":1, "trace_len_max":132, "trace_len_mean":53.31, "trace_len_median":54, "trace_len_mode":61, "trace_len_std":19.89, "trace_len_variance":395.81, "trace_len_q1":44, "trace_len_q3":62, "trace_len_iqr":18, "trace_len_geometric_mean":48.15, "trace_len_geometric_std":1.69, "trace_len_harmonic_mean":37.58, "trace_len_skewness":0.0541, "trace_len_kurtosis":0.81, "trace_len_coefficient_variation":0.37, "trace_len_entropy":6.65, "trace_len_hist1":0.004, "trace_len_hist2":0.005, "trace_len_hist3":0.005, "trace_len_hist4":0.024, "trace_len_hist5":0.024, "trace_len_hist6":0.008, "trace_len_hist7":0.005, "trace_len_hist8":0.001, "trace_len_hist9":0.0, "trace_len_hist10":0.00, "trace_len_skewness_hist":0.05, "trace_len_kurtosis_hist":0.8, "ratio_most_common_variant":0.0, "ratio_top_1_variants":0.01, "ratio_top_5_variants":0.05, "ratio_top_10_variants":0.10, "ratio_top_20_variants":0.2, "ratio_top_50_variants":0.5, "ratio_top_75_variants":0.75, "mean_variant_occurrence":1.0, "std_variant_occurrence":0.07, "skewness_variant_occurrence":14.28, "kurtosis_variant_occurrence":202.00, "n_unique_activities":410, "activities_min":1, "activities_max":830, "activities_mean":108.18, "activities_median":12, "activities_std":187.59, "activities_variance":35189, "activities_q1":3, "activities_q3":125, "activities_iqr":122, "activities_skewness":2.13, "activities_kurtosis":3.81, "n_unique_start_activities":14, "start_activities_min":1, "start_activities_max":731, "start_activities_mean":59.43, "start_activities_median":1, "start_activities_std":186.72, "start_activities_variance":34863, "start_activities_q1":1, "start_activities_q3":8, "start_activities_iqr":7, "start_activities_skewness":3, "start_activities_kurtosis":9.0, "n_unique_end_activities":82, "end_activities_min":1, "end_activities_max":216, "end_activities_mean":10, "end_activities_median":1, "end_activities_std":35, "end_activities_variance":1247, "end_activities_q1":1, "end_activities_q3":3, "end_activities_iqr":2, "end_activities_skewness":5, "end_activities_kurtosis":26, "eventropy_trace":10, "eventropy_prefix":15, "eventropy_global_block":19, "eventropy_lempel_ziv":4, "eventropy_k_block_diff_1":7.1, "eventropy_k_block_diff_3":7.1, "eventropy_k_block_diff_5":7.1, "eventropy_k_block_ratio_1":7.1, "eventropy_k_block_ratio_3":7.1, "eventropy_k_block_ratio_5":7.1, "eventropy_knn_3":5.54, "eventropy_knn_5":5.04, "eventropy_knn_7":4.72, "epa_variant_entropy":240512, "epa_normalized_variant_entropy":0.68, "epa_sequence_entropy":285876, "epa_normalized_sequence_entropy":0.60, "epa_sequence_entropy_linear_forgetting":150546, "epa_normalized_sequence_entropy_linear_forgetting":0.32, "epa_sequence_entropy_exponential_forgetting":185312, "epa_normalized_sequence_entropy_exponential_forgetting":0.39},
+             "config_space": {
+                 "mode": [5, 20],
+                 "sequence": [0.01, 1],
+                 "choice": [0.01, 1],
+                 "parallel": [0.01, 1],
+                 "loop": [0.01, 1],
+                 "silent": [0.01, 1],
+                 "lt_dependency": [0.01, 1],
+                 "num_traces": [10, 100],
+                 "duplicate": [0],
+                 "or": [0]
+             },
+             "n_trials": 2
+         }
+     },
+     {
+         "pipeline_step": "feature_extraction",
+         "input_path": "data/test",
+ "feature_params": {"feature_set": ["n_traces", "n_unique_traces", "ratio_unique_traces_per_trace", "trace_len_min", "trace_len_max", "trace_len_mean", "trace_len_median", "trace_len_mode", "trace_len_std", "trace_len_variance", "trace_len_q1", "trace_len_q3", "trace_len_iqr", "trace_len_geometric_mean", "trace_len_geometric_std", "trace_len_harmonic_mean", "trace_len_skewness", "trace_len_kurtosis", "trace_len_coefficient_variation", "trace_len_entropy", "trace_len_hist1", "trace_len_hist2", "trace_len_hist3", "trace_len_hist4", "trace_len_hist5", "trace_len_hist6", "trace_len_hist7", "trace_len_hist8", "trace_len_hist9", "trace_len_hist10", "trace_len_skewness_hist", "trace_len_kurtosis_hist", "ratio_most_common_variant", "ratio_top_1_variants", "ratio_top_5_variants", "ratio_top_10_variants", "ratio_top_20_variants", "ratio_top_50_variants", "ratio_top_75_variants", "mean_variant_occurrence", "std_variant_occurrence", "skewness_variant_occurrence", "kurtosis_variant_occurrence", "n_unique_activities", "activities_min", "activities_max", "activities_mean", "activities_median", "activities_std", "activities_variance", "activities_q1", "activities_q3", "activities_iqr", "activities_skewness", "activities_kurtosis", "n_unique_start_activities", "start_activities_min", "start_activities_max", "start_activities_mean", "start_activities_median", "start_activities_std", "start_activities_variance", "start_activities_q1", "start_activities_q3", "start_activities_iqr", "start_activities_skewness", "start_activities_kurtosis", "n_unique_end_activities", "end_activities_min", "end_activities_max", "end_activities_mean", "end_activities_median", "end_activities_std", "end_activities_variance", "end_activities_q1", "end_activities_q3", "end_activities_iqr", "end_activities_skewness", "end_activities_kurtosis", "eventropy_trace", "eventropy_prefix", "eventropy_prefix_flattened", "eventropy_global_block", "eventropy_global_block_flattened", "eventropy_lempel_ziv", "eventropy_lempel_ziv_flattened", "eventropy_k_block_diff_1", "eventropy_k_block_diff_3", "eventropy_k_block_diff_5", "eventropy_k_block_ratio_1", "eventropy_k_block_ratio_3", "eventropy_k_block_ratio_5", "eventropy_knn_3", "eventropy_knn_5", "eventropy_knn_7", "epa_variant_entropy", "epa_normalized_variant_entropy", "epa_sequence_entropy", "epa_normalized_sequence_entropy", "epa_sequence_entropy_linear_forgetting", "epa_normalized_sequence_entropy_linear_forgetting", "epa_sequence_entropy_exponential_forgetting", "epa_normalized_sequence_entropy_exponential_forgetting"]},
+         "output_path": "output/plots",
+         "real_eventlog_path": "data/BaselineED_feat.csv",
+         "plot_type": "boxplot"
+     },
+     {
+         "pipeline_step": "benchmark_test",
+         "benchmark_task": "discovery",
+         "input_path":"data/test",
+         "output_path":"output",
+         "miners" : ["inductive", "heu", "imf", "ilp"]
+     }
+ ]
+
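config_layout.json is loaded once in app.py's `__main__` block and filtered by the chosen pipeline step. A minimal sketch of that lookup (run from the repository root); note that Python's `json.load` keeps only the last value of the duplicated `output_path` and `experiment` keys in this file:

    import json

    # Same lookup pattern as app.py: pick the entry for one pipeline step.
    with open("config_layout.json") as f:
        config_layout = json.load(f)
    step_config = [d for d in config_layout if d["pipeline_step"] == "event_logs_generation"][0]
    print(step_config["output_path"])                      # data/test (last duplicate key wins)
    print(sorted(step_config["generator_params"].keys()))  # ['config_space', 'experiment', 'n_trials']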
requirements.txt CHANGED
@@ -1 +1,4 @@
+ pandas
+ pm4py
  streamlit
+ matplotlib # Required by 'pylab'
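The matplotlib entry exists only because app.py does `from pylab import *`; matplotlib's pylab module pulls numpy's names into scope, which is where the `np.arange` / `np.around` calls in the grid code come from. A quick, illustrative check after installing the requirements:

    # pylab (shipped with matplotlib) re-exports numpy names, including np itself.
    from pylab import *  # noqa: F401,F403 -- mirrors the import style in app.py

    print(np.around(np.arange(0.1, 0.4, 0.1), 2))  # [0.1 0.2 0.3]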