import random

import numpy as np
import pandas as pd


class Data:
    """Container for one split of a temporal interaction graph. The five
    arguments are parallel arrays: the k-th entries together describe one
    timestamped interaction (edge)."""

    def __init__(self, sources, destinations, timestamps, edge_idxs, labels):
        self.sources = sources
        self.destinations = destinations
        self.timestamps = timestamps
        self.edge_idxs = edge_idxs
        self.labels = labels
        self.n_interactions = len(sources)
        self.unique_nodes = set(sources) | set(destinations)
        self.n_unique_nodes = len(self.unique_nodes)
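
# Minimal sketch of constructing a Data split by hand (toy arrays invented
# purely for illustration; real splits come from the loaders below):
#
#   toy = Data(sources=np.array([1, 2, 1]),
#              destinations=np.array([3, 3, 4]),
#              timestamps=np.array([10.0, 20.0, 30.0]),
#              edge_idxs=np.array([0, 1, 2]),
#              labels=np.array([0, 0, 1]))
#   toy.n_interactions  # -> 3
#   toy.n_unique_nodes  # -> 4 (nodes 1, 2, 3, 4)
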
def get_data_node_classification(dataset_name, use_validation=False):
    """Load a preprocessed dataset and split it chronologically for node
    classification, using the 70% and 85% time quantiles as split boundaries."""
    ### Load data and train / val / test split
    graph_df = pd.read_csv('./data/ml_{}.csv'.format(dataset_name))
    edge_features = np.load('./data/ml_{}.npy'.format(dataset_name))
    node_features = np.load('./data/ml_{}_node.npy'.format(dataset_name))

    val_time, test_time = list(np.quantile(graph_df.ts, [0.70, 0.85]))

    sources = graph_df.u.values
    destinations = graph_df.i.values
    edge_idxs = graph_df.idx.values
    labels = graph_df.label.values
    timestamps = graph_df.ts.values

    random.seed(2020)

    train_mask = timestamps <= val_time if use_validation else timestamps <= test_time
    test_mask = timestamps > test_time
    # Without a validation split, the validation set simply aliases the test set
    val_mask = np.logical_and(timestamps <= test_time,
                              timestamps > val_time) if use_validation else test_mask

    full_data = Data(sources, destinations, timestamps, edge_idxs, labels)

    train_data = Data(sources[train_mask], destinations[train_mask], timestamps[train_mask],
                      edge_idxs[train_mask], labels[train_mask])

    val_data = Data(sources[val_mask], destinations[val_mask], timestamps[val_mask],
                    edge_idxs[val_mask], labels[val_mask])

    test_data = Data(sources[test_mask], destinations[test_mask], timestamps[test_mask],
                     edge_idxs[test_mask], labels[test_mask])

    return full_data, node_features, edge_features, train_data, val_data, test_data
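
# Example call (a sketch; 'wikipedia' is a placeholder dataset name assumed to
# have been preprocessed into ./data/ml_wikipedia.csv and the two .npy files):
#
#   full, node_feats, edge_feats, train, val, test = \
#       get_data_node_classification('wikipedia', use_validation=True)
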
def get_data(dataset_name, different_new_nodes_between_val_and_test=False, randomize_features=False):
    ### Load data and train / val / test split
    graph_df = pd.read_csv('./data/ml_{}.csv'.format(dataset_name))
    edge_features = np.load('./data/ml_{}.npy'.format(dataset_name))
    node_features = np.load('./data/ml_{}_node.npy'.format(dataset_name))

    ############################ ASHUTOSH #####################################
    # Map the raw column names onto the ones the rest of this module expects
    # ('ts' is already correct and is listed only for completeness).
    graph_df = graph_df.rename(columns={
        'user_id': 'u',
        'item_id': 'i',
        'ts': 'ts',
        'state_label': 'label',
        # The features column, if present, can be dropped or handled separately
    })
    # Add a sequential edge index column
    graph_df['idx'] = range(len(graph_df))
    ############################################################################
    if randomize_features:
        node_features = np.random.rand(node_features.shape[0], node_features.shape[1])

    val_time, test_time = list(np.quantile(graph_df.ts, [0.70, 0.85]))

    sources = graph_df.u.values
    destinations = graph_df.i.values
    edge_idxs = graph_df.idx.values
    labels = graph_df.label.values
    timestamps = graph_df.ts.values

    full_data = Data(sources, destinations, timestamps, edge_idxs, labels)

    random.seed(2020)

    node_set = set(sources) | set(destinations)
    n_total_unique_nodes = len(node_set)

    # Compute nodes which appear after the training period (validation or test time)
    test_node_set = set(sources[timestamps > val_time]).union(
        set(destinations[timestamps > val_time]))
    # Sample nodes which we keep as new nodes (to test inductiveness), so we then
    # have to remove all their edges from training. random.sample needs a sequence
    # (sampling directly from a set was removed in Python 3.11), hence sorted().
    new_test_node_set = set(random.sample(sorted(test_node_set), int(0.1 * n_total_unique_nodes)))

    # Masks saying for each source and destination whether they are new test nodes
    new_test_source_mask = graph_df.u.map(lambda x: x in new_test_node_set).values
    new_test_destination_mask = graph_df.i.map(lambda x: x in new_test_node_set).values

    # Mask which is true for edges whose source and destination are both *not*
    # new test nodes (we want to remove all edges involving any new test node)
    observed_edges_mask = np.logical_and(~new_test_source_mask, ~new_test_destination_mask)

    # For training we keep edges happening before the validation time which do
    # not involve any node reserved for the inductive evaluation
    train_mask = np.logical_and(timestamps <= val_time, observed_edges_mask)

    train_data = Data(sources[train_mask], destinations[train_mask], timestamps[train_mask],
                      edge_idxs[train_mask], labels[train_mask])

    # Define the new-node sets for testing inductiveness of the model
    train_node_set = set(train_data.sources).union(train_data.destinations)
    assert len(train_node_set & new_test_node_set) == 0
    new_node_set = node_set - train_node_set

    val_mask = np.logical_and(timestamps <= test_time, timestamps > val_time)
    test_mask = timestamps > test_time
    if different_new_nodes_between_val_and_test:
        # Split the held-out nodes in half: one half may only appear as new
        # nodes in validation, the other half only in test
        n_new_nodes = len(new_test_node_set) // 2
        val_new_node_set = set(list(new_test_node_set)[:n_new_nodes])
        test_new_node_set = set(list(new_test_node_set)[n_new_nodes:])

        edge_contains_new_val_node_mask = np.array(
            [(a in val_new_node_set or b in val_new_node_set) for a, b in zip(sources, destinations)])
        edge_contains_new_test_node_mask = np.array(
            [(a in test_new_node_set or b in test_new_node_set) for a, b in zip(sources, destinations)])
        new_node_val_mask = np.logical_and(val_mask, edge_contains_new_val_node_mask)
        new_node_test_mask = np.logical_and(test_mask, edge_contains_new_test_node_mask)
    else:
        edge_contains_new_node_mask = np.array(
            [(a in new_node_set or b in new_node_set) for a, b in zip(sources, destinations)])
        new_node_val_mask = np.logical_and(val_mask, edge_contains_new_node_mask)
        new_node_test_mask = np.logical_and(test_mask, edge_contains_new_node_mask)

    # Validation and test sets with all edges
    val_data = Data(sources[val_mask], destinations[val_mask], timestamps[val_mask],
                    edge_idxs[val_mask], labels[val_mask])

    test_data = Data(sources[test_mask], destinations[test_mask], timestamps[test_mask],
                     edge_idxs[test_mask], labels[test_mask])

    # Validation and test sets restricted to edges with at least one new node
    # (i.e. a node that never appears in the training set)
    new_node_val_data = Data(sources[new_node_val_mask], destinations[new_node_val_mask],
                             timestamps[new_node_val_mask],
                             edge_idxs[new_node_val_mask], labels[new_node_val_mask])

    new_node_test_data = Data(sources[new_node_test_mask], destinations[new_node_test_mask],
                              timestamps[new_node_test_mask], edge_idxs[new_node_test_mask],
                              labels[new_node_test_mask])

    print("The dataset has {} interactions, involving {} different nodes".format(
        full_data.n_interactions, full_data.n_unique_nodes))
    print("The training dataset has {} interactions, involving {} different nodes".format(
        train_data.n_interactions, train_data.n_unique_nodes))
    print("The validation dataset has {} interactions, involving {} different nodes".format(
        val_data.n_interactions, val_data.n_unique_nodes))
    print("The test dataset has {} interactions, involving {} different nodes".format(
        test_data.n_interactions, test_data.n_unique_nodes))
    print("The new node validation dataset has {} interactions, involving {} different nodes".format(
        new_node_val_data.n_interactions, new_node_val_data.n_unique_nodes))
    print("The new node test dataset has {} interactions, involving {} different nodes".format(
        new_node_test_data.n_interactions, new_node_test_data.n_unique_nodes))
    print("{} nodes were used for the inductive testing, i.e. they are never seen during training".format(
        len(new_test_node_set)))

    return node_features, edge_features, full_data, train_data, val_data, test_data, \
           new_node_val_data, new_node_test_data
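
# Example call (a sketch; 'wikipedia' is again a placeholder dataset name,
# assumed to exist as ./data/ml_wikipedia.{csv,npy,_node.npy}):
#
#   (node_feats, edge_feats, full, train, val, test,
#    nn_val, nn_test) = get_data('wikipedia', randomize_features=False)
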
def compute_time_statistics(sources, destinations, timestamps):
    """Compute the mean and standard deviation of inter-event time gaps, per
    source node and per destination node, over the interaction stream."""
    last_timestamp_sources = dict()
    last_timestamp_dst = dict()
    all_timediffs_src = []
    all_timediffs_dst = []
    for k in range(len(sources)):
        source_id = sources[k]
        dest_id = destinations[k]
        c_timestamp = timestamps[k]
        # A node's first interaction is measured against time 0
        if source_id not in last_timestamp_sources:
            last_timestamp_sources[source_id] = 0
        if dest_id not in last_timestamp_dst:
            last_timestamp_dst[dest_id] = 0
        all_timediffs_src.append(c_timestamp - last_timestamp_sources[source_id])
        all_timediffs_dst.append(c_timestamp - last_timestamp_dst[dest_id])
        last_timestamp_sources[source_id] = c_timestamp
        last_timestamp_dst[dest_id] = c_timestamp
    assert len(all_timediffs_src) == len(sources)
    assert len(all_timediffs_dst) == len(sources)
    mean_time_shift_src = np.mean(all_timediffs_src)
    std_time_shift_src = np.std(all_timediffs_src)
    mean_time_shift_dst = np.mean(all_timediffs_dst)
    std_time_shift_dst = np.std(all_timediffs_dst)
    return mean_time_shift_src, std_time_shift_src, mean_time_shift_dst, std_time_shift_dst
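

if __name__ == '__main__':
    # Small self-contained demo of compute_time_statistics on synthetic data
    # (toy arrays invented for illustration; real inputs come from get_data).
    demo_sources = np.array([1, 2, 1, 2])
    demo_destinations = np.array([3, 3, 4, 4])
    demo_timestamps = np.array([10.0, 20.0, 30.0, 40.0])
    stats = compute_time_statistics(demo_sources, demo_destinations, demo_timestamps)
    print("mean/std source gap: {:.2f} / {:.2f}, mean/std destination gap: {:.2f} / {:.2f}".format(*stats))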