Spaces:
Build error
Build error
| # Copyright 2021 The HuggingFace Team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import logging | |
| import warnings | |
| from pathlib import Path | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.preprocessing import MultiLabelBinarizer | |
| # Might be nice to print to log instead? Happens when we drop closed class. | |
| warnings.filterwarnings(action="ignore", category=UserWarning) | |
| # When we divide by 0 in log | |
| np.seterr(divide="ignore") | |
| # treating inf values as NaN as well | |
| pd.set_option("use_inf_as_na", True) | |
| logs = logging.getLogger(__name__) | |
| logs.setLevel(logging.INFO) | |
| logs.propagate = False | |
| if not logs.handlers: | |
| Path("./log_files").mkdir(exist_ok=True) | |
| # Logging info to log file | |
| file = logging.FileHandler("./log_files/npmi.log") | |
| fileformat = logging.Formatter("%(asctime)s:%(message)s") | |
| file.setLevel(logging.INFO) | |
| file.setFormatter(fileformat) | |
| # Logging debug messages to stream | |
| stream = logging.StreamHandler() | |
| streamformat = logging.Formatter("[data_measurements_tool] %(message)s") | |
| stream.setLevel(logging.WARNING) | |
| stream.setFormatter(streamformat) | |
| logs.addHandler(file) | |
| logs.addHandler(stream) | |
| _NUM_BATCHES = 500 | |
class nPMI:
    """Pairwise (normalized) pointwise mutual information between a subgroup
    term and every word in the vocabulary, computed from per-sentence word
    co-occurrence counts.

    Expected inputs (as used by this class — confirm against callers):
      - vocab_counts_df: DataFrame indexed by vocabulary word, with at least
        "count" and "proportion" columns.
      - tokenized_df: DataFrame with a column (default "tokenized_text")
        holding the token list for each sentence.
    """

    # TODO: Expand beyond pairwise

    def __init__(
        self,
        vocab_counts_df,
        tokenized_df,
        tokenized_col_name="tokenized_text",
        num_batches=_NUM_BATCHES,
    ):
        """
        :param vocab_counts_df: per-word counts/proportions, indexed by word.
        :param tokenized_df: one row per sentence with its token list.
        :param tokenized_col_name: column of tokenized_df holding the tokens.
        :param num_batches: number of sentence batches used when binarizing.
        """
        logs.info("Initiating npmi class.")
        logs.info("vocab is")
        logs.info(vocab_counts_df)
        self.vocab_counts_df = vocab_counts_df
        logs.info("tokenized is")
        self.tokenized_df = tokenized_df
        logs.info(self.tokenized_df)
        self.tokenized_col_name = tokenized_col_name
        # Bug fix: num_batches was accepted but never stored, so the argument
        # was silently ignored and binarize_words_in_sentence always used the
        # module constant. Store it; the default preserves the old behavior.
        self.num_batches = num_batches
        # self.mlb_list holds num batches x num_sentences
        self.mlb_list = []

    def binarize_words_in_sentence(self):
        """Populate self.mlb_list with one binary (sentences x vocab)
        occurrence matrix per batch of sentences."""
        logs.info("Creating co-occurrence matrix for PMI calculations.")
        # Batch boundaries over the sentence rows. Uses the (now honored)
        # num_batches passed to the constructor instead of _NUM_BATCHES.
        batches = np.linspace(
            0, self.tokenized_df.shape[0], self.num_batches
        ).astype(int)
        i = 0
        # Creates list of size (# batches x # sentences)
        while i < len(batches) - 1:
            # Makes a sparse matrix (shape: # sentences x # words),
            # with the occurrence of each word per sentence.
            mlb = MultiLabelBinarizer(classes=self.vocab_counts_df.index)
            logs.info(
                "%s of %s sentence binarize batches." % (str(i), str(len(batches)))
            )
            # Returns series: batch size x num_words
            mlb_series = mlb.fit_transform(
                self.tokenized_df[self.tokenized_col_name][batches[i] : batches[i + 1]]
            )
            i += 1
            self.mlb_list.append(mlb_series)

    def calc_cooccurrences(self, subgroup, subgroup_idx):
        """Count, for every vocabulary word, how often it co-occurs (per
        sentence) with the given subgroup word.

        :param subgroup: the subgroup word (string).
        :param subgroup_idx: the subgroup's column index in the binarized
            vocabulary matrix (position in vocab_counts_df.index).
        :return: DataFrame of co-occurrence counts, one row per vocab word.
        """
        initialize = True
        coo_df = None
        # Big computation here! Should only happen once.
        logs.info(
            "Approaching big computation! Here, we binarize all words in the sentences, making a sparse matrix of sentences."
        )
        if not self.mlb_list:
            self.binarize_words_in_sentence()
        for batch_id in range(len(self.mlb_list)):
            logs.info(
                "%s of %s co-occurrence count batches"
                % (str(batch_id), str(len(self.mlb_list)))
            )
            # List of all the sentences (list of vocab) in that batch
            batch_sentence_row = self.mlb_list[batch_id]
            # Dataframe of # sentences in batch x vocabulary size
            sent_batch_df = pd.DataFrame(batch_sentence_row)
            # Subgroup counts per-sentence for the given batch
            subgroup_df = sent_batch_df[subgroup_idx]
            subgroup_df.columns = [subgroup]
            # Remove the sentences where the count of the subgroup is 0.
            # This way we have less computation & resources needs.
            subgroup_df = subgroup_df[subgroup_df > 0]
            logs.info("Removing 0 counts, subgroup_df is")
            logs.info(subgroup_df)
            mlb_subgroup_only = sent_batch_df[sent_batch_df[subgroup_idx] > 0]
            logs.info("mlb subgroup only is")
            logs.info(mlb_subgroup_only)
            # Create cooccurrence matrix for the given subgroup and all words.
            logs.info("Now we do the T.dot approach for co-occurrences")
            batch_coo_df = pd.DataFrame(mlb_subgroup_only.T.dot(subgroup_df))
            # Creates a batch-sized dataframe of co-occurrence counts.
            # Note these could just be summed rather than be batch size.
            if initialize:
                coo_df = batch_coo_df
            else:
                # Accumulate counts across batches; fill_value=0 so words
                # absent from one batch still sum correctly.
                coo_df = coo_df.add(batch_coo_df, fill_value=0)
            logs.info("coo_df is")
            logs.info(coo_df)
            initialize = False
        logs.info("Returning co-occurrence matrix")
        logs.info(coo_df)
        return pd.DataFrame(coo_df)

    def calc_paired_metrics(self, subgroup_pair, subgroup_npmi_dict):
        """
        Calculates nPMI metrics between paired subgroups.
        Special handling for a subgroup paired with itself.
        :param subgroup_pair: pair (iterable of two) of subgroup words.
        :param subgroup_npmi_dict: maps subgroup -> (cooc_df, pmi_df, npmi_df)
            as returned by calc_metrics.
        :return: dict with "npmi-bias", "npmi", "pmi" and "count" DataFrames.
        """
        paired_results_dict = {"npmi": {}, "pmi": {}, "count": {}}
        # Canonical ordering. This is done previously, but just in case...
        subgroup1, subgroup2 = sorted(subgroup_pair)
        vocab_cooc_df1, pmi_df1, npmi_df1 = subgroup_npmi_dict[subgroup1]
        logs.info("vocab cooc")
        logs.info(vocab_cooc_df1)
        if subgroup1 == subgroup2:
            # A subgroup paired with itself shares all of its frames.
            shared_npmi_df = npmi_df1
            shared_pmi_df = pmi_df1
            shared_vocab_cooc_df = vocab_cooc_df1
        else:
            vocab_cooc_df2, pmi_df2, npmi_df2 = subgroup_npmi_dict[subgroup2]
            logs.info("vocab cooc2")
            logs.info(vocab_cooc_df2)
            # Note that lsuffix and rsuffix should not come into play.
            shared_npmi_df = npmi_df1.join(
                npmi_df2, how="inner", lsuffix="1", rsuffix="2"
            )
            shared_pmi_df = pmi_df1.join(pmi_df2, how="inner", lsuffix="1", rsuffix="2")
            shared_vocab_cooc_df = vocab_cooc_df1.join(
                vocab_cooc_df2, how="inner", lsuffix="1", rsuffix="2"
            )
            shared_vocab_cooc_df = shared_vocab_cooc_df.dropna()
            shared_vocab_cooc_df = shared_vocab_cooc_df[
                shared_vocab_cooc_df.index.notnull()
            ]
            logs.info("shared npmi df")
            logs.info(shared_npmi_df)
            logs.info("shared vocab df")
            logs.info(shared_vocab_cooc_df)
        # Bias is the per-word difference of the two subgroups' nPMI scores.
        npmi_bias = (
            shared_npmi_df[subgroup1 + "-npmi"] - shared_npmi_df[subgroup2 + "-npmi"]
        )
        paired_results_dict["npmi-bias"] = npmi_bias.dropna()
        paired_results_dict["npmi"] = shared_npmi_df.dropna()
        paired_results_dict["pmi"] = shared_pmi_df.dropna()
        paired_results_dict["count"] = shared_vocab_cooc_df.dropna()
        return paired_results_dict

    def calc_metrics(self, subgroup):
        """Compute co-occurrence counts, PMI, and nPMI for one subgroup word.

        :param subgroup: the subgroup word; must be in vocab_counts_df.index.
        :return: (vocab_cooc_df, pmi_df, npmi_df) tuple of DataFrames.
        """
        # Index of the subgroup word in the sparse vector
        subgroup_idx = self.vocab_counts_df.index.get_loc(subgroup)
        logs.info("Calculating co-occurrences...")
        df_coo = self.calc_cooccurrences(subgroup, subgroup_idx)
        vocab_cooc_df = self.set_idx_cols(df_coo, subgroup)
        logs.info(vocab_cooc_df)
        logs.info("Calculating PMI...")
        pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
        logs.info(pmi_df)
        logs.info("Calculating nPMI...")
        npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
        logs.info(npmi_df)
        return vocab_cooc_df, pmi_df, npmi_df

    def set_idx_cols(self, df_coo, subgroup):
        """Re-index co-occurrence counts by vocabulary word and name the
        count column after the subgroup.

        :param df_coo: Co-occurrence counts for subgroup, length is num_words
        :return: DataFrame indexed by word with one "<subgroup>-count" column.
        """
        count_df = df_coo.set_index(self.vocab_counts_df.index)
        count_df.columns = [subgroup + "-count"]
        count_df[subgroup + "-count"] = count_df[subgroup + "-count"].astype(int)
        return count_df

    def calc_PMI(self, vocab_cooc_df, subgroup):
        """
        # PMI(x;y) = h(y) - h(y|x)
        #          = h(subgroup) - h(subgroup|word)
        #          = log (p(subgroup|word) / p(subgroup))
        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
        """
        # Calculation of p(subgroup)
        subgroup_prob = self.vocab_counts_df.loc[subgroup]["proportion"]
        # Calculation of p(subgroup|word) = count(subgroup,word) / count(word)
        # Because the inidices match (the vocab words),
        # this division doesn't need to specify the index (I think?!)
        p_subgroup_g_word = (
            vocab_cooc_df[subgroup + "-count"] / self.vocab_counts_df["count"]
        )
        logs.info("p_subgroup_g_word is")
        logs.info(p_subgroup_g_word)
        pmi_df = pd.DataFrame()
        # log(0) yields -inf here; inf/NaN rows are dropped before returning.
        pmi_df[subgroup + "-pmi"] = np.log(p_subgroup_g_word / subgroup_prob)
        # Note: A potentially faster solution for adding count, npmi,
        # can be based on this zip idea:
        # df_test['size_kb'],  df_test['size_mb'], df_test['size_gb'] =
        # zip(*df_test['size'].apply(sizes))
        return pmi_df.dropna()

    def calc_nPMI(self, pmi_df, vocab_cooc_df, subgroup):
        """
        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
        #                                           = -log(p(word|subgroup)p(word))
        """
        # p(word|subgroup): word's share of all subgroup co-occurrences.
        p_word_g_subgroup = vocab_cooc_df[subgroup + "-count"] / sum(
            vocab_cooc_df[subgroup + "-count"]
        )
        # p(word): looked up from the vocabulary proportions by word (x.name
        # is the row's index label, i.e. the word).
        p_word = pmi_df.apply(
            lambda x: self.vocab_counts_df.loc[x.name]["proportion"], axis=1
        )
        normalize_pmi = -np.log(p_word_g_subgroup * p_word)
        npmi_df = pd.DataFrame()
        npmi_df[subgroup + "-npmi"] = pmi_df[subgroup + "-pmi"] / normalize_pmi
        return npmi_df.dropna()