# Loads the full codebook table from BigQuery at import time and exposes,
# for every task listed in ``task_names``, two module-level dictionaries:
# one mapping code -> name and one mapping index -> code.

import json
import os

import pandas as pd
from google.cloud import bigquery

# Credentials are read as a JSON document from the environment.
service_account_info = json.loads(os.environ["GCP_SERVICE_ACCOUNT_JSON"])
client = bigquery.Client.from_service_account_info(service_account_info)

query = "SELECT * FROM `upheld-magpie-314312.codebooks.codebooks_full`"
query_job = client.query(query)
df_codebooks = query_job.result().to_dataframe()


def _rows_for_task(df: pd.DataFrame, task: str) -> pd.DataFrame:
    """Return the rows of *df* whose "task" equals *task*, sorted by "index"."""
    matching = df[df["task"] == task]
    return matching.sort_values(by="index")


def get_label_names(df: pd.DataFrame, task: str) -> dict:
    """Build the code -> name mapping for one task.

    Args:
        df: Codebook table with at least "task", "index", "code" and "name"
            columns.
        task: Value of the "task" column to select.

    Returns:
        A dict keyed by "code" with the matching "name" as value, filled in
        ascending "index" order.
    """
    rows = _rows_for_task(df, task)
    return {code: name for code, name in zip(rows["code"], rows["name"])}


def get_num_dict(df: pd.DataFrame, task: str) -> dict:
    """Build the index -> code mapping for one task.

    Args:
        df: Codebook table with at least "task", "index" and "code" columns.
        task: Value of the "task" column to select.

    Returns:
        A dict keyed by "index" with the matching "code" as value, filled in
        ascending "index" order.
    """
    rows = _rows_for_task(df, task)
    return {index: code for index, code in zip(rows["index"], rows["code"])}


# NOTE: the Babel pipeline uses this same logic.
#   key:   task type, as stored in codebooks.codebooks_full
#   value: (label-names variable name, num-dict variable name)
# TODO: a single dict-like object could replace all of these variables.
task_names = {
    "CAP": ("CAP_LABEL_NAMES", "CAP_NUM_DICT"),
    "CAP_MINOR": ("CAP_MIN_LABEL_NAMES", "CAP_MIN_NUM_DICT"),
    "CAP_MEDIA": ("CAP_MEDIA_LABEL_NAMES", "CAP_MEDIA_NUM_DICT"),
    "CAP_MEDIA2": ("CAP_MEDIA2_LABEL_NAMES", "CAP_MEDIA2_NUM_DICT"),
    "CAP_MINOR_MEDIA": ("CAP_MIN_MEDIA_LABEL_NAMES", "CAP_MIN_MEDIA_NUM_DICT"),
    "MANIFESTO": ("MANIFESTO_LABEL_NAMES", "MANIFESTO_NUM_DICT"),
    "SENTIMENT": ("SENTIMENT_LABEL_NAMES", "SENTIMENT_NUM_DICT"),
    "EMOTION6": ("EMOTION6_LABEL_NAMES", "EMOTION6_NUM_DICT"),
    "EMOTION9": ("EMOTION9_V2_LABEL_NAMES", "EMOTION9_V2_NUM_DICT"),
    "EMOTION9_LEGACY": ("EMOTION9_LABEL_NAMES", "EMOTION9_NUM_DICT"),
    "ILLFRAMES_MIGRATION": (
        "ILLFRAMES_MIGRATION_LABEL_NAMES",
        "ILLFRAMES_MIGRATION_NUM_DICT",
    ),
    "ILLFRAMES_COVID": ("ILLFRAMES_COVID_LABEL_NAMES", "ILLFRAMES_COVID_NUM_DICT"),
    "ILLFRAMES_WAR": ("ILLFRAMES_WAR_LABEL_NAMES", "ILLFRAMES_WAR_NUM_DICT"),
    "ONTOLISST": ("ONTOLISST_LABEL_NAMES", "ONTOLISST_NUM_DICT"),
}

# Publish both dictionaries of every task as module-level variables under
# the names given in ``task_names``.
for task, var_names in task_names.items():
    label_name_var, num_dict_var = var_names
    globals().update(
        {
            label_name_var: get_label_names(df_codebooks, task),
            num_dict_var: get_num_dict(df_codebooks, task),
        }
    )


def _keep_text_after_last_separator(label_names: dict) -> dict:
    """Return a copy of *label_names* keeping only the text after the last "- "."""
    return {
        code: label_name.rpartition("- ")[2]
        for code, label_name in label_names.items()
    }


# Prettier minor-topic labels: drop everything up to and including the last
# "- " in each name (presumably a major-topic prefix -- confirm against the
# codebook contents).
CAP_MIN_LABEL_NAMES = _keep_text_after_last_separator(CAP_MIN_LABEL_NAMES)
CAP_MIN_MEDIA_LABEL_NAMES = _keep_text_after_last_separator(CAP_MIN_MEDIA_LABEL_NAMES)