|
import pandas as pd |
|
import ast |
|
import json |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
|
|
|
|
class TaskVisualizations:
    """Build sunburst charts of task counts grouped by area.

    Two data frames are kept: one built from the full task counts and one
    built from the "selected" task counts. Both are joined against the same
    task-to-area table on the ``task`` column.
    """

    def __init__(
        self, task_counts_path, selected_task_counts_path, tasks_with_areas_path
    ):
        """Load both task-count tables and join each with the area table.

        Args:
            task_counts_path: CSV readable by ``pd.read_csv`` holding the
                counts for all tasks (must contain a ``task`` column).
            selected_task_counts_path: CSV with the counts for the selected
                subset (must contain a ``task`` column).
            tasks_with_areas_path: CSV with the task/area table (must contain
                a ``task`` column).
        """
        self.tasks_with_areas_df = self.load_tasks_with_areas_df(
            task_counts_path, tasks_with_areas_path
        )
        self.selected_tasks_with_areas_df = self.load_tasks_with_areas_df(
            selected_task_counts_path, tasks_with_areas_path
        )

    @classmethod
    def load_tasks_with_areas_df(
        cls, task_counts_path, tasks_with_areas_path="data/paperswithcode_tasks.csv"
    ):
        """Read both CSVs and return their join on the ``task`` column.

        ``DataFrame.merge`` defaults to an inner join, so tasks present in
        only one of the two files are dropped.

        Args:
            task_counts_path: path or file-like object with the task counts.
            tasks_with_areas_path: path or file-like object with the
                task/area table.

        Returns:
            The area table's columns followed by the count table's columns,
            one row per matching ``task`` pair.
        """
        task_counts_df = pd.read_csv(task_counts_path)
        raw_tasks_with_areas_df = pd.read_csv(tasks_with_areas_path)
        return raw_tasks_with_areas_df.merge(task_counts_df, on="task")

    @classmethod
    def get_topk_merge_others(cls, df, by_col, val_col, k=10, val_threshold=1000):
        """Keep the ``k`` largest labels and fold everything else into "other".

        A label survives only when it is among the first ``k`` rows after
        sorting by ``val_col`` in descending order AND its value is at least
        ``val_threshold``. Every other row is relabelled ``"other"``, and the
        values are then summed per label.

        Args:
            df: input frame; it is not modified.
            by_col: name of the label column.
            val_col: name of the numeric column to rank and sum by.
            k: number of top rows that are candidates for being kept.
            val_threshold: minimum value a top-k label needs to be kept.

        Returns:
            A frame indexed by ``by_col`` with the single column ``val_col``
            holding the per-label sums.
        """
        # sort_values returns a new frame, so the caller's data is untouched.
        ranked = df.sort_values(val_col, ascending=False)
        top_rows = ranked.iloc[:k]

        # When a label occurs more than once in the top rows, the last
        # occurrence decides (plain dict semantics).
        top_values = dict(zip(top_rows[by_col], top_rows[val_col]))
        kept_labels = {
            label for label, value in top_values.items() if value >= val_threshold
        }

        ranked[by_col] = ranked[by_col].map(
            lambda label: label if label in kept_labels else "other"
        )
        # The string "sum" selects pandas' own group sum; passing the builtin
        # `sum` is deprecated in recent pandas releases.
        return ranked.groupby(by_col).agg({val_col: "sum"})

    @classmethod
    def get_displayed_tasks_with_areas_df(cls, tasks_with_areas_df, min_task_count):
        """Prepare the per-area task table that the sunburst chart displays.

        Rows containing missing values are dropped. Within each ``area`` the
        tasks are reduced with :meth:`get_topk_merge_others` (default ``k``,
        ``val_threshold=min_task_count``), so small or low-ranked tasks end
        up in one ``"other"`` entry per area. Finally the summed count is
        appended to each task label, e.g. ``"other 42"``.

        Args:
            tasks_with_areas_df: frame with ``area``, ``task`` and ``count``
                columns; it is not modified.
            min_task_count: tasks whose count is below this are merged into
                ``"other"``.

        Returns:
            A frame with the columns ``area``, ``task`` (label plus count)
            and ``count``.
        """
        displayed = tasks_with_areas_df.dropna().copy()

        # Vectorised relabel: keep the task name where the count is large
        # enough, otherwise use "other".
        displayed["task"] = displayed["task"].where(
            displayed["count"] >= min_task_count, "other"
        )

        # Select the needed columns explicitly so that `apply` does not
        # operate on the grouping column (deprecated in recent pandas).
        displayed = (
            displayed.groupby("area")[["task", "count"]]
            .apply(
                lambda group: cls.get_topk_merge_others(
                    group, "task", "count", val_threshold=min_task_count
                )
            )
            .reset_index()
        )

        displayed["task"] = displayed["task"] + " " + displayed["count"].apply(str)
        return displayed

    def get_tasks_sunburst(self, min_task_count, which_df="selected"):
        """Return a plotly sunburst figure with the path area -> task.

        Args:
            min_task_count: forwarded to
                :meth:`get_displayed_tasks_with_areas_df`.
            which_df: ``"selected"`` uses the selected-task frame; any other
                value uses the full task frame.

        Returns:
            The figure produced by ``px.sunburst`` with ``count`` as values.
        """
        if which_df == "selected":
            source_df = self.selected_tasks_with_areas_df
        else:
            source_df = self.tasks_with_areas_df

        displayed_tasks_with_areas_df = self.get_displayed_tasks_with_areas_df(
            source_df, min_task_count
        )

        return px.sunburst(
            displayed_tasks_with_areas_df, path=["area", "task"], values="count"
        )
|
|