import streamlit as st
import numpy as np
import pandas as pd
from PIL import Image
import ujson as json
import pickle as pk
from plotnine import *

# -- Set page config
apptitle = "PhenoGenius"

st.set_page_config(
    page_title=apptitle,
    page_icon=":genie:",
    layout="wide",
    initial_sidebar_state="auto",
)

# -- Set sidebar
image_pg = Image.open("data/img/phenogenius.png")
st.sidebar.image(image_pg, caption=None, width=100)

st.sidebar.title("PhenoGenius")

st.sidebar.header(
    "Learning phenotypic patterns in genetic diseases by symptom interaction modeling"
)

st.sidebar.markdown(
    """
This webapp presents symptom interaction models in genetic diseases to provide:
- Standardized clinical descriptions
- Interpretable matches between symptoms and genes

Source code is available on GitHub:
[https://github.com/kyauy/PhenoGenius](https://github.com/kyauy/PhenoGenius)

Last update: 2024-07-15

PhenoGenius is a collaborative project from:
"""
)

image_uga = Image.open("data/img/logo-uga.png")
st.sidebar.image(image_uga, caption=None, width=95)

image_seqone = Image.open("data/img/logo-seqone.png")
st.sidebar.image(image_seqone, caption=None, width=95)

image_miai = Image.open("data/img/logoMIAI-rvb.png")
st.sidebar.image(image_miai, caption=None, width=95)

image_chuga = Image.open("data/img/logo-chuga.png")
st.sidebar.image(image_chuga, caption=None, width=60)


@st.cache_data(max_entries=50)
def convert_df(df):
    # Serialize a DataFrame as tab-separated UTF-8 bytes for st.download_button
    return df.to_csv(sep="\t").encode("utf-8")


@st.cache_data(max_entries=50)
def load_data():
    # Gene x HPO weighted one-hot matrix (genes as rows, HPO terms as columns)
    matrix = pd.read_csv(
        "data/resources/ohe_all_thesaurus_weighted_2024.tsv.gz",
        sep="\t",
        compression="gzip",
        index_col=0,
    )
    return matrix


@st.cache_data(hash_funcs={"Pickle": lambda _: None}, max_entries=50)
def load_nmf_model():
    # Pre-trained NMF model and the corresponding reduced gene x topic matrix
    with open("data/resources/pheno_NMF_390_model_42_2024.pkl", "rb") as pickle_file:
        pheno_NMF = pk.load(pickle_file)
    with open("data/resources/pheno_NMF_390_matrix_42_2024.pkl", "rb") as pickle_file:
        reduced = pk.load(pickle_file)
    return pheno_NMF, reduced


@st.cache_data(max_entries=50)
def symbol_to_id_to_dict():
    # from NCBI
    ncbi_df = pd.read_csv("data/resources/Homo_sapiens.gene_info.gz", sep="\t")
    ncbi_df = ncbi_df[ncbi_df["#tax_id"] == 9606]
    ncbi_df_ncbi = ncbi_df.set_index("Symbol")
    ncbi_to_dict_ncbi = ncbi_df_ncbi["GeneID"].to_dict()
    ncbi_df = ncbi_df.set_index("GeneID")
    ncbi_to_dict = ncbi_df["Symbol"].to_dict()
    return ncbi_to_dict_ncbi, ncbi_to_dict


@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
def load_hp_ontology():
    with open("data/resources/hpo_obo_2024.json") as json_data:
        data_dict = json.load(json_data)
    return data_dict


@st.cache_data(max_entries=50)
def hpo_description_to_id():
    # Map HPO term names to their HP identifiers
    data_dict = {}
    for key, value in hp_onto.items():
        data_dict[value["name"]] = key
    return data_dict


@st.cache_data(max_entries=50)
def load_topic_data():
    topic = pd.read_csv(
        "data/resources/main_topics_hpo_390_42_filtered_norm_004_2024.tsv",
        sep="\t",
        index_col=0,
    )
    return topic


@st.cache_data(hash_funcs={"_json.Scanner": hash}, max_entries=50)
def load_similarity_dict():
    with open("data/resources/similarity_dict_threshold_80_2024.json") as json_data:
        data_dict = json.load(json_data)
    return data_dict


def get_symbol(gene):
    if gene in symbol.keys():
        return symbol[gene]


def get_hpo_name(hpo):
    names = {}
    if hpo in hp_onto.keys():
        names[hpo] = hp_onto[hpo]["name"]
    return names


def get_hpo_name_only(hpo):
    if hpo in hp_onto.keys():
        return hp_onto[hpo]["name"]
    else:
        return None


def get_hpo_name_list(hpo_list, hp_onto):
    names = {}
    for hpo in hpo_list:
        if hpo in hp_onto.keys():
            names[hpo] = hp_onto[hpo]["name"]
    return names


def get_similar_terms(hpo_list, similarity_terms_dict):
    # Extend the HPO list with terms whose similarity score is above 0.8,
    # weighting each added term by its normalized similarity
    hpo_list_w_simi = {}
    for term in hpo_list:
        hpo_list_w_simi[term] = 1
        if term in similarity_terms_dict.keys():
            for key, value in similarity_terms_dict[term].items():
                if value > 0.8:
                    score = value / len(similarity_terms_dict[term].keys())
                    if key in hpo_list_w_simi.keys():
                        if score > hpo_list_w_simi[key]:
                            hpo_list_w_simi[key] = score
                    else:
                        hpo_list_w_simi[key] = score
    hpo_list_all = hpo_list_w_simi.keys()
    return hpo_list_w_simi, list(hpo_list_all)


def score(hpo_list, matrix):
    # Create a copy of the filtered matrix to avoid SettingWithCopyWarning
    matrix_filter = matrix[hpo_list].copy()
    # Use .loc to safely add or modify columns in the copy of the DataFrame
    matrix_filter.loc[:, "sum"] = matrix_filter.sum(axis=1)
    matrix_filter.loc[:, "gene_symbol"] = matrix_filter.index.to_series().apply(
        get_symbol
    )
    # Return the modified DataFrame sorted by 'sum'
    return matrix_filter.sort_values("sum", ascending=False)


def score_sim_add(hpo_list_add, matrix, sim_dict):
    # Ensure matrix_filter is a copy to avoid modifying the original DataFrame
    matrix_filter = matrix[hpo_list_add].copy()
    # Iterate through sim_dict to weight matrix_filter columns
    for key, value in sim_dict.items():
        if key in matrix_filter.columns:
            matrix_filter[key] = (
                matrix_filter[key] * value
            )  # Direct column assignment is fine here
    # Calculate the sum and assign gene_symbol, using direct assignment for these operations
    matrix_filter["sum"] = matrix_filter.sum(axis=1)
    matrix_filter["gene_symbol"] = matrix_filter.index.to_series().apply(get_symbol)
    # Return the DataFrame sorted by 'sum'
    return matrix_filter.sort_values("sum", ascending=False)


def get_phenotype_specificity(gene_diag, data_patient):
    rank = data_patient.loc[int(ncbi[gene_diag]), "rank"]
    max_rank = data_patient["rank"].max()
    if rank == max_rank:
        return "D - the reported phenotype is NOT consistent with what is expected for the gene/genomic region, or not consistent in general."
    elif rank < 41:
        return "A - the reported phenotype is highly specific and relatively unique to the gene (top 40, 50% of diagnoses in the PhenoGenius cohort)."
    elif rank < 250:
        return "B - the reported phenotype is consistent with the gene and highly specific, but not necessarily unique to the gene (top 250, 75% of diagnoses in the PhenoGenius cohort)."
    else:
        return "C - the phenotype is reported with limited association with the gene, not highly specific and/or with high genetic heterogeneity."

def get_relatives_list(hpo_list, hp_onto):
    # Collect each HPO term together with its direct parents and children
    all_list = []
    for hpo in hpo_list:
        all_list.append(hpo)
        if hpo in hp_onto.keys():
            for parent in hp_onto[hpo]["parents"]:
                all_list.append(parent)
            for children in hp_onto[hpo]["childrens"]:
                all_list.append(children)
    return list(set(all_list))


def get_hpo_id(hpo_list):
    # Convert a list of HPO term names into a comma-separated string of HP identifiers
    hpo_id = []
    for description in hpo_list:
        hpo_id.append(hp_desc_id[description])
    return ",".join(hpo_id)


hp_onto = load_hp_ontology()
hp_desc_id = hpo_description_to_id()
ncbi, symbol = symbol_to_id_to_dict()

with st.form("my_form"):
    c1, c2 = st.columns(2)
    with c1:
        hpo_raw = st.multiselect(
            "Select interactively your HPOs or...",
            list(hp_desc_id.keys()),
            ["Renal cyst", "Hepatic cysts"],
        )
    with c2:
        hpo = st.text_input(
            "copy/paste your HPOs, separated with commas",
            "HP:0000107,HP:0001407",
        )
    gene_diag_input = st.multiselect(
        "Optional: provide an HGNC gene symbol to be tested",
        options=list(ncbi.keys()),
        default=["PKD1"],
        max_selections=1,
    )
    submit_button = st.form_submit_button(
        label="Submit",
    )

if submit_button:
    if hpo_raw != ["Renal cyst", "Hepatic cysts"] and len(hpo_raw) > 0:
        hpo = get_hpo_id(hpo_raw)

    data = load_data()
    pheno_NMF, reduced = load_nmf_model()
    topic = load_topic_data()
    similarity_terms_dict = load_similarity_dict()

    hpo_list_ini = hpo.strip().split(",")

    if gene_diag_input:
        if gene_diag_input[0] in ncbi.keys():
            gene_diag = gene_diag_input[0]
        else:
            st.write(
                gene_diag_input[0]
                + " gene is not in our database. Please check the gene name (it needs to be in CAPITAL format)."
            )
            gene_diag = None
    else:
        gene_diag = None

    hpo_list_up = []
    for hpo in hpo_list_ini:
        if hpo in ["HP:0000001"]:
            pass
        elif len(hpo) != 10:
            st.write(
                "Incorrect HPO format: "
                + hpo
                + ". Please check (7-digit terms with prefix HP:, separated by commas)."
            )
        elif hpo not in data.columns:
            st.write(hpo + " is not available in the current database. Please modify.")
        else:
            if data[hpo].astype(bool).sum(axis=0) != 0:
                hpo_list_up.append(hpo)
            else:
                # Walk up the ontology until a parent with gene associations is found
                hpo_to_test = hp_onto[hpo]["direct_parent"][0]
                while (
                    data[hpo_to_test].astype(bool).sum(axis=0) == 0
                    and hpo_to_test not in ["HP:0000001"]
                ):
                    hpo_to_test = hp_onto[hpo_to_test]["direct_parent"][0]
                if hpo_to_test in ["HP:0000001"]:
                    st.write(
                        "No gene-HPO association was found for "
                        + hpo
                        + " and its parents."
                    )
                else:
                    hpo_list_up.append(hpo_to_test)
                    st.write(
                        "We replaced: ",
                        hpo,
                        " by ",
                        hpo_to_test,
                        "-",
                        get_hpo_name(hpo_to_test),
                    )

    hpo_list = list(set(hpo_list_up))
    del hpo_list_up

    if hpo_list:
        with st.expander("See HPO inputs"):
            st.write(get_hpo_name_list(hpo_list_ini, hp_onto))
        del hpo_list_ini

        hpo_list_name = get_relatives_list(hpo_list, hp_onto)

        st.header("Clinical description with symptom interaction modeling")

        # Project the patient's HPO profile and an empty (witness) profile
        # into the NMF groups-of-symptoms space
        witness = np.zeros(len(data.columns))
        witness_nmf = np.matmul(pheno_NMF.components_, witness)

        patient = np.zeros(len(data.columns))
        for hpo in hpo_list:
            hpo_index = list(data.columns).index(hpo)
            patient[hpo_index] = 1
        patient_nmf = np.matmul(pheno_NMF.components_, patient)

        witness_sugg_df = (
            pd.DataFrame(reduced)
            .set_index(data.index)
            .apply(lambda x: (x - witness_nmf) ** 2, axis=1)
        )
        patient_sugg_df = (
            pd.DataFrame(reduced)
            .set_index(data.index)
            .apply(lambda x: (x - patient_nmf) ** 2, axis=1)
        )
        case_sugg_df = (patient_sugg_df - witness_sugg_df).sum()

        patient_df_info = pd.DataFrame(case_sugg_df).merge(
            topic, left_index=True, right_index=True
        )
        patient_df_info["mean_score"] = round(
            patient_df_info[0] / (patient_df_info["total_weight"] ** 2), 4
        )
        patient_df_info_write = patient_df_info[
            ["mean_score", "main_term", "n_hpo", "hpo_name", "hpo_list", "weight"]
        ].sort_values("mean_score", ascending=False)

        del case_sugg_df
        del patient_sugg_df
        del witness_sugg_df
        del patient

        with st.expander("See projection in groups of symptoms dimension*"):
            st.dataframe(patient_df_info_write)
            st.write(
                "\\* For interpretability, we report only the top 10% of the 390 groups of interacting symptom associations"
            )
            match_proj_csv = convert_df(patient_df_info_write)
            st.download_button(
                "Download description projection",
                match_proj_csv,
                "clin_desc_projected.tsv",
                "text/csv",
                key="download-csv-proj",
            )

        sim_dict, hpo_list_add_raw = get_similar_terms(hpo_list, similarity_terms_dict)
        hpo_list_add = list(set(hpo_list_add_raw) & set(data.columns.tolist()))

        similar_list = list(set(hpo_list_add) - set(hpo_list))
        similar_list_desc = get_hpo_name_list(similar_list, hp_onto)
        if similar_list_desc:
            with st.expander("See symptoms with similarity > 80%"):
                similar_list_desc_df = pd.DataFrame.from_dict(
                    similar_list_desc, orient="index"
                )
                similar_list_desc_df.columns = ["description"]
                st.write(similar_list_desc_df)
                del similar_list_desc_df
        del similar_list
        del similar_list_desc

        st.header("Phenotype matching")

        results_sum = score(hpo_list, data)
        results_sum["matchs"] = results_sum[hpo_list].astype(bool).sum(axis=1)
        results_sum["score"] = results_sum["matchs"] + results_sum["sum"]
        results_sum["rank"] = (
            results_sum["score"].rank(ascending=False, method="max").astype(int)
        )
        cols = results_sum.columns.tolist()
        cols = cols[-4:] + cols[:-4]
        match = results_sum[cols].sort_values(by=["score"], ascending=False)
        st.dataframe(match[match["score"] > 1.01].drop(columns=["sum"]))

        match_csv = convert_df(match)
        st.download_button(
            "Download matching results",
            match_csv,
            "match.tsv",
            "text/csv",
            key="download-csv-match",
        )

        if gene_diag:
            if int(ncbi[gene_diag]) in results_sum.index:
                p = (
                    ggplot(match, aes("score"))
                    + geom_density()
                    + geom_vline(
                        xintercept=results_sum.loc[int(ncbi[gene_diag]), "score"],
                        linetype="dashed",
                        color="red",
                        size=1.5,
                    )
                    + ggtitle("Matching score distribution")
                    + xlab("Gene matching score")
                    + ylab("% of genes")
                    + theme_bw()
                    + theme(
                        text=element_text(size=12),
                        figure_size=(5, 5),
                        axis_ticks=element_line(colour="black", size=4),
                        axis_line=element_line(colour="black", size=2),
                        axis_text_x=element_text(angle=45, hjust=1),
                        axis_text_y=element_text(angle=60, hjust=1),
                        subplots_adjust={"wspace": 0.1},
                        legend_position=(0.7, 0.35),
                    )
                )
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.pyplot(ggplot.draw(p))
                st.write(
                    "Gene ID rank:",
                    results_sum.loc[int(ncbi[gene_diag]), "rank"],
                    " | ",
                    "Gene ID count:",
                    round(results_sum.loc[int(ncbi[gene_diag]), "sum"], 4),
                )
                st.write(results_sum.loc[[int(ncbi[gene_diag])]])
                st.write(
                    "Gene ID phenotype specificity:",
                    get_phenotype_specificity(gene_diag, results_sum),
                )
                del p
            else:
                st.write("Gene ID rank:", " Gene not available in PhenoGenius database")

        del results_sum
        del match

        st.header("Phenotype matching by similarity of symptoms")

        results_sum_add = score_sim_add(hpo_list_add, data, sim_dict)
        results_sum_add["rank"] = (
            results_sum_add["sum"].rank(ascending=False, method="max").astype(int)
        )
        cols = results_sum_add.columns.tolist()
        cols = cols[-2:] + cols[:-2]
        match_sim = results_sum_add[cols].sort_values(by=["sum"], ascending=False)
        st.dataframe(match_sim[match_sim["sum"] > 0.01])

        match_sim_csv = convert_df(match_sim)
        st.download_button(
            "Download matching results",
            match_sim_csv,
            "match_sim.tsv",
            "text/csv",
            key="download-csv-match-sim",
        )

        if gene_diag:
            if int(ncbi[gene_diag]) in results_sum_add.index:
                p2 = (
                    ggplot(match_sim, aes("sum"))
                    + geom_density()
                    + geom_vline(
                        xintercept=results_sum_add.loc[int(ncbi[gene_diag]), "sum"],
                        linetype="dashed",
                        color="red",
                        size=1.5,
                    )
                    + ggtitle("Matching score distribution")
                    + xlab("Gene matching score")
                    + ylab("% of genes")
                    + theme_bw()
                    + theme(
                        text=element_text(size=12),
                        figure_size=(5, 5),
                        axis_ticks=element_line(colour="black", size=4),
                        axis_line=element_line(colour="black", size=2),
                        axis_text_x=element_text(angle=45, hjust=1),
                        axis_text_y=element_text(angle=60, hjust=1),
                        subplots_adjust={"wspace": 0.1},
                        legend_position=(0.7, 0.35),
                    )
                )
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.pyplot(ggplot.draw(p2))
                st.write(
                    "Gene ID rank:",
                    results_sum_add.loc[int(ncbi[gene_diag]), "rank"],
                    " | ",
                    "Gene ID count:",
                    round(results_sum_add.loc[int(ncbi[gene_diag]), "sum"], 4),
                )
                st.write(
                    "Gene ID phenotype specificity:",
                    get_phenotype_specificity(gene_diag, results_sum_add),
                )
                del p2
            else:
                st.write("Gene ID rank:", " Gene not available in PhenoGenius database")

        del sim_dict
        del hpo_list_add
        del results_sum_add
        del match_sim

        st.header("Phenotype matching by groups of symptoms")

        patient_df = (
            pd.DataFrame(reduced)
            .set_index(data.index)
            .apply(lambda x: sum((x - patient_nmf) ** 2), axis=1)
        )
        witness_df = (
            pd.DataFrame(reduced)
            .set_index(data.index)
            .apply(lambda x: sum((x - witness_nmf) ** 2), axis=1)
        )
        del patient_nmf
        del witness
        del witness_nmf

        case_df = pd.DataFrame(patient_df - witness_df)
        case_df.columns = ["score"]
        case_df["score_norm"] = abs(case_df["score"] - case_df["score"].max())
        # case_df["frequency"] = matrix_frequency["variant_number"]
        case_df["sum"] = case_df["score_norm"]  # + case_df["frequency"]
        case_df_sort = case_df.sort_values(by="sum", ascending=False)
        case_df_sort["rank"] = (
            case_df_sort["sum"].rank(ascending=False, method="max").astype(int)
        )
        case_df_sort["gene_symbol"] = case_df_sort.index.to_series().apply(get_symbol)
        match_nmf = case_df_sort[["gene_symbol", "rank", "sum"]]
        st.dataframe(match_nmf[match_nmf["sum"] > 0.01])

        match_nmf_csv = convert_df(match_nmf)
        st.download_button(
            "Download matching results",
            match_nmf_csv,
            "match_groups.tsv",
            "text/csv",
            key="download-csv-match-groups",
        )

        if gene_diag:
            if int(ncbi[gene_diag]) in case_df_sort.index:
                p3 = (
                    ggplot(match_nmf, aes("sum"))
                    + geom_density()
                    + geom_vline(
                        xintercept=case_df_sort.loc[int(ncbi[gene_diag]), "sum"],
                        linetype="dashed",
                        color="red",
                        size=1.5,
                    )
                    + ggtitle("Matching score distribution")
                    + xlab("Gene matching score")
                    + ylab("% of genes")
                    + theme_bw()
                    + theme(
                        text=element_text(size=12),
                        figure_size=(5, 5),
                        axis_ticks=element_line(colour="black", size=4),
                        axis_line=element_line(colour="black", size=2),
                        axis_text_x=element_text(angle=45, hjust=1),
                        axis_text_y=element_text(angle=60, hjust=1),
                        subplots_adjust={"wspace": 0.1},
                        legend_position=(0.7, 0.35),
                    )
                )
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.pyplot(ggplot.draw(p3))
                st.write(
                    "Gene ID rank:",
                    case_df_sort.loc[int(ncbi[gene_diag]), "rank"],
                    " | ",
                    "Gene ID count:",
                    round(case_df_sort.loc[int(ncbi[gene_diag]), "sum"], 4),
                )
                st.write(
                    "Gene ID phenotype specificity:",
                    get_phenotype_specificity(gene_diag, case_df_sort),
                )
                del p3
            else:
                st.write("Gene ID rank:", " Gene not available in PhenoGenius database")

        del case_df_sort
        del match_nmf
        del case_df
    else:
        st.write(
            "No HPO terms provided in correct format.",
        )