"""Temporary placeholder to load
"""
from dataclasses import dataclass
from typing import List

import pandas as pd
import numpy as np


@dataclass
class Domain:
    """A named domain together with its sequence string."""

    name: str
    sequence: str

    def to_dict(self) -> dict:
        """Return the domain as a plain dict with 'name' and 'sequence' keys."""
        as_mapping = dict(name=self.name, sequence=self.sequence)
        return as_mapping


# Names of the per-domain columns that are looked up on a row when a module
# is parsed; the order here is the order domains are stored in a module.
DOMAIN_NAMES = [
    'Condensation',
    'Heterocyclization',
    'AMP-binding',
    'PCP',
    'Epimerization',
    'Thioesterase',
]

@dataclass
class NRPSModule:
    """A module: an id, the domains found for it, and start/final flags.

    Instances are normally built with :meth:`parse`, which derives
    ``is_start`` and ``is_final`` from the names of the domains present.
    """

    module_id: int
    domains_list: List[Domain]
    is_start: bool = False
    is_final: bool = False

    # Private helpers, deliberately left un-annotated so that the dataclass
    # machinery does not turn them into fields.
    # A module containing any of these domains is not a start module.
    _ELONGATION_DOMAINS = ('Condensation', 'Heterocyclization')
    # A module containing this domain is a final module.
    _FINAL_DOMAIN = 'Thioesterase'

    @classmethod
    def parse(cls, row):
        """Build a module from one table row.

        Args:
            row: object supporting ``row[domain_name]`` for every name in
                ``DOMAIN_NAMES`` and exposing a ``module_id`` attribute
                (presumably a pandas Series; verify against the caller).

        Returns:
            A new instance holding one ``Domain`` per string-valued domain
            cell, in ``DOMAIN_NAMES`` order. ``is_start`` is True when neither
            'Condensation' nor 'Heterocyclization' is present; ``is_final`` is
            True when 'Thioesterase' is present.
        """
        domains_list = []
        for domain_name in DOMAIN_NAMES:
            seq = row[domain_name]
            # Only string cells count as a present domain; any other value
            # (presumably NaN for an empty cell) means the domain is absent.
            if isinstance(seq, str):
                domains_list.append(Domain(domain_name, seq))

        present = {domain.name for domain in domains_list}
        is_start = present.isdisjoint(cls._ELONGATION_DOMAINS)
        is_final = cls._FINAL_DOMAIN in present
        return cls(row.module_id, domains_list, is_start, is_final)

    def to_list(self) -> List[dict]:
        """Return the domains as a list of dicts (one ``Domain.to_dict()`` each)."""
        return [domain.to_dict() for domain in self.domains_list]

    def to_start_module(self) -> "NRPSModule":
        """Return a start-module version of this module.

        A module that is already a start module is returned unchanged (the
        same object). Otherwise a new module with the same id is created
        without the 'Condensation', 'Heterocyclization' and 'Thioesterase'
        domains, flagged ``is_start=True`` and ``is_final=False``.
        """
        if self.is_start:
            return self
        dropped = self._ELONGATION_DOMAINS + (self._FINAL_DOMAIN,)
        kept = [domain for domain in self.domains_list if domain.name not in dropped]
        return NRPSModule(self.module_id, kept, is_start=True, is_final=False)


def load_monomers(filename: str = "./data/monomers_list.txt") -> list[str]:
    """
    Loads monomer names from a text file, one name per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are skipped.

    Args:
        filename: path of the file to read (decoded as UTF-8).

    Returns:
        The non-empty, stripped lines in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    # The encoding is given explicitly: without it open() uses the platform's
    # locale encoding, so the same file could decode differently per machine.
    with open(filename, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [name for name in stripped_lines if name]


# Login using e.g. `huggingface-cli login` to access this dataset
class PseudoGenerationWrapper:
    """
    Let's start with the wrapper which picks one of the modules randomly from the predefined collection.

    On construction a table of known modules is read and grouped per monomer
    into 'start', 'elongation' and 'final' lists; ``suggest_module`` then
    draws one module from the matching list with ``np.random.choice``.
    """

    # Default location of the modules table read by __init__.
    DEFAULT_DATASET_PATH = "hf://datasets/latticetower/nrps_modules_asdb4.0/nrps_modules_info_cleaned.csv"

    def __init__(self, dataset_path: str = DEFAULT_DATASET_PATH) -> None:
        """Read the modules table and index it by monomer.

        Args:
            dataset_path: anything ``pd.read_csv`` accepts; defaults to the
                dataset location that was previously hard-coded here.
        """
        nrp_modules_df = pd.read_csv(dataset_path)
        self.monomer2modules_list = self.prepare_data(nrp_modules_df)

    def prepare_data(self, df: pd.DataFrame) -> dict:
        """Group the rows of ``df`` by monomer and module role.

        Args:
            df: table with a ``monomer_name`` column plus whatever
                ``NRPSModule.parse`` reads from each row.

        Returns:
            ``{monomer: {'start': [...], 'elongation': [...], 'final': [...]}}``
            where each list holds the parsed modules of that role.
        """
        monomer2modules = {}
        for monomer, monomer_rows in df.groupby("monomer_name"):
            modules_lists = {'start': [], 'elongation': [], 'final': []}
            # Let's suppose that module composition differs in ways other than
            # the presence/absence of specific domains (not yet known), so the
            # three roles are kept apart.
            for _, row in monomer_rows.iterrows():
                nrps_module = NRPSModule.parse(row)
                # 'start' wins over 'final' when a module carries both flags.
                if nrps_module.is_start:
                    role = 'start'
                elif nrps_module.is_final:
                    role = 'final'
                else:
                    role = 'elongation'
                modules_lists[role].append(nrps_module)
            monomer2modules[monomer] = modules_lists
        return monomer2modules

    @staticmethod
    def _pick(candidate_modules):
        """Draw one module at random from a non-empty sequence."""
        # np.random.choice(..., 1) returns a length-1 array; unwrap it.
        return np.random.choice(candidate_modules, 1)[0]

    def _pick_elongation(self, monomer, monomer_dict, wanted):
        """Draw a random elongation module, or fail naming the ``wanted`` role."""
        candidate_modules = monomer_dict['elongation']
        if len(candidate_modules) == 0:
            # AssertionError is kept for backward compatibility; it is raised
            # explicitly so the check is not stripped under ``python -O``.
            raise AssertionError(f"{monomer} has no known information on {wanted} modules")
        return self._pick(candidate_modules)

    def suggest_module(self, monomer: str, is_start=False, is_final=False):
        """Return a randomly chosen known module for ``monomer``.

        This is a deliberate shortcut: nothing is generated, a module is
        simply drawn from the list of known ones.

        Args:
            monomer: monomer name; must be a key of ``monomonomer2modules_list``.
            is_start: request a start module. If none is known, a random
                elongation module converted with ``to_start_module()`` is
                used instead. Takes precedence over ``is_final``.
            is_final: request a final module. If none is known, a random
                elongation module is returned as is.

        Returns:
            The chosen module's ``to_list()`` result.

        Raises:
            AssertionError: if the monomer is unknown or has no suitable
                modules to draw from.
        """
        if monomer not in self.monomer2modules_list:
            # Explicit raise (same type and message as the former assert) so
            # that validation also runs under ``python -O``.
            raise AssertionError(f"{monomer} is unknown")

        monomer_dict = self.monomer2modules_list[monomer]
        # In fact start/final modules might simply be derived (for example by
        # omitting the condensation domain and calling the result a starting
        # module), but for simplicity start, elongation and final modules are
        # distinguished. The case where only one module is present is not
        # handled: peptides consisting of a single monomer are considered
        # uninteresting and ignored for now.
        if is_start:
            if len(monomer_dict['start']) > 0:
                return self._pick(monomer_dict['start']).to_list()
            fallback = self._pick_elongation(monomer, monomer_dict, 'starting')
            return fallback.to_start_module().to_list()

        if is_final:
            if len(monomer_dict['final']) > 0:
                return self._pick(monomer_dict['final']).to_list()
            return self._pick_elongation(monomer, monomer_dict, 'final').to_list()

        return self._pick_elongation(monomer, monomer_dict, 'elongation').to_list()


# Shared module-level instance. Creating it runs PseudoGenerationWrapper.__init__,
# which reads the modules CSV with pd.read_csv, so importing this file performs that load.
module_generator = PseudoGenerationWrapper()