Spaces:
Running
Running
import json
import os
import sys
import tempfile

import gradio as gr
import pandas as pd
# 1) Extend sys.path before importing the loader: the SMI-TED inference code
#    lives in a sub-folder next to this script rather than in an installed package.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
INFERENCE_PATH = os.path.join(BASE_DIR, "smi-ted", "inference")
sys.path.insert(0, INFERENCE_PATH)

# This import only resolves once INFERENCE_PATH is on sys.path (see above).
from smi_ted_light.load import load_smi_ted

# 2) Load the SMI-TED Light model once, at import time; the request handler
#    below reuses this single module-level instance.
MODEL_DIR = os.path.join(INFERENCE_PATH, "smi_ted_light")
model = load_smi_ted(
    folder=MODEL_DIR,
    ckpt_filename="smi-ted-Light_40.pt",
    vocab_filename="bert_vocab_curated.txt",
)
def _resolve_upload_path(file_obj):
    """Return the filesystem path for an uploaded file.

    The upload may arrive as an object exposing the path via ``.name`` or as
    a plain path string; both forms are accepted.
    """
    return getattr(file_obj, "name", file_obj)


def _write_embeddings_csv(frame):
    """Write *frame* to a fresh ``embeddings.csv`` and return its path.

    Each call gets its own temporary directory so that concurrent requests
    never overwrite one another's download, while the file keeps the
    ``embeddings.csv`` name the UI advertises.
    """
    out_dir = tempfile.mkdtemp(prefix="smi_ted_")
    csv_path = os.path.join(out_dir, "embeddings.csv")
    frame.to_csv(csv_path, index=False)
    return csv_path


def _embed_batch(smiles_list):
    """Encode every SMILES in *smiles_list*.

    Returns ``(records, invalid)``: one dict per input row (ready for
    ``pd.DataFrame``) and the list of inputs the model failed to encode.
    """
    records = []
    invalid = []
    for sm in smiles_list:
        try:
            vec = model.encode(sm, return_torch=True)[0].tolist()
        except Exception:
            # Any encoding failure is treated as an invalid SMILES. The row
            # is kept (flagged) so the output still has one line per input;
            # pandas fills the missing dim_* columns with empty values.
            invalid.append(sm)
            records.append({"smiles": f"SMILES {sm} was invalid"})
        else:
            record = {"smiles": sm}
            record.update({f"dim_{i}": v for i, v in enumerate(vec)})
            records.append(record)
    return records, invalid


def _process_batch(file_obj):
    """Embed every SMILES found in the uploaded CSV; return (message, file update)."""
    # sep=None with the python engine auto-detects the delimiter (";", ",", ...).
    df_in = pd.read_csv(_resolve_upload_path(file_obj), sep=None, engine="python")

    # Look for a "smiles" column, case-insensitively. str() guards against
    # non-string headers, strip() against stray spaces around the name.
    smiles_cols = [c for c in df_in.columns if str(c).strip().lower() == "smiles"]
    if not smiles_cols:
        return (
            "Error: The CSV must have a column named 'Smiles' with the respective SMILES.",
            gr.update(visible=False),
        )

    # Drop blank cells first: astype(str) would otherwise turn them into the
    # literal string "nan" and feed that to the model as if it were a SMILES.
    smiles_list = df_in[smiles_cols[0]].dropna().astype(str).tolist()

    records, invalid_smiles = _embed_batch(smiles_list)
    csv_path = _write_embeddings_csv(pd.DataFrame(records))

    valid = len(smiles_list) - len(invalid_smiles)
    if invalid_smiles:
        msg = (
            f"{valid} SMILES were successfully processed, "
            f"{len(invalid_smiles)} had errors:\n"
            + "\n".join(invalid_smiles)
        )
    else:
        msg = f"Processed batch of {valid} SMILES. Download embeddings.csv."
    return msg, gr.update(value=csv_path, visible=True)


def process_inputs(smiles: str, file_obj):
    """Gradio callback: extract SMI-TED embeddings for one SMILES or a CSV batch.

    Args:
        smiles: SMILES string typed by the user (single mode). Ignored when
            ``file_obj`` is given. ``None`` is treated like an empty string.
        file_obj: Uploaded CSV (batch mode) or ``None``. The CSV must contain
            a column named ``smiles`` (case-insensitive).

    Returns:
        A ``(message, file_update)`` tuple. ``message`` is the JSON-encoded
        embedding (single mode), a batch summary, or an error text.
        ``file_update`` is a ``gr.update`` that shows the generated
        ``embeddings.csv`` on success and hides the download otherwise.
    """
    # Batch mode takes precedence whenever a file was uploaded.
    if file_obj is not None:
        try:
            return _process_batch(file_obj)
        except Exception as e:
            # UI boundary: report the problem to the user instead of crashing.
            return f"Error processing batch: {e}", gr.update(visible=False)

    # Single mode.
    smiles = (smiles or "").strip()
    if not smiles:
        return "Please enter a SMILES or upload a CSV file.", gr.update(visible=False)

    # Only the encoding step is guarded, so that an unrelated failure (e.g.
    # writing the CSV) is not misreported as an invalid SMILES.
    try:
        vec = model.encode(smiles, return_torch=True)[0].tolist()
    except Exception:
        return f"The following input '{smiles}' is not a valid SMILES", gr.update(visible=False)

    cols = ["smiles"] + [f"dim_{i}" for i in range(len(vec))]
    df_out = pd.DataFrame([[smiles] + vec], columns=cols)
    csv_path = _write_embeddings_csv(df_out)
    return json.dumps(vec), gr.update(value=csv_path, visible=True)
# 4) Gradio interface.
# NOTE(review): the source's indentation was lost, so which widgets sit in
# which gr.Row is reconstructed from the widget order and the "left box"
# wording in the help text - confirm against the deployed layout.
with gr.Blocks() as demo:
    # The batch-mode sentence states the real CSV contract: the handler looks
    # up a column named "smiles" (case-insensitive), not the first column.
    gr.Markdown(
        """
        # SMI-TED-Embeddings-Extraction
        **Single mode:** paste a SMILES string in the left box.
        **Batch mode:** upload a CSV file with a column named `smiles` (case-insensitive) holding one SMILES per row.
        In both cases, an `embeddings.csv` file will be extracted for download, with the first column as SMILES and the embedding values in the following columns.
        """
    )
    # Inputs: free-text SMILES on the left, CSV upload on the right.
    with gr.Row():
        smiles_in = gr.Textbox(label="SMILES (single mode)", placeholder="e.g. CCO")
        file_in = gr.File(label="SMILES CSV (batch mode)", file_types=[".csv"])
    generate_btn = gr.Button("Extract Embeddings")
    # Outputs: status / JSON embedding text, plus a download slot that stays
    # hidden until the handler makes it visible.
    with gr.Row():
        output_msg = gr.Textbox(label="Message / Embedding (JSON)", interactive=False, lines=4)
        download_csv = gr.File(label="Download embeddings.csv", visible=False)
    generate_btn.click(
        fn=process_inputs,
        inputs=[smiles_in, file_in],
        outputs=[output_msg, download_csv],
    )

if __name__ == "__main__":
    # Bind to all interfaces rather than only localhost.
    demo.launch(server_name="0.0.0.0")