Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| import gradio as gr | |
| import pandas as pd | |
| import os | |
| import subprocess | |
def _build_model_input(disease_id, input_csv_path):
    """Write the per-disease table that the model script takes as input.

    Reads ``data/pretrain/disgenet_latest.csv``, keeps the rows that have a
    protein sequence, and labels every row with ``score`` 1 when its protein
    sequence is linked to the requested disease (matched through the
    disease's description) and 0 otherwise. The de-duplicated table is
    written to ``input_csv_path``.

    Args:
        disease_id: Disease identifier to look up in the ``diseaseId`` column.
        input_csv_path: Destination path of the generated CSV.

    Returns:
        True when the table was written; False when ``disease_id`` does not
        occur in the table (nothing is written in that case).
    """
    associations = pd.read_csv('data/pretrain/disgenet_latest.csv')
    associations = associations[associations['proteinSeq'].notna()]

    if disease_id not in associations['diseaseId'].values:
        return False

    disease_des = associations.loc[
        associations['diseaseId'] == disease_id, 'diseaseDes'
    ].iloc[0]
    related_proteins = associations.loc[
        associations['diseaseDes'] == disease_des, 'proteinSeq'
    ].unique()
    # Kept as a standalone Series: adding a column to the filtered frame can
    # trigger pandas' SettingWithCopyWarning.
    score = associations['proteinSeq'].isin(related_proteins).astype(int)

    model_input = pd.DataFrame({
        'diseaseId': disease_id,
        'diseaseDes': disease_des,
        'geneSymbol': associations['geneSymbol'],
        'proteinSeq': associations['proteinSeq'],
        'score': score,
    }).drop_duplicates().reset_index(drop=True)

    # The target directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(input_csv_path), exist_ok=True)
    model_input.to_csv(input_csv_path, index=False)
    return True


def predict_top_100_genes(disease_id):
    """Return the top predicted genes for a disease as a two-column table.

    The model output for each disease is stored in
    ``data/downstream/<disease_id>_top100.csv`` and reused on later calls.
    When that file is missing, the model input is generated from the
    DisGeNET table and ``model.sh`` is run with the input and output paths.

    Args:
        disease_id: Disease identifier typed by the user. Surrounding
            whitespace is ignored.

    Returns:
        A DataFrame with columns ``Gene`` and ``Score`` holding at most the
        first 100 rows of the model output.

    Raises:
        gr.Error: If the ID is empty, contains path separators, or is not
            present in the DisGeNET table. Gradio shows the message to the
            user instead of trying to render it in the output table.
        subprocess.CalledProcessError: If the model script exits with a
            non-zero status.
    """
    # IDs pasted from a web page often carry stray spaces or a newline.
    disease_id = (disease_id or '').strip()
    if not disease_id:
        raise gr.Error("Error: Please enter a Disease ID.")
    # The ID becomes part of a file name, so it must not be able to point
    # outside data/downstream.
    if os.path.basename(disease_id) != disease_id:
        raise gr.Error(
            f"Error: Disease ID '{disease_id}' is not valid. "
            "Please check the ID and try again."
        )

    input_csv_path = 'data/downstream/{}_disease.csv'.format(disease_id)
    output_csv_path = 'data/downstream/{}_top100.csv'.format(disease_id)

    # Only run the model when no stored result exists for this disease.
    if not os.path.exists(output_csv_path):
        if not _build_model_input(disease_id, input_csv_path):
            raise gr.Error(
                f"Error: Disease ID '{disease_id}' not found in the database. "
                "Please check the ID and try again."
            )
        try:
            subprocess.run(
                ['bash', 'model.sh', input_csv_path, output_csv_path],
                check=True,
            )
        except subprocess.CalledProcessError:
            # A file left behind by a failed run would otherwise be served
            # as a valid stored result on every later request.
            if os.path.exists(output_csv_path):
                os.remove(output_csv_path)
            raise

    # Read the freshly produced or previously stored model output.
    output_df = pd.read_csv(output_csv_path)
    return (
        output_df[['geneSymbol', 'Prediction_score']]
        .rename(columns={'geneSymbol': 'Gene', 'Prediction_score': 'Score'})
        .head(100)
    )
# Markdown shown under the title: what the model does and how to obtain a
# Disease ID to feed it.
_description_md = "".join([
    "This AI model predicts the top 100 genes associated with a given disease based on 16,733 genes.",
    " To get started, you need a Disease ID (UMLS CUI), which can be obtained from the DisGeNET database. ",
    "\n\n**Steps to Obtain a Disease ID from DisGeNET:**\n",
    "1. Visit the DisGeNET website: [https://www.disgenet.org/search](https://www.disgenet.org/search).\n",
    "2. Use the search bar to enter your disease of interest. For instance, if you're interested in 'Alzheimer's Disease', type 'Alzheimer's Disease' into the search bar.\n",
    "3. From the search results, identify the disease you're researching. The Disease ID (UMLS CUI) is listed alongside each disease name, e.g. C0002395.\n",
    "4. Enter the Disease ID into the input box below and submit.\n\n",
    "The DisGeNET database contains all known gene-disease associations and associated evidence. In addition, it is able to find the corresponding diseases based on a gene.\n",
    "\n**The model will take about 18 minutes to inference a new disease.**\n",
])

# Single-line text input for the disease identifier.
_disease_id_box = gr.Textbox(
    label="Disease ID",
    placeholder="Enter Disease ID Here...",
    lines=1,
)

# Table that receives the value returned by predict_top_100_genes.
_genes_table = gr.Dataframe(label="Predicted Top 100 Related Genes")

iface = gr.Interface(
    fn=predict_top_100_genes,
    inputs=_disease_id_box,
    outputs=_genes_table,
    title="Gene Disease Association Prediction",
    description=_description_md,
)

# Start serving the interface.
iface.launch(share=True)