|
""" |
|
BioMed text normalization MCP server. |
|
""" |
|
|
|
import asyncio |
|
from typing import Optional |
|
|
|
import gradio as gr |
|
|
|
from oaklib_utils import get_candidates |
|
from openai_utils import ask_openai |
|
|
|
# Prompt template for the entity-extraction step. It is filled with str.format():
#   {entity} - the type of entity to look for
#   {text}   - the passage to annotate
# The literal "Not Found" sentinel requested here is what extract_and_normalize()
# compares against to detect an empty result.
NER_PROMPT = """
You are an expert annotator of biomedical text.

Annotate/Extract all {entity}(s) in this text: {text}

Instructions:
1. If no such entity or entities are found, then **return exactly**: Not Found
2. Extract only the entity. If only an abbreviation is present, expand it based on the
biomedical context in the given paragraph. For e.g., BA12 full form is Brodmann (1909) area 12.
3. Do not provide any additional information or formatting.

Do not guess or hallucinate if you are uncertain. This has high-stakes, so it's better to be safe
than sorry. This is very important, so you'd better be sure of your answer, OK?
"""
|
|
|
# Prompt template for the normalization step. It is filled with str.format():
#   {top_k_preds} - the list of candidate standard terms for one raw entity
#   {entity}      - the raw (unnormalized) entity to map onto one of those terms
RAG_PROMPT = """
You are an expert normalizer of biomedical entities.

Given the following list of candidate standard terms: {top_k_preds},
find the single closest matching term for this unnormalized entity: {entity}.

Instructions:
1. **IMPORTANT:** Do **NOT** guess or hallucinate. Do **NOT** provide any term that
is not explicitly present in the list of standardized terms.
2. Do not overgeneralize unless no match is available.
3. Do not provide any additional information or formatting.

This has high-stakes, so it's better to be safe than sorry. This is very important, so you'd better
be sure of your answer, OK?
"""
|
|
|
|
|
async def extract_entities(paragraph: str, target_entity: str) -> Optional[list[str]]:
    """
    Pull every mention of one entity type out of a paragraph.

    The paragraph and entity type are substituted into NER_PROMPT, and the
    resulting prompt is sent to ask_openai() with usage="ner". Whatever that
    call returns is handed back unchanged.

    Args:
        paragraph (str): The text to search for entities.
        target_entity (str): The kind of entity to look for (e.g., 'disease', 'tissue').

    Returns:
        Optional[list[str]]: The entities of the requested type reported by the
            model, or None if the model did not return a valid response.
    """
    ner_request = NER_PROMPT.format(text=paragraph, entity=target_entity)
    return await ask_openai(ner_request, usage="ner")
|
|
|
|
|
async def normalize_entities(
    raw_terms: list[str],
) -> list[dict[Optional[str], Optional[str]]]:
    """
    Normalize a list of raw terms to the most appropriate standard terms from a list
    of candidates.

    This function is designed to process the output from extract_entities().
    Terms are processed concurrently. For each one, candidates are fetched with
    get_candidates(), the model is asked (via RAG_PROMPT) to pick one of them, and
    the pick is mapped back to the URI of the matching candidate.

    Args:
        raw_terms (list[str]): List of unnormalized terms, typically from extract_entities().

    Returns:
        list[dict[Optional[str], Optional[str]]]: One single-entry dictionary per input
            term, in input order, mapping the chosen standard term (key) to its URI (value).
            - {None: None} if there are no candidates for the term or the model gives
              no answer (normalization failed).
            - {answer: None} if the model's answer matches none of the candidates.
    """
    if not raw_terms:
        return []

    def _canonical(text: str) -> str:
        # Comparison key that ignores case, surrounding quotes and whitespace
        # differences, which are common in free-text model output.
        return " ".join(text.strip().strip("\"'`").split()).casefold()

    async def process_single_entity(raw_term: str) -> dict[Optional[str], Optional[str]]:
        # Run the synchronous candidate lookup in a worker thread so that the
        # other terms' tasks can make progress on the event loop meanwhile.
        candidates = await asyncio.to_thread(get_candidates, raw_term)
        if not candidates:
            # With nothing to choose from, any answer would be invented by the
            # model; report a failed normalization instead of asking.
            return {None: None}

        # Candidates are (URI, term) pairs; only the terms are shown to the model.
        candidate_std_terms = [term for _, term in candidates]
        prompt = RAG_PROMPT.format(entity=raw_term, top_k_preds=candidate_std_terms)
        result = await ask_openai(prompt, usage="rag")
        if result is None:
            return {None: None}

        # Prefer an exact match (first hit wins, as before).
        for uri, term in candidates:
            if term == result:
                return {result: uri}

        # Otherwise tolerate cosmetic differences and return the candidate's
        # own spelling so the key is a real standard term.
        if isinstance(result, str):
            wanted = _canonical(result)
            for uri, term in candidates:
                if isinstance(term, str) and _canonical(term) == wanted:
                    return {term: uri}

        # The model answered with something outside the candidate list.
        return {result: None}

    tasks = [process_single_entity(entity) for entity in raw_terms]
    # gather() returns the results in the same order as the submitted tasks.
    normalized_entities = await asyncio.gather(*tasks)

    return normalized_entities
|
|
|
|
|
async def extract_and_normalize(
    paragraph: str, target_entity: str
) -> list[dict[Optional[str], Optional[str]]]:
    """
    Extract entities from a paragraph and normalize them in one operation.

    Args:
        paragraph: The paragraph from which to extract entities.
        target_entity: The type of entity to extract and normalize.

    Returns:
        list[dict[Optional[str], Optional[str]]]: A list of dictionaries, where each dictionary contains
            the best matching normalized term (key) and its corresponding URI (value). If normalization fails
            for a term, the dictionary will have a None value for both the key and URI. The list is empty
            when the paragraph or entity type is blank, or when no entities are found.
    """
    # A blank paragraph or entity type cannot yield entities; return early
    # rather than spending a model call on an empty prompt.
    if not paragraph or not paragraph.strip():
        return []
    target_entity = (target_entity or "").strip().lower()
    if not target_entity:
        return []

    extracted_entities = await extract_entities(paragraph, target_entity)
    # None means no valid model response; ["Not Found"] is the sentinel that
    # the extraction prompt asks the model to return when nothing matches.
    if not extracted_entities or extracted_entities == ["Not Found"]:
        return []

    return await normalize_entities(extracted_entities)
|
|
|
|
|
def toggle_custom_box(selected: str):
    """
    Show the custom-entity controls only while "Custom" is the selected entity type.

    Args:
        selected (str): The currently selected entity type.

    Returns:
        tuple: A gr.Textbox and a gr.Markdown carrying the accuracy warning. Both are
            visible only when `selected` is "Custom", and the textbox is interactive
            under the same condition.
    """
    is_custom = selected == "Custom"
    warning_text = (
        "**Warning:** This tool is optimized and tested for Disease, Tissue, and Cell Type entities. "
        "While you can input custom entities, results may vary in accuracy and reliability."
    )
    custom_box = gr.Textbox(visible=is_custom, interactive=is_custom)
    warning_note = gr.Markdown(warning_text, visible=is_custom)
    return custom_box, warning_note
|
|
|
|
|
|
|
# Theme passed to gr.Blocks below: Gradio's Soft theme with teal/green hues.
app_theme = gr.themes.Soft(primary_hue="teal", secondary_hue="green")
|
|
|
async def _normalize_selected_entity(
    paragraph: str, target_entity: str, custom_entity: str = ""
) -> list[dict[Optional[str], Optional[str]]]:
    """
    Extract entities of the chosen type from a paragraph and normalize them.

    Args:
        paragraph: The paragraph from which to extract entities.
        target_entity: The type of entity to extract and normalize. Pass "Custom" to
            take the entity type from `custom_entity` instead.
        custom_entity: The entity type to use when `target_entity` is "Custom".

    Returns:
        A list of dictionaries, where each dictionary contains the best matching
        normalized term (key) and its corresponding URI (value).
    """
    # "Custom" is only a marker in the dropdown; the real entity type is
    # whatever the user typed into the custom-entity box.
    if target_entity == "Custom":
        target_entity = (custom_entity or "").strip()
        if not target_entity:
            raise gr.Error("Please enter a custom entity type, or pick one from the list.")
    return await extract_and_normalize(paragraph, target_entity)


with gr.Blocks(theme=app_theme) as demo:
    # Introductory text at the top of the page.
    gr.Markdown("""
    # 🧬 BioMedNorm: Entity Extraction & Normalization

    Welcome to the BioMedNorm MCP Server.

    This server is designed to be used by LLMs to extract and standardize biological
    entities (like disease, tissue) from biomedical text.

    Enter the text below, specify the entity type to extract and normalize entities, and voila!
    """)

    # Two-column layout: inputs on the left, JSON result on the right.
    with gr.Row():
        with gr.Column():
            paragraph = gr.Textbox(
                label="Text Input",
                placeholder="Enter paragraph here...",
                lines=8,
                info="Enter the biomedical text for entity extraction.",
            )
            target_entity = gr.Dropdown(
                ["Disease", "Tissue", "Cell Type", "Custom"],
                label="Entity Type",
                value="Disease",
                info="Select the type of entity you want to extract and normalize from the text.",
            )
            # Hidden until "Custom" is selected (see toggle_custom_box).
            custom_entity = gr.Textbox(
                label="Custom Entity",
                placeholder="Enter custom entity type here",
                visible=False,
                interactive=True,
                info="Enter your custom entity type if 'Custom' is selected.",
            )
            warning = gr.Markdown(
                visible=False
            )
            normalize_btn = gr.Button("Normalize", variant="primary")

        with gr.Column():
            output = gr.JSON(label="Normalized Entities")

    # Reveal or hide the custom-entity box and its warning when the dropdown
    # changes. This is UI plumbing only, so it is kept out of the API.
    target_entity.change(
        fn=toggle_custom_box, inputs=target_entity, outputs=[custom_entity, warning],
        api_name=False
    )

    # Transient status line ("Processing..." while a request is running).
    with gr.Row():
        status = gr.Markdown("")

    with gr.Accordion("Example Inputs", open=False):
        gr.Examples(
            examples=[
                ["The patient was diagnosed with diabetes and hypertension.", "Disease"],
                [
                    "Samples of BA12 tissue, weighing approximately 50-100 mg each, were homogenized in nuclei extraction buffer.",
                    "Tissue",
                ],
                [
                    "Coupling scTCR-seq with scRNA-seq can reveal the relationship between clonotype and phenotype in T or B cell populations.",
                    "Cell Type",
                ],
            ],
            inputs=[paragraph, target_entity],
        )

    # Click pipeline: show status -> run extraction + normalization -> clear status.
    # Only the middle step is an API endpoint. It keeps the name
    # "extract_and_normalize" and now also receives the custom-entity text,
    # which was previously never passed to the handler.
    normalize_btn.click(
        lambda: "Processing...",
        None,
        status,
        queue=False,
        api_name=False,
    ).then(
        _normalize_selected_entity,
        [paragraph, target_entity, custom_entity],
        output,
        api_name="extract_and_normalize",
    ).then(
        lambda: "",
        None,
        status,
        api_name=False,
    )
|
|
|
|
|
# Start the Gradio app only when this file is run as a script.
# mcp_server=True asks Gradio to serve the app as an MCP server as well.
if __name__ == "__main__":
    demo.launch(mcp_server=True)
|
|