RohanKarthikeyan committed
Commit 046bc11 · verified · 1 Parent(s): 51da8a4

Upload 9 files

Files changed (9)
  1. .gitignore +10 -0
  2. .python-version +1 -0
  3. README.md +75 -7
  4. app.py +186 -0
  5. oaklib_utils.py +55 -0
  6. openai_utils.py +69 -0
  7. pyproject.toml +11 -0
  8. ruff.toml +2 -0
  9. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,10 @@
+ # Python-generated files
+ __pycache__/
+ *.py[oc]
+ build/
+ dist/
+ wheels/
+ *.egg-info
+
+ # Virtual environments
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
+ 3.13
README.md CHANGED
@@ -1,14 +1,82 @@
  ---
  title: BioMedNorm MCP Server
- emoji: 🚀
- colorFrom: pink
- colorTo: yellow
  sdk: gradio
- sdk_version: 5.33.1
+ sdk_version: 5.33.0
  app_file: app.py
- pinned: false
+ pinned: true
  license: apache-2.0
- short_description: 'A MCP server for extracting and normalizing domain-specific '
+ python_version: 3.13.3
+ tags:
+ - mcp-server-track
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # BioMedNorm MCP Server
+
+ An MCP server for extracting and normalizing domain-specific entities from biomedical text. We leverage OpenAI LLMs to identify entities and match them to standardized terminology.
+
+ ## Installation
+
+ This project uses `uv` from Astral for dependency management. Follow these steps to set up the project:
+
+ ### Clone the repository
+
+ ```bash
+ git clone https://github.com/yourusername/entity-extraction-mcp
+ cd entity-extraction-mcp
+ ```
+
+ ### Set up Python environment
+
+ The project includes a `.python-version` file that specifies the required Python version. Make sure you have `uv` installed:
+
+ ```bash
+ # Install uv if you don't have it already
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+ ```
+
+ ### Install dependencies
+
+ The project dependencies are defined in `pyproject.toml`. Install them with:
+
+ ```bash
+ uv pip install -e .
+ ```
+
+ ### Set up environment variables
+
+ The project **requires** an OpenAI API key, which should be stored in a `.env` file.
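For reference, a minimal `.env` sketch; the key value is a placeholder, not a real credential. The file is read by `python-dotenv` when the app starts:

```bash
# .env (typically at the project root)
OPENAI_API_KEY=sk-your-key-here
```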
+
+ ## Running the application
+
+ Run the application using `uv run`:
+
+ ```bash
+ uv run app.py
+ ```
+
+ This command ensures that:
+
+ - All project dependencies are correctly installed
+ - The environment variables from `.env` are loaded
+ - The application runs in the proper environment
+
+ After starting the server, you can access:
+
+ - Web interface: `http://your-server:port`
+ - MCP endpoint: `http://your-server:port/gradio_api/mcp/sse`
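As a quick connectivity check, the SSE endpoint can be probed with curl. This is a sketch assuming a local server on Gradio's default port 7860; substitute your own host and port:

```bash
# -N disables output buffering so server-sent events stream through as they arrive
curl -N http://localhost:7860/gradio_api/mcp/sse
```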
+
+ ## Using the Web Interface
+
+ - Enter text in the input area
+ - Select the entity type (Disease, Tissue, or Cell Type)
+ - Click "Normalize"
+ - View the normalized entities in the results area
+
+ ## Using as an MCP Tool
+
+ The server exposes an MCP-compatible endpoint that can be used by AI agents. The tool accepts:
+
+ - `paragraph`: Text to extract entities from
+ - `target_entity`: Type of entity to extract ("Disease", "Tissue", or "Cell Type")
+
+ and returns a list of normalized entities.
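For programmatic access outside an MCP agent, one option is Gradio's Python client. The snippet below is a sketch: the URL is a placeholder, and the `api_name` assumes Gradio derives the endpoint name from the `extract_and_normalize` function, which is not confirmed by the source.

```python
# Hypothetical client-side call; URL and api_name are assumptions.
from gradio_client import Client

client = Client("http://localhost:7860")  # substitute your server address
result = client.predict(
    "The patient was diagnosed with diabetes and hypertension.",  # paragraph
    "Disease",                                                    # target_entity
    api_name="/extract_and_normalize",
)
print(result)
```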
app.py ADDED
@@ -0,0 +1,186 @@
+ """
+ BioMed text normalization MCP server.
+ """
+
+ # import time
+ import asyncio
+ from typing import Optional
+
+ import gradio as gr
+
+ from oaklib_utils import get_candidates
+ from openai_utils import ask_openai
+
+ NER_PROMPT = """
+ You are an expert annotator of biomedical text.
+
+ Annotate/Identify/Extract all {entity}s in this text: {text}
+
+ Instructions:
+ 1. If no such entity or entities are found, then **return exactly**: Not Found
+ 2. Extract only the entity. If only an abbreviation is present, expand it based on the
+ biomedical context in the given paragraph. For example, the full form of BA12 is Brodmann (1909) area 12.
+ 3. Do not provide any additional information or formatting.
+
+ Do not guess or hallucinate if you are uncertain. This is high-stakes, so it's better to be safe
+ than sorry. This is very important, so you'd better be sure of your answer, OK?
+ """
+
+ RAG_PROMPT = """
+ You are an expert normalizer of biomedical entities.
+
+ Given the following list of candidate standard terms: {top_k_preds},
+ find the single closest matching term for this unnormalized entity: {entity}.
+
+ Instructions:
+ 1. **IMPORTANT:** Do **NOT** guess or hallucinate. Do **NOT** provide any term that
+ is not explicitly present in the list of standardized terms.
+ 2. Do not overgeneralize unless no match is available.
+ 3. Do not provide any additional information or formatting.
+
+ This is high-stakes, so it's better to be safe than sorry. This is very important, so you'd better
+ be sure of your answer, OK?
+ """
+
+
+ async def extract_entities(paragraph: str, target_entity: str) -> Optional[list[str]]:
+     """
+     Extract entities of a specific type from a given paragraph.
+
+     Args:
+         paragraph (str): The paragraph from which entities are to be extracted.
+         target_entity (str): The type of entity to extract from the paragraph (e.g., 'disease', 'tissue').
+
+     Returns:
+         Optional[list[str]]: A list of extracted entities of the specified type, or
+             None if the model did not return a valid response.
+     """
+     prompt = NER_PROMPT.format(entity=target_entity, text=paragraph)
+     extracted_entities = await ask_openai(prompt, usage="ner")
+
+     return extracted_entities
+
+
+ async def normalize_entities(raw_terms: list[str]) -> list[Optional[str]]:
+     """
+     Normalize a list of raw terms to the most appropriate standard terms from a list
+     of candidates.
+
+     This function is designed to process the output from extract_entities().
+
+     Args:
+         raw_terms (list[str]): List of unnormalized terms, typically from extract_entities().
+
+     Returns:
+         list[Optional[str]]: List of best matching standard terms in the same order as the
+             input terms. An entry may be None if normalization failed.
+     """
+
+     # Do normalization for each entity
+     async def process_single_entity(raw_term: str) -> Optional[str]:
+         # Generate candidates specifically for this entity.
+         # get_candidates is synchronous, so run it in a worker thread.
+         candidate_std_terms = await asyncio.to_thread(get_candidates, raw_term)
+
+         # Now use these entity-specific candidates for the OpenAI call
+         prompt = RAG_PROMPT.format(entity=raw_term, top_k_preds=candidate_std_terms)
+         result = await ask_openai(prompt, usage="rag")
+         return result
+
+     # Process all entities concurrently
+     tasks = [process_single_entity(entity) for entity in raw_terms]
+     normalized_entities = await asyncio.gather(*tasks)
+
+     return normalized_entities
+
+
+ async def extract_and_normalize(
+     paragraph: str, target_entity: str
+ ) -> list[Optional[str]]:
+     """
+     Extract entities from a paragraph and normalize them in one operation.
+
+     Args:
+         paragraph: The paragraph from which to extract entities.
+         target_entity: The type of entity to extract and normalize.
+
+     Returns:
+         list[Optional[str]]: List of best matching standard terms in the same order as the
+             extracted entities. An entry may be None if normalization failed.
+     """
+     extracted_entities = await extract_entities(paragraph, target_entity)
+     if not extracted_entities:
+         return []
+
+     result = await normalize_entities(extracted_entities)
+     return result
+
+
+ # Create a visually appealing Gradio app
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# Entity Extraction & Normalization")
+     gr.Markdown(
+         "Enter text and specify the entity type to extract and normalize entities."
+     )
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             paragraph = gr.Textbox(
+                 label="Text Input",
+                 placeholder="Enter paragraph here...",
+                 lines=5,
+                 info="Enter biomedical text input for entity extraction.",
+             )
+         with gr.Column(scale=1):
+             target_entity = gr.Dropdown(
+                 ["Disease", "Tissue", "Cell Type"],
+                 label="Entity Type",
+                 value="Disease",
+                 info="Select the type of entity you want to extract and normalize from the text.",
+             )
+
+     normalize_btn = gr.Button("Normalize", variant="primary")
+
+     with gr.Row():
+         with gr.Column():
+             output = gr.JSON(label="Normalized Entities")
+
+     # Add a loading indicator
+     with gr.Row():
+         status = gr.Markdown("")
+
+     with gr.Accordion("Example Inputs", open=False):
+         gr.Examples(
+             examples=[
+                 ["The patient was diagnosed with diabetes and hypertension.", "Disease"],
+                 [
+                     "Samples of BA12 tissue, weighing approximately 50-100 mg each, were homogenized in nuclei extraction buffer.",
+                     "Tissue",
+                 ],
+                 [
+                     "Coupling scTCR-seq with scRNA-seq can reveal the relationship between clonotype and phenotype in T or B cell populations.",
+                     "Cell Type",
+                 ],
+             ],
+             inputs=[paragraph, target_entity],
+         )
+
+     # Set up the button click event
+     normalize_btn.click(
+         lambda: "Processing...",  # Show loading immediately
+         None,
+         status,
+         queue=False,
+     ).then(
+         extract_and_normalize,  # Async processing
+         [paragraph, target_entity],
+         output,
+     ).then(
+         lambda: "",  # Clear status
+         None,
+         status,
+     )
+
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
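A quick way to exercise the pipeline without the UI is to call `extract_and_normalize` directly. A minimal sketch, assuming a valid `OPENAI_API_KEY` in `.env`:

```python
# Hypothetical local smoke test; importing app builds the UI but does not launch it.
import asyncio

from app import extract_and_normalize

terms = asyncio.run(
    extract_and_normalize("The patient was diagnosed with diabetes.", "Disease")
)
print(terms)  # a list of normalized entity labels; entries may be None
```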
oaklib_utils.py ADDED
@@ -0,0 +1,55 @@
+ """
+ Retrieve top k candidate standard terms for normalization using oaklib.
+ """
+
+ # import argparse
+
+ from oaklib import get_adapter
+ from oaklib.datamodels.search import SearchConfiguration
+
+ adapter = get_adapter("ols:")
+
+
+ def get_candidates(term: str, top_k: int = 10) -> list[str]:
+     """
+     Get top k candidates for RAG.
+     """
+     # Set config for search (limit # terms returned)
+     cfg = SearchConfiguration(limit=top_k)
+
+     results = adapter.basic_search(term, config=cfg)
+     labels = list(adapter.labels(results))  # list of (id, label) tuples
+
+     # print(f"## Query: {term} -> {labels}")
+     candidates = [label for _, label in labels]
+     return candidates
+
+
+ # def main():
+ #     parser = argparse.ArgumentParser(
+ #         description="Fetch top-K candidate passages for a given term (RAG)"
+ #     )
+ #     parser.add_argument(
+ #         "term", type=str, help="The query term or prompt for which to retrieve candidates"
+ #     )
+ #     parser.add_argument(
+ #         "-k",
+ #         "--top_k",
+ #         type=int,
+ #         default=10,
+ #         help="Number of top candidates to return (default: 10)",
+ #     )
+
+ #     args = parser.parse_args()
+
+ #     # Call the function
+ #     candidates = get_candidates(args.term)
+
+ #     print(f"\nTerm: {args.term!r}")
+ #     print(f"Top {args.top_k} candidates:")
+ #     for i, cand in enumerate(candidates, start=1):
+ #         print(f"  {i:2d}. {cand}")
+
+
+ # if __name__ == "__main__":
+ #     main()
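As a sanity check, `get_candidates` can be run on its own. A sketch, assuming network access to the OLS service; the printed candidates are illustrative only:

```python
# Hypothetical usage; results depend on the live OLS endpoint.
from oaklib_utils import get_candidates

candidates = get_candidates("diabetes", top_k=5)
print(candidates)  # e.g. ['diabetes mellitus', 'type 1 diabetes mellitus', ...]
```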
openai_utils.py ADDED
@@ -0,0 +1,69 @@
+ """
+ Helper functions for structured OpenAI API calls using Pydantic models.
+ Includes NER and RAG-specific prompting logic with retry and error handling.
+ """
+
+ import os
+ from typing import Literal, Optional, Union, overload
+
+ from dotenv import load_dotenv
+ from openai import AsyncOpenAI
+ from pydantic import BaseModel, Field
+ from tenacity import retry, retry_if_result, stop_after_attempt, wait_random_exponential
+ from tqdm.auto import tqdm
+
+ load_dotenv()  # take environment variables from .env
+ api_key = os.getenv("OPENAI_API_KEY")
+ if not api_key:
+     raise EnvironmentError("Missing OPENAI_API_KEY in environment.")
+ client = AsyncOpenAI(api_key=api_key, timeout=120.0)
+
+
+ class NEROutput(BaseModel):
+     answer: list[str] = Field(..., description="List of extracted entities")
+
+
+ class RAGOutput(BaseModel):
+     answer: str = Field(..., description="Closest match to input term")
+     reason: str = Field(..., description="Why this answer was chosen as the match for the input term")
+
+
+ def is_invalid_result(result):
+     return result is None
+
+ @overload
+ async def ask_openai(user_prompt: str, usage: Literal["ner"], model: str = ...) -> Optional[list[str]]: ...
+ @overload
+ async def ask_openai(user_prompt: str, usage: Literal["rag"], model: str = ...) -> Optional[str]: ...
+
+ @retry(
+     retry=retry_if_result(is_invalid_result),
+     wait=wait_random_exponential(min=1, max=60),
+     stop=stop_after_attempt(6),
+ )
+ async def ask_openai(
+     user_prompt: str,
+     usage: Literal["ner", "rag"],
+     model: str = "o4-mini-2025-04-16",
+ ) -> Optional[Union[list[str], str]]:
+     """
+     Interact with the OpenAI Responses API and return the parsed structured answer.
+     """
+     if model in ["chatgpt-4o-latest", "o1-mini"]:
+         raise ValueError(f"Model {model} does not support structured outputs.")
+
+     response_format = NEROutput if usage == "ner" else RAGOutput
+
+     try:
+         response = await client.responses.parse(
+             model=model,
+             input=[{"role": "user", "content": user_prompt}],
+             text_format=response_format,
+             # temperature=0.05,
+         )
+         response_obj = response.output_parsed
+         return response_obj.answer if response_obj else None
+
+     except Exception as e:
+         tqdm.write(f"❌ Unexpected error. Error: {e}")
+         raise
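For illustration, a minimal async caller for `ask_openai`. This sketch performs a real API call when `OPENAI_API_KEY` is configured; the prompt text is a hypothetical stand-in for RAG_PROMPT from app.py:

```python
# Hypothetical usage sketch for the "rag" mode, which returns a single string.
import asyncio

from openai_utils import ask_openai

async def main() -> None:
    match = await ask_openai(
        "Given the following list of candidate standard terms: ['diabetes mellitus'], "
        "find the single closest matching term for this unnormalized entity: diabetes.",
        usage="rag",
    )
    print(match)

asyncio.run(main())
```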
pyproject.toml ADDED
@@ -0,0 +1,13 @@
+ [project]
+ name = "biomednorm-mcp-server"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.13"
+ dependencies = [
+     "gradio>=5.33.0",
+     "oaklib>=0.6.23",
+     "openai>=1.84.0",
+     "python-dotenv>=1.1.0",
+     "tenacity>=9.0.0",
+ ]
ruff.toml ADDED
@@ -0,0 +1,2 @@
+ indent-width = 2
+ target-version = "py313"
uv.lock ADDED
The diff for this file is too large to render. See raw diff