Spaces:

AllIllusion
/

MCP-Server_TextSimilarity_ModernBERT

Running

App Files Files Community

Li committed 9 days ago

Commit

078f90f

1 Parent(s): 2d0e045

ModernBERT

Browse files

Files changed (4) hide show

.gitignore +299 -0
README.md +6 -3
app.py +115 -0
requirements.txt +7 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,299 @@

+.gradio/
+shap_e_model_cache/
+corgi.png
+# PyCharm Files
+.idea/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

README.md CHANGED Viewed

@@ -1,12 +1,15 @@
 ---
-title: MCP-Server TextSimilarity ModernBERT
 emoji: 🌍
-colorFrom: yellow
 colorTo: green
 sdk: gradio
-sdk_version: 5.44.1
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: MCP-Server ModernBERT TextSimilarity
 emoji: 🌍
+colorFrom: purple
 colorTo: green
 sdk: gradio
+sdk_version: 5.33.1
 app_file: app.py
 pinned: false
+tags:
+- tool
+- SL-Sprout
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,115 @@

+#########################################################################
+# Copyright (C)                                                       	#
+# 2025-August Sen Li ([email protected])		     			    #
+# Permission given to modify the code only for Non-Profit Research		#
+# as long as you keep this declaration at the top 						#
+#########################################################################
+import os
+import gradio as gr
+import huggingface_hub
+import sentence_transformers
+from transformers import AutoTokenizer, AutoModel
+import torch
+# ----------------------------------------------------------------------------------------------------------------------
+def func_ClearInputs():
+    """Return three empty strings, one per Gradio component reset by the Clear button."""
+    cleared_values = ("",) * 3
+    return cleared_values
+# Model setup (runs at import time): builds the SentenceTransformer used by every
+# embedding call in this file.
+# Earlier experiments with other checkpoints are kept below as commented-out code.
+# str_ModelID_ClinicalBERT = "medicalai/ClinicalBERT"
+str_ModelID_ModernBERT = "answerdotai/ModernBERT-large"
+# str_ModelID_ClinicalBERT = "TsinghuaC3I/Llama-3-8B-UltraMedical"
+# tokenizer = AutoTokenizer.from_pretrained(str_ModelID_ModernBERT)
+# model_ClinicalBERT = AutoModel.from_pretrained(str_ModelID_ModernBERT)
+# Wrap ModernBERT inside SentenceTransformers: a Transformer module followed by a Pooling module.
+word_embedding_model = sentence_transformers.models.Transformer(str_ModelID_ModernBERT)
+# Pooling receives only the embedding dimension, so the library's default pooling mode applies
+# (presumably mean pooling -- TODO confirm against the installed sentence_transformers version).
+pooling_model = sentence_transformers.models.Pooling(word_embedding_model.get_word_embedding_dimension())
+sentenceModel_ModernBERT = sentence_transformers.SentenceTransformer(modules=[word_embedding_model, pooling_model])
+# NOTE: no explicit device is passed above; a `device="cuda"` argument was left commented out.
+def get_SentenceEmbeddings_ModernBERT(sentence):
+    """Encode `sentence` with the module-level ModernBERT SentenceTransformer.
+
+    The argument is forwarded unchanged to `sentenceModel_ModernBERT.encode`,
+    and whatever that call produces is returned as-is.
+    """
+    return sentenceModel_ModernBERT.encode(sentence)
+def get_sentence_embedding(sentence: str) -> torch.Tensor:
+    """Return a mean-pooled sentence embedding computed directly from the encoder.
+
+    This is a manual alternative to `get_SentenceEmbeddings_ModernBERT`: it
+    tokenizes the text, runs the underlying Hugging Face encoder and averages
+    the token embeddings while ignoring padding positions.
+
+    Args:
+        sentence: Text to embed.
+
+    Returns:
+        The pooled embedding with singleton dimensions squeezed out.
+    """
+    # The module-level `tokenizer` is commented out, so use the tokenizer and
+    # encoder owned by the Transformer module that wraps ModernBERT. The
+    # SentenceTransformer wrapper itself does not accept `**inputs` nor expose
+    # `last_hidden_state`; the wrapped Hugging Face model does.
+    hf_tokenizer = word_embedding_model.tokenizer
+    hf_encoder = word_embedding_model.auto_model
+    # Tokenize and encode
+    inputs = hf_tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
+    # Keep the inputs on the same device as the encoder weights.
+    inputs = inputs.to(hf_encoder.device)
+    # Get hidden states without tracking gradients (inference only)
+    with torch.no_grad():
+        outputs = hf_encoder(**inputs)
+    # outputs.last_hidden_state shape: (batch_size, seq_len, hidden_dim)
+    token_embeddings = outputs.last_hidden_state
+    # Mean pooling across tokens, ignoring padding via the attention mask
+    attention_mask = inputs["attention_mask"]
+    mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    summed_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
+    # Clamp so an all-zero mask cannot cause a division by zero
+    token_counts = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
+    sentence_embedding = summed_embeddings / token_counts
+    return sentence_embedding.squeeze()
+def func_sBERT_SimilarityResult(str_Text_1, str_Text_2):
+    """Compute the cosine similarity between the ModernBERT sentence embeddings of two texts.
+
+    Args:
+        str_Text_1: First text to compare.
+        str_Text_2: Second text to compare.
+
+    Returns:
+        A message containing the similarity score formatted to four decimal
+        places, or an explanatory message when either text is empty.
+    """
+    # Treat a missing value (None) like an empty string instead of crashing on .strip().
+    if not (str_Text_1 or "").strip() or not (str_Text_2 or "").strip():
+        return "Both text inputs must be non-empty."
+    # 01. Get sentence embeddings from the locally loaded ModernBERT model.
+    #     (No remote inference client is needed: the model runs in-process.)
+    arrEmbedding_Text_1 = get_SentenceEmbeddings_ModernBERT(str_Text_1)
+    arrEmbedding_Text_2 = get_SentenceEmbeddings_ModernBERT(str_Text_2)
+    # 02. Compute cosine similarity
+    tensor_Similarity = sentence_transformers.util.pytorch_cos_sim(arrEmbedding_Text_1, arrEmbedding_Text_2)
+    f_Similarity = tensor_Similarity.item()
+    return f"Clinical Similarity Score: {f_Similarity:.4f}"
+# ----------------------------------------------------------------------------------------------------------------------
+# Launch the interface and MCP server
+if __name__ == "__main__":
+    import subprocess
+
+    # 01. Startup diagnostics: print the working directory and list a few directories.
+    str_Cwd = os.getcwd()
+    print(f"os.getcwd() = {str_Cwd}", flush=True)
+    # 02. Run `ls` with an argument list (no shell), so a path containing spaces or
+    #     shell metacharacters cannot break or alter the command.
+    for str_Label, str_Path in (
+        (f"ls -al {str_Cwd}", str_Cwd),
+        ("ls -al /:", "/"),
+        ("ls -al /home/:", "/home/"),
+    ):
+        # Flush so the label appears before the child process writes its listing.
+        print(str_Label, flush=True)
+        # Best-effort diagnostics: a failing `ls` must not stop the app from starting.
+        subprocess.run(["ls", "-al", str_Path], check=False)
+    # 03. Gradio UI elements
+    with gr.Blocks() as grBlocks_SentenceSimilarity__MCP_Server:
+        gr.Markdown("# ModernBERT for Clinical Text Similarity using HF Inference Server, MaxSeqLength==8192")
+        gr.Markdown("This application calculates Cosine Similarity Score between two Texts' ModernBERT Sentence-Embeddings")
+        with gr.Row():
+            grTextBox_Input_1 = gr.Textbox(label="Text Panel 1", lines=20)
+            grTextBox_Input_2 = gr.Textbox(label="Text Panel 2", lines=20)
+        with gr.Row():
+            with gr.Column(scale=1):
+                grButton_Clear  = gr.Button("Clear")
+                grButton_Submit = gr.Button("Submit")
+            with gr.Column(scale=3):
+                grTextbox_Output = gr.Textbox(label="Similarity Result", interactive=False)
+        # Set button functionality
+        grButton_Submit.click(fn=func_sBERT_SimilarityResult, inputs=[grTextBox_Input_1, grTextBox_Input_2], outputs=grTextbox_Output)
+        grButton_Clear.click(fn=func_ClearInputs, inputs=[], outputs=[grTextBox_Input_1, grTextBox_Input_2, grTextbox_Output])
+    # 04. Launch Gradio MCP server
+    # NOTE(review): share=True asks Gradio for a public share link -- confirm this is
+    # wanted for the deployment target.
+    grBlocks_SentenceSimilarity__MCP_Server.launch(mcp_server=True, share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+huggingface_hub
+torch
+transformers
+sentence_transformers
+smolagents[mcp]
+gradio[mcp]
+mcp