Li committed on
Commit
078f90f
·
1 Parent(s): 2d0e045

ModernBERT

Browse files
Files changed (4) hide show
  1. .gitignore +299 -0
  2. README.md +6 -3
  3. app.py +115 -0
  4. requirements.txt +7 -0
.gitignore ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .gradio/
2
+ shap_e_model_cache/
3
+ corgi.png
4
+
5
+
6
+ # PyCharm Files
7
+ .idea/
8
+
9
+
10
+ # Byte-compiled / optimized / DLL files
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+
15
+ # C extensions
16
+ *.so
17
+
18
+ # Distribution / packaging
19
+ .Python
20
+ build/
21
+ develop-eggs/
22
+ dist/
23
+ downloads/
24
+ eggs/
25
+ .eggs/
26
+ lib/
27
+ lib64/
28
+ parts/
29
+ sdist/
30
+ var/
31
+ wheels/
32
+ pip-wheel-metadata/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ *.py,cover
60
+ .hypothesis/
61
+ .pytest_cache/
62
+
63
+ # Translations
64
+ *.mo
65
+ *.pot
66
+
67
+ # Django stuff:
68
+ *.log
69
+ local_settings.py
70
+ db.sqlite3
71
+ db.sqlite3-journal
72
+
73
+ # Flask stuff:
74
+ instance/
75
+ .webassets-cache
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ .python-version
95
+
96
+ # pipenv
97
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
99
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
100
+ # install all needed dependencies.
101
+ #Pipfile.lock
102
+
103
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
104
+ __pypackages__/
105
+
106
+ # Celery stuff
107
+ celerybeat-schedule
108
+ celerybeat.pid
109
+
110
+ # SageMath parsed files
111
+ *.sage.py
112
+
113
+ # Environments
114
+ .env
115
+ .venv
116
+ env/
117
+ venv/
118
+ ENV/
119
+ env.bak/
120
+ venv.bak/
121
+
122
+ # Spyder project settings
123
+ .spyderproject
124
+ .spyproject
125
+
126
+ # Rope project settings
127
+ .ropeproject
128
+
129
+ # mkdocs documentation
130
+ /site
131
+
132
+ # mypy
133
+ .mypy_cache/
134
+ .dmypy.json
135
+ dmypy.json
136
+
137
+ # Pyre type checker
138
+ .pyre/
139
+
140
+
141
+ # Byte-compiled / optimized / DLL files
142
+ __pycache__/
143
+ *.py[cod]
144
+ *$py.class
145
+
146
+ # C extensions
147
+ *.so
148
+
149
+ # Distribution / packaging
150
+ .Python
151
+ build/
152
+ develop-eggs/
153
+ dist/
154
+ downloads/
155
+ eggs/
156
+ .eggs/
157
+ lib/
158
+ lib64/
159
+ parts/
160
+ sdist/
161
+ var/
162
+ wheels/
163
+ share/python-wheels/
164
+ *.egg-info/
165
+ .installed.cfg
166
+ *.egg
167
+ MANIFEST
168
+
169
+ # PyInstaller
170
+ # Usually these files are written by a python script from a template
171
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
172
+ *.manifest
173
+ *.spec
174
+
175
+ # Installer logs
176
+ pip-log.txt
177
+ pip-delete-this-directory.txt
178
+
179
+ # Unit test / coverage reports
180
+ htmlcov/
181
+ .tox/
182
+ .nox/
183
+ .coverage
184
+ .coverage.*
185
+ .cache
186
+ nosetests.xml
187
+ coverage.xml
188
+ *.cover
189
+ *.py,cover
190
+ .hypothesis/
191
+ .pytest_cache/
192
+ cover/
193
+
194
+ # Translations
195
+ *.mo
196
+ *.pot
197
+
198
+ # Django stuff:
199
+ *.log
200
+ local_settings.py
201
+ db.sqlite3
202
+ db.sqlite3-journal
203
+
204
+ # Flask stuff:
205
+ instance/
206
+ .webassets-cache
207
+
208
+ # Scrapy stuff:
209
+ .scrapy
210
+
211
+ # Sphinx documentation
212
+ docs/_build/
213
+
214
+ # PyBuilder
215
+ .pybuilder/
216
+ target/
217
+
218
+ # Jupyter Notebook
219
+ .ipynb_checkpoints
220
+
221
+ # IPython
222
+ profile_default/
223
+ ipython_config.py
224
+
225
+ # pyenv
226
+ # For a library or package, you might want to ignore these files since the code is
227
+ # intended to run in multiple environments; otherwise, check them in:
228
+ # .python-version
229
+
230
+ # pipenv
231
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
232
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
233
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
234
+ # install all needed dependencies.
235
+ #Pipfile.lock
236
+
237
+ # poetry
238
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
239
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
240
+ # commonly ignored for libraries.
241
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
242
+ #poetry.lock
243
+
244
+ # pdm
245
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
246
+ #pdm.lock
247
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
248
+ # in version control.
249
+ # https://pdm.fming.dev/#use-with-ide
250
+ .pdm.toml
251
+
252
+ __pypackages__/
253
+
254
+ # Celery stuff
255
+ celerybeat-schedule
256
+ celerybeat.pid
257
+
258
+ # SageMath parsed files
259
+ *.sage.py
260
+
261
+ # Environments
262
+ .env
263
+ .venv
264
+ env/
265
+ venv/
266
+ ENV/
267
+ env.bak/
268
+ venv.bak/
269
+
270
+ # Spyder project settings
271
+ .spyderproject
272
+ .spyproject
273
+
274
+ # Rope project settings
275
+ .ropeproject
276
+
277
+ # mkdocs documentation
278
+ /site
279
+
280
+ # mypy
281
+ .mypy_cache/
282
+ .dmypy.json
283
+ dmypy.json
284
+
285
+ # Pyre type checker
286
+ .pyre/
287
+
288
+ # pytype static type analyzer
289
+ .pytype/
290
+
291
+ # Cython debug symbols
292
+ cython_debug/
293
+
294
+ # PyCharm
295
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
296
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
297
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
298
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
299
+ #.idea/
README.md CHANGED
@@ -1,12 +1,15 @@
1
  ---
2
- title: MCP-Server TextSimilarity ModernBERT
3
  emoji: 🌍
4
- colorFrom: yellow
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 5.44.1
8
  app_file: app.py
9
  pinned: false
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: MCP-Server ModernBERT TextSimilarity
3
  emoji: 🌍
4
+ colorFrom: purple
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.33.1
8
  app_file: app.py
9
  pinned: false
10
+ tags:
11
+ - tool
12
+ - SL-Sprout
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #########################################################################
2
+ # Copyright (C) #
3
+ # 2025-August Sen Li ([email protected]) #
4
+ # Permission given to modify the code only for Non-Profit Research #
5
+ # as long as you keep this declaration at the top #
6
+ #########################################################################
7
+ import os
8
+
9
+ import gradio as gr
10
+ import huggingface_hub
11
+ import sentence_transformers
12
+
13
+ from transformers import AutoTokenizer, AutoModel
14
+ import torch
15
+
16
+ # ----------------------------------------------------------------------------------------------------------------------
17
def func_ClearInputs():
    """Return a tuple of three empty strings.

    Wired to the "Clear" button, whose three outputs are the two input
    text panels and the result box, so each one is reset to "".
    """
    return ("",) * 3
18
+
19
+
20
# Hugging Face model id of the encoder used for all embeddings in this app.
# Ids tried previously and left here for reference only:
#   "medicalai/ClinicalBERT", "TsinghuaC3I/Llama-3-8B-UltraMedical"
str_ModelID_ModernBERT = "answerdotai/ModernBERT-large"

# Direct transformers loading, currently disabled:
#   tokenizer = AutoTokenizer.from_pretrained(str_ModelID_ModernBERT)
#   model_ClinicalBERT = AutoModel.from_pretrained(str_ModelID_ModernBERT)

# Wrap ModernBERT inside SentenceTransformers: a Transformer module that
# produces token embeddings, followed by a Pooling module sized to the
# encoder's embedding dimension (library-default pooling settings).
word_embedding_model = sentence_transformers.models.Transformer(
    str_ModelID_ModernBERT
)
pooling_model = sentence_transformers.models.Pooling(
    word_embedding_model.get_word_embedding_dimension()
)

# No explicit device is passed (a device="cuda" argument was left disabled),
# so device selection is left to the library.
sentenceModel_ModernBERT = sentence_transformers.SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
)
33
+
34
+
35
def get_SentenceEmbeddings_ModernBERT(sentence):
    """Embed ``sentence`` with the module-level ModernBERT SentenceTransformer.

    The argument is passed unchanged to ``sentenceModel_ModernBERT.encode``
    and the result of that call is returned as-is.

    NOTE(review): ``encode`` presumably returns a NumPy array by default;
    confirm against the installed sentence-transformers version.
    """
    return sentenceModel_ModernBERT.encode(sentence)
40
+
41
+
42
def get_sentence_embedding(sentence: str, tokenizer=None, model=None) -> torch.Tensor:
    """Return a mean-pooled sentence embedding computed from raw hidden states.

    Args:
        sentence: Text to embed.
        tokenizer: Optional callable that turns ``sentence`` into a mapping of
            PyTorch tensors containing at least ``attention_mask``. Defaults
            to the tokenizer held by ``word_embedding_model``.
        model: Optional callable accepting the tokenizer output as keyword
            arguments and returning an object with ``last_hidden_state``.
            Defaults to the Hugging Face encoder held by
            ``word_embedding_model``.

    Returns:
        The attention-mask-weighted mean of the token embeddings, with
        singleton dimensions removed by ``squeeze()``.
    """
    # Use the underlying Hugging Face tokenizer/encoder rather than the
    # SentenceTransformer wrapper: the wrapper's forward() expects a feature
    # dict (not keyword arguments) and does not expose ``last_hidden_state``.
    if tokenizer is None:
        tokenizer = word_embedding_model.tokenizer
    if model is None:
        model = word_embedding_model.auto_model

    # Tokenize and encode
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

    # Keep the inputs on the same device as the encoder (when it reports one)
    # so the forward pass does not fail on a CPU/GPU mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Get hidden states without tracking gradients (inference only)
    with torch.no_grad():
        outputs = model(**inputs)

    # outputs.last_hidden_state shape: (batch_size, seq_len, hidden_dim)
    token_embeddings = outputs.last_hidden_state

    # Mean pooling across tokens, ignoring padding positions
    attention_mask = inputs["attention_mask"]
    mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask_expanded, dim=1)
    # Clamp the token count so an all-padding row cannot divide by zero
    counts = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)

    return (summed / counts).squeeze()
60
+
61
def func_sBERT_SimilarityResult(str_Text_1, str_Text_2):
    """Compute the cosine similarity between two texts' ModernBERT sentence embeddings.

    Args:
        str_Text_1: First text to compare.
        str_Text_2: Second text to compare.

    Returns:
        A message string: the similarity score formatted to four decimal
        places, or a validation message when either input is missing or
        contains only whitespace.
    """
    # Treat None like an empty string so a missing argument yields the
    # validation message instead of an AttributeError on .strip().
    if not (str_Text_1 or "").strip() or not (str_Text_2 or "").strip():
        return "Both text inputs must be non-empty."

    # 01. Get sentence embeddings from the locally loaded model
    arrEmbedding_Text_1 = get_SentenceEmbeddings_ModernBERT(str_Text_1)
    arrEmbedding_Text_2 = get_SentenceEmbeddings_ModernBERT(str_Text_2)

    # 02. Compute cosine similarity; .item() extracts the single score
    tensor_Similarity = sentence_transformers.util.pytorch_cos_sim(arrEmbedding_Text_1, arrEmbedding_Text_2)
    f_Similarity = tensor_Similarity.item()

    return f"Clinical Similarity Score: {f_Similarity:.4f}"
84
+
85
+ # ----------------------------------------------------------------------------------------------------------------------
86
+ # Launch the interface and MCP server
87
if __name__ == "__main__":
    # Startup diagnostics: print the working directory, then list it, the
    # filesystem root and /home/ through the shell.
    str_WorkDir = os.getcwd()
    print(f"os.getcwd() = {str_WorkDir}")
    for str_ShellCommand in (
        f"echo ls -al {str_WorkDir} && ls -al {str_WorkDir}",
        "echo ls -al /: && ls -al /",
        "echo ls -al /home/: && ls -al /home/",
    ):
        os.system(str_ShellCommand)

    # Gradio UI: two input panels side by side, then a row holding the
    # Clear/Submit buttons next to the read-only result box.
    with gr.Blocks() as grBlocks_SentenceSimilarity__MCP_Server:
        gr.Markdown("# ModernBERT for Clinical Text Similarity using HF Inference Server, MaxSeqLength==8192")
        gr.Markdown("This application calculates Cosine Similarity Score between two Texts' ModernBERT Sentence-Embeddings")

        with gr.Row():
            grTextBox_Input_1 = gr.Textbox(label="Text Panel 1", lines=20)
            grTextBox_Input_2 = gr.Textbox(label="Text Panel 2", lines=20)

        with gr.Row():
            with gr.Column(scale=1):
                grButton_Clear = gr.Button("Clear")
                grButton_Submit = gr.Button("Submit")
            with gr.Column(scale=3):
                grTextbox_Output = gr.Textbox(
                    label="Similarity Result",
                    interactive=False,
                )

        # Submit computes the similarity of the two panels; Clear resets the
        # two panels and the result box.
        grButton_Submit.click(
            fn=func_sBERT_SimilarityResult,
            inputs=[grTextBox_Input_1, grTextBox_Input_2],
            outputs=grTextbox_Output,
        )
        grButton_Clear.click(
            fn=func_ClearInputs,
            inputs=[],
            outputs=[grTextBox_Input_1, grTextBox_Input_2, grTextbox_Output],
        )

    # Launch the Gradio app with the MCP server enabled
    grBlocks_SentenceSimilarity__MCP_Server.launch(mcp_server=True, share=True)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ huggingface_hub
2
+ torch
3
+ transformers
4
+ sentence_transformers
5
+ smolagents[mcp]
6
+ gradio[mcp]
7
+ mcp