Spaces:

fxmarty
/

bettertransformer-demo

Runtime error

App Files Files Community

Felix Marty committed on Nov 22, 2022

Commit

7d58e23

1 Parent(s): 4843fe3

hopefully stable

Browse files

Files changed (3) hide show

app.py +79 -69
backend.py +15 -19
defaults.py +21 -21

app.py CHANGED Viewed

@@ -1,55 +1,66 @@
-import gradio as gr
 import json
-from backend import get_message_single, get_message_spam, send_single, send_spam, tokenizer
-from defaults import (
-    ADDRESS_BETTERTRANSFORMER,
-    ADDRESS_VANILLA,
-    defaults_bt_single,
-    defaults_bt_spam,
-    defaults_vanilla_single,
-    defaults_vanilla_spam,
-)
 import datasets
 import torch
-def dispatch_single(input_model_single, address_input_vanilla, address_input_bettertransformer):
     result_vanilla = send_single(input_model_single, address_input_vanilla)
-    result_bettertransformer = send_single(input_model_single, address_input_bettertransformer)
     return result_vanilla, result_bettertransformer
-def dispatch_spam_artif(input_n_spam_artif, sequence_length, padding_ratio, address_input_vanilla, address_input_bettertransformer):
     sequence_length = int(sequence_length)
     input_n_spam_artif = int(input_n_spam_artif)
     inp_tokens = torch.randint(tokenizer.vocab_size - 1, (sequence_length,)) + 1
     n_pads = max(int(padding_ratio * len(inp_tokens)), 1)
-    inp_tokens[- n_pads:] = 0
     inp_tokens[0] = 101
-    inp_tokens[- n_pads - 1] = 102
     attention_mask = torch.zeros((sequence_length,), dtype=torch.int64)
-    attention_mask[:- n_pads] = 1
-    str_input = json.dumps({
-        "input_ids": inp_tokens.cpu().tolist(),
-        "attention_mask": attention_mask.cpu().tolist(),
-        "pre_tokenized": True,
-    })
     input_dataset = datasets.Dataset.from_dict(
         {"sentence": [str_input for _ in range(input_n_spam_artif)]}
     )
     result_vanilla = send_spam(input_dataset, address_input_vanilla)
     result_bettertransformer = send_spam(input_dataset, address_input_bettertransformer)
     return result_vanilla, result_bettertransformer
 TTILE_IMAGE = """
 <div
     style="
@@ -63,34 +74,17 @@ TTILE_IMAGE = """
 </div>
 """
-TITLE = """
-<div
-    style="
-        display: inline-flex;
-        align-items: center;
-        text-align: center;
-        max-width: 1400px;
-        gap: 0.8rem;
-        font-size: 2.2rem;
-    "
->
-<h1 style="font-weight: 500; margin-bottom: 10px; margin-top: 10px;">
-    Speed up your inference and support more workload with PyTorch's BetterTransformer 🤗
-</h1>
-</div>
-"""
 with gr.Blocks() as demo:
     gr.HTML(TTILE_IMAGE)
-    gr.HTML(TITLE)
     gr.Markdown(
         """
-    Let's try out TorchServe + BetterTransformer!
-    BetterTransformer is a stable feature made available with [PyTorch 1.13](https://pytorch.org/blog/PyTorch-1.13-release/) allowing to use a fastpath execution for encoder attention blocks.
-    As a one-liner, you can convert your 🤗 Transformers models to use BetterTransformer thanks to the [🤗 Optimum](https://huggingface.co/docs/optimum/main/en/index) library:
     ```
     from optimum.bettertransformer import BetterTransformer
@@ -98,18 +92,13 @@ with gr.Blocks() as demo:
     better_model = BetterTransformer.transform(model)
     ```
-    This Space is a demo of an **end-to-end** deployement of PyTorch eager-mode models, both with and without BetterTransformer. The goal is to see what are the benefits server-side and client-side of using BetterTransformer.
-    ## Inference using...
-    """
     )
-    with gr.Row():
-        with gr.Column(scale=50):
-            gr.Markdown("### Vanilla Transformers + TorchServe")
-        with gr.Column(scale=50):
-            gr.Markdown("### BetterTransformer + TorchServe")
     address_input_vanilla = gr.Textbox(
         max_lines=1, label="ip vanilla", value=ADDRESS_VANILLA, visible=False
     )
@@ -124,30 +113,44 @@ with gr.Blocks() as demo:
     input_model_single = gr.Textbox(
         max_lines=1,
         label="Text",
-        value="Expectations were low, enjoyment was high",
     )
     btn_single = gr.Button("Send single text request")
     with gr.Row():
         with gr.Column(scale=50):
             output_single_vanilla = gr.Markdown(
                 label="Output single vanilla",
                 value=get_message_single(**defaults_vanilla_single),
             )
         with gr.Column(scale=50):
             output_single_bt = gr.Markdown(
                 label="Output single bt", value=get_message_single(**defaults_bt_single)
             )
     btn_single.click(
         fn=dispatch_single,
-        inputs=[input_model_single, address_input_vanilla, address_input_bettertransformer],
         outputs=[output_single_vanilla, output_single_bt],
     )
     input_n_spam_artif = gr.Number(
         label="Number of inputs to send",
-        value=8,
     )
     sequence_length = gr.Number(
         label="Sequence length (in tokens)",
@@ -155,28 +158,35 @@ with gr.Blocks() as demo:
     )
     padding_ratio = gr.Number(
         label="Padding ratio",
-        value=0.5,
-    )
-    btn_spam_artif = gr.Button(
-        "Spam text requests (using artificial data)"
     )
     with gr.Row():
         with gr.Column(scale=50):
             output_spam_vanilla_artif = gr.Markdown(
                 label="Output spam vanilla",
                 value=get_message_spam(**defaults_vanilla_spam),
             )
         with gr.Column(scale=50):
             output_spam_bt_artif = gr.Markdown(
                 label="Output spam bt", value=get_message_spam(**defaults_bt_spam)
             )
     btn_spam_artif.click(
         fn=dispatch_spam_artif,
-        inputs=[input_n_spam_artif, sequence_length, padding_ratio, address_input_vanilla, address_input_bettertransformer],
         outputs=[output_spam_vanilla_artif, output_spam_bt_artif],
     )
 demo.queue(concurrency_count=1)
-demo.launch()

 import json
 import datasets
+import gradio as gr
 import torch
+from backend import (get_message_single, get_message_spam, send_single,
+                     send_spam, tokenizer)
+from defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA,
+                      defaults_bt_single, defaults_bt_spam,
+                      defaults_vanilla_single, defaults_vanilla_spam)
+def dispatch_single(
+    input_model_single, address_input_vanilla, address_input_bettertransformer
+):
     result_vanilla = send_single(input_model_single, address_input_vanilla)
+    result_bettertransformer = send_single(
+        input_model_single, address_input_bettertransformer
+    )
     return result_vanilla, result_bettertransformer
+def dispatch_spam_artif(
+    input_n_spam_artif,
+    sequence_length,
+    padding_ratio,
+    address_input_vanilla,
+    address_input_bettertransformer,
+):
     sequence_length = int(sequence_length)
     input_n_spam_artif = int(input_n_spam_artif)
     inp_tokens = torch.randint(tokenizer.vocab_size - 1, (sequence_length,)) + 1
     n_pads = max(int(padding_ratio * len(inp_tokens)), 1)
+    inp_tokens[-n_pads:] = 0
     inp_tokens[0] = 101
+    inp_tokens[-n_pads - 1] = 102
     attention_mask = torch.zeros((sequence_length,), dtype=torch.int64)
+    attention_mask[:-n_pads] = 1
+    str_input = json.dumps(
+        {
+            "input_ids": inp_tokens.cpu().tolist(),
+            "attention_mask": attention_mask.cpu().tolist(),
+            "pre_tokenized": True,
+        }
+    )
     input_dataset = datasets.Dataset.from_dict(
         {"sentence": [str_input for _ in range(input_n_spam_artif)]}
     )
     result_vanilla = send_spam(input_dataset, address_input_vanilla)
     result_bettertransformer = send_spam(input_dataset, address_input_bettertransformer)
     return result_vanilla, result_bettertransformer
 TTILE_IMAGE = """
 <div
     style="
 </div>
 """
 with gr.Blocks() as demo:
     gr.HTML(TTILE_IMAGE)
+    gr.Markdown(
+        "# Speed up your inference and support more workload with PyTorch's BetterTransformer 🤗"
+    )
     gr.Markdown(
         """
+    Let's try out [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) + [TorchServe](https://pytorch.org/serve/)!
+    BetterTransformer is a stable feature made available with [PyTorch 1.13](https://pytorch.org/blog/PyTorch-1.13-release/) allowing to use a fastpath execution for encoder attention blocks. Depending on your hardware, batch size, sequence length, padding ratio, it can bring large speedups at inference **at no cost in prediction quality**. As a one-liner, you can convert your 🤗 Transformers models to use BetterTransformer thanks to the integration in the [🤗 Optimum](https://github.com/huggingface/optimum) library:
     ```
     from optimum.bettertransformer import BetterTransformer
     better_model = BetterTransformer.transform(model)
     ```
+    This Space is a demo of an **end-to-end** deployement of PyTorch eager-mode models, both with and without BetterTransformer. The goal is to see what are the benefits server-side and client-side of using BetterTransformer. The model used is [`distilbert-base-uncased-finetuned-sst-2-english`](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english), and TorchServe is parametrized to use a maximum batch size of 8. **Beware:** you may be queued in case several persons use the Space at the same time.
+    For more details on the TorchServe implementation and to reproduce, see [this reference code](https://github.com/fxmarty/bettertransformer_demo). For more details on BetterTransformer, check out the [blog post on PyTorch's Medium](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2), and [the Optimum documentation](https://huggingface.co/docs/optimum/bettertransformer/overview)!"""
     )
+    gr.Markdown("## Single input scenario")
     address_input_vanilla = gr.Textbox(
         max_lines=1, label="ip vanilla", value=ADDRESS_VANILLA, visible=False
     )
     input_model_single = gr.Textbox(
         max_lines=1,
         label="Text",
+        value="Expectations were low, enjoyment was high. Although the music was not top level, the story was well-paced.",
     )
     btn_single = gr.Button("Send single text request")
     with gr.Row():
         with gr.Column(scale=50):
+            gr.Markdown("### Vanilla Transformers + TorchServe")
             output_single_vanilla = gr.Markdown(
                 label="Output single vanilla",
                 value=get_message_single(**defaults_vanilla_single),
             )
         with gr.Column(scale=50):
+            gr.Markdown("### BetterTransformer + TorchServe")
             output_single_bt = gr.Markdown(
                 label="Output single bt", value=get_message_single(**defaults_bt_single)
             )
     btn_single.click(
         fn=dispatch_single,
+        inputs=[
+            input_model_single,
+            address_input_vanilla,
+            address_input_bettertransformer,
+        ],
         outputs=[output_single_vanilla, output_single_bt],
     )
+    gr.Markdown(
+        """
+    **Beware that the end-to-end latency can be impacted by a different ping time between the two servers.**
+    ## Heavy workload scenario
+    """
+    )
     input_n_spam_artif = gr.Number(
         label="Number of inputs to send",
+        value=80,
     )
     sequence_length = gr.Number(
         label="Sequence length (in tokens)",
     )
     padding_ratio = gr.Number(
         label="Padding ratio",
+        value=0.7,
     )
+    btn_spam_artif = gr.Button("Spam text requests (using artificial data)")
     with gr.Row():
         with gr.Column(scale=50):
+            gr.Markdown("### Vanilla Transformers + TorchServe")
             output_spam_vanilla_artif = gr.Markdown(
                 label="Output spam vanilla",
                 value=get_message_spam(**defaults_vanilla_spam),
             )
         with gr.Column(scale=50):
+            gr.Markdown("### BetterTransformer + TorchServe")
             output_spam_bt_artif = gr.Markdown(
                 label="Output spam bt", value=get_message_spam(**defaults_bt_spam)
             )
     btn_spam_artif.click(
         fn=dispatch_spam_artif,
+        inputs=[
+            input_n_spam_artif,
+            sequence_length,
+            padding_ratio,
+            address_input_vanilla,
+            address_input_bettertransformer,
+        ],
         outputs=[output_spam_vanilla_artif, output_spam_bt_artif],
     )
 demo.queue(concurrency_count=1)
+demo.launch()

backend.py CHANGED Viewed

@@ -1,16 +1,12 @@
 import json
-from defaults import (
-    ADDRESS_BETTERTRANSFORMER,
-    ADDRESS_VANILLA,
-    HEADERS,
-    MODEL_NAME,
-)
 from requests_futures.sessions import FuturesSession
 from transformers import AutoTokenizer
-import time
 RETURN_MESSAGE_SINGLE = """
 Inference statistics:
@@ -23,10 +19,8 @@ Inference statistics:
 * Padding ratio: 0.0 %
 """
-RETURN_MESSAGE_SPAM = (
-    """
-Processing """
-    + "NUMBER REQ" + """ inputs sent asynchronously. Grab a coffee.
 Inference statistics:
@@ -37,10 +31,10 @@ Inference statistics:
 * Mean sequence length: {4} tokens
 * Effective mean batch size: {5}
 """
-)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 def get_message_single(
     status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
 ):
@@ -70,14 +64,16 @@ def get_message_spam(
 SESSION = FuturesSession()
-def send_single(input_model_vanilla, address: str):
     assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
     # should not take more than 10 s, so timeout if that's the case
-    start = time.time()
-    promise = SESSION.post(
-        address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"), timeout=10
     )
     try:
         response = promise.result()  # resolve ASAP
@@ -98,7 +94,7 @@ def send_single(input_model_vanilla, address: str):
     )
-def send_spam(inp, address: str):
     assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
     mean_inference_latency = 0
@@ -129,7 +125,7 @@ def send_spam(inp, address: str):
             response = promise.result()  # resolve ASAP
         except Exception as e:
             return f"{e}"
         end = max(time.time(), end)
     # then other metrics

 import json
+import time
+from datasets import Dataset
 from requests_futures.sessions import FuturesSession
 from transformers import AutoTokenizer
+from defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA, HEADERS,
+                      MODEL_NAME)
 RETURN_MESSAGE_SINGLE = """
 Inference statistics:
 * Padding ratio: 0.0 %
 """
+RETURN_MESSAGE_SPAM = """
+Processing inputs sent asynchronously. Grab a coffee.
 Inference statistics:
 * Mean sequence length: {4} tokens
 * Effective mean batch size: {5}
 """
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 def get_message_single(
     status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
 ):
 SESSION = FuturesSession()
+def send_single(input_model_vanilla: str, address: str):
     assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
     # should not take more than 10 s, so timeout if that's the case
+    inp = json.dumps({"text": input_model_vanilla, "pre_tokenized": False}).encode(
+        "utf-8"
     )
+    start = time.time()
+    promise = SESSION.post(address, headers=HEADERS, data=inp, timeout=10)
     try:
         response = promise.result()  # resolve ASAP
     )
+def send_spam(inp: Dataset, address: str):
     assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
     mean_inference_latency = 0
             response = promise.result()  # resolve ASAP
         except Exception as e:
             return f"{e}"
         end = max(time.time(), end)
     # then other metrics

defaults.py CHANGED Viewed

@@ -1,35 +1,35 @@
 defaults_vanilla_single = {
     "status": 200,
-    "prediction": "Accepted",
-    "inf_latency": 20.77,
-    "peak_gpu_memory": 2717.36,
-    "end_to_end_latency": 93.65,
 }
 defaults_bt_single = {
     "status": 200,
-    "prediction": "Accepted",
-    "inf_latency": 20.77,
-    "peak_gpu_memory": 2717.36,
-    "end_to_end_latency": 93.65,
 }
 defaults_vanilla_spam = {
-    "throughput": 20,
-    "mean_inference_latency": 29.69,
-    "mean_peak_gpu_memory": 3620.9,
-    "mean_padding_ratio": 35.26,
-    "mean_sequence_length": 39.395,
-    "effective_batch_size": 8,
 }
 defaults_bt_spam = {
-    "throughput": 20,
-    "mean_inference_latency": 29.69,
-    "mean_peak_gpu_memory": 3620.9,
-    "mean_padding_ratio": 35.26,
-    "mean_sequence_length": 39.395,
-    "effective_batch_size": 8,
 }
 BATCH_SIZE = 8  # fixed!
@@ -37,4 +37,4 @@ BATCH_SIZE = 8  # fixed!
 HEADERS = {"Content-Type": "text/plain"}
 ADDRESS_VANILLA = "http://3.83.142.46:8080/predictions/my_tc"
 ADDRESS_BETTERTRANSFORMER = "http://3.95.136.2:8080/predictions/my_tc"
-MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

 defaults_vanilla_single = {
     "status": 200,
+    "prediction": "Positive",
+    "inf_latency": 7.66,
+    "peak_gpu_memory": 2706.21,
+    "end_to_end_latency": 309.65,
 }
 defaults_bt_single = {
     "status": 200,
+    "prediction": "Positive",
+    "inf_latency": 6.01,
+    "peak_gpu_memory": 2706.22,
+    "end_to_end_latency": 303.53,
 }
 defaults_vanilla_spam = {
+    "throughput": 28.04,
+    "mean_inference_latency": 24.43,
+    "mean_peak_gpu_memory": 2907.92,
+    "mean_padding_ratio": 69.53,
+    "mean_sequence_length": 128.0,
+    "effective_batch_size": 4.3,
 }
 defaults_bt_spam = {
+    "throughput": 38.53,
+    "mean_inference_latency": 12.73,
+    "mean_peak_gpu_memory": 2761.64,
+    "mean_padding_ratio": 69.53,
+    "mean_sequence_length": 128.0,
+    "effective_batch_size": 4.7,
 }
 BATCH_SIZE = 8  # fixed!
 HEADERS = {"Content-Type": "text/plain"}
 ADDRESS_VANILLA = "http://3.83.142.46:8080/predictions/my_tc"
 ADDRESS_BETTERTRANSFORMER = "http://3.95.136.2:8080/predictions/my_tc"
+MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"