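"""Client-side helpers for the BetterTransformer benchmark Space.

Sends single or batched ("spam") requests to the vanilla and the
BetterTransformer inference endpoints and formats the latency / GPU memory
statistics returned by the servers.
"""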
import json

from datasets import load_dataset

from .defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA, HEADERS,
                       SPAM_N_REQUESTS)
from .utils import ElapsedFuturesSession

# Pool of real sentences (SST-2 validation split) used as inputs for the
# spam benchmark, so that requests carry realistic, variable-length text.
data = load_dataset("glue", "sst2", split="validation")

RETURN_MESSAGE_SINGLE = """
Inference statistics:
* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""

RETURN_MESSAGE_SPAM = (
    """
Processing """
    + f"{SPAM_N_REQUESTS}"
    + """ inputs sent asynchronously. Grab a coffee.
Inference statistics:
* Promise resolution time: {0} ms
* Mean inference latency (preprocessing/forward/postprocessing): {1} ms
* Mean peak GPU memory: {2} MB
* Mean padding ratio: {3} %
* Mean sequence length: {4} tokens
"""
)


def get_message_single(
    status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
):
    return RETURN_MESSAGE_SINGLE.format(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )


def get_message_spam(
    resolution_time,
    mean_inference_latency,
    mean_peak_gpu_memory,
    mean_padding_ratio,
    mean_sequence_length,
    **kwargs,
):
    return RETURN_MESSAGE_SPAM.format(
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
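

# Shared session for all requests. ElapsedFuturesSession (from .utils) is
# assumed to stamp each response with its wall-clock resolution time, exposed
# below as `response.elapsed` in milliseconds.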
SESSION = ElapsedFuturesSession()
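
# Both endpoints are expected to reply with a JSON list laid out as
#   [prediction, inference_latency_ms, peak_gpu_memory_mb, n_pads, n_elems, sequence_length]
# (inferred from how the responses are indexed in the two senders below).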


def send_single(input_model_vanilla, address: str):
    """Send one input to the given endpoint and format its statistics."""
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]

    promise = SESSION.post(
        address, headers=HEADERS, data=input_model_vanilla.encode("utf-8")
    )
    response = promise.result()  # block until the response arrives

    status = response.status_code
    response_text = json.loads(response.text)
    prediction = response_text[0]
    inf_latency = response_text[1]
    peak_gpu_memory = response_text[2]
    end_to_end_latency = response.elapsed

    return get_message_single(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )


def send_spam(address: str):
    """Fire SPAM_N_REQUESTS asynchronous requests and aggregate the statistics."""
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
    assert SPAM_N_REQUESTS <= len(data)

    # Draw SPAM_N_REQUESTS random sentences from the SST-2 validation pool.
    inp = data.shuffle().select(range(SPAM_N_REQUESTS))

    resolution_time = 0
    mean_inference_latency = 0
    mean_peak_gpu_memory = 0
    n_pads = 0
    n_elems = 0
    sequence_length = 0

    # Send everything first so the requests are in flight concurrently...
    promises = []
    for i in range(SPAM_N_REQUESTS):
        input_data = inp[i]["sentence"].encode("utf-8")
        promises.append(SESSION.post(address, headers=HEADERS, data=input_data))

    # ...then collect the responses and accumulate per-request statistics.
    for promise in promises:
        response = promise.result()
        response_text = json.loads(response.text)

        # The slowest response bounds the resolution time of the whole batch.
        resolution_time = max(resolution_time, response.elapsed)
        mean_inference_latency += response_text[1]
        mean_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        sequence_length += response_text[5]

    mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
    mean_sequence_length = round(sequence_length / SPAM_N_REQUESTS, 2)
    resolution_time = round(resolution_time, 2)
    mean_inference_latency = round(mean_inference_latency / SPAM_N_REQUESTS, 2)
    mean_peak_gpu_memory = round(mean_peak_gpu_memory / SPAM_N_REQUESTS, 2)

    return get_message_spam(
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
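

# Example usage (a sketch, not part of the Space itself; assumes the two
# servers are up and this module is imported as part of its package):
#
#   print(send_single("This movie was great!", ADDRESS_VANILLA))
#   print(send_spam(ADDRESS_BETTERTRANSFORMER))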