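"""Client-side helpers for the BetterTransformer benchmark Space.

Sends single or batched ("spam") requests to the vanilla and the
BetterTransformer inference endpoints and formats the latency / GPU memory
statistics returned by the servers.
"""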
import json

from datasets import load_dataset

from .defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA, HEADERS,
                       SPAM_N_REQUESTS)
from .utils import ElapsedFuturesSession

# Pool of real sentences (SST-2 validation split) used as inputs for the
# spam benchmark, so that requests carry realistic, variable-length text.
data = load_dataset("glue", "sst2", split="validation")

RETURN_MESSAGE_SINGLE = """
Inference statistics:
* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""

RETURN_MESSAGE_SPAM = (
    """
Processing """
    + f"{SPAM_N_REQUESTS}"
    + """ inputs sent asynchronously. Grab a coffee.
Inference statistics:
* Promise resolution time: {0} ms
* Mean inference latency (preprocessing/forward/postprocessing): {1} ms
* Mean peak GPU memory: {2} MB
* Mean padding ratio: {3} %
* Mean sequence length: {4} tokens
"""
)


def get_message_single(
    status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
):
    return RETURN_MESSAGE_SINGLE.format(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )


def get_message_spam(
    resolution_time,
    mean_inference_latency,
    mean_peak_gpu_memory,
    mean_padding_ratio,
    mean_sequence_length,
    **kwargs,
):
    return RETURN_MESSAGE_SPAM.format(
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
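

# Shared session for all requests. ElapsedFuturesSession (from .utils) is
# assumed to stamp each response with its wall-clock resolution time, exposed
# below as `response.elapsed` in milliseconds.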
SESSION = ElapsedFuturesSession()
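
# Both endpoints are expected to reply with a JSON list laid out as
#   [prediction, inference_latency_ms, peak_gpu_memory_mb, n_pads, n_elems, sequence_length]
# (inferred from how the responses are indexed in the two senders below).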


def send_single(input_model_vanilla, address: str):
    """Send one input to the given endpoint and format its statistics."""
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]

    promise = SESSION.post(
        address, headers=HEADERS, data=input_model_vanilla.encode("utf-8")
    )
    response = promise.result()  # block until the response arrives

    status = response.status_code
    response_text = json.loads(response.text)
    prediction = response_text[0]
    inf_latency = response_text[1]
    peak_gpu_memory = response_text[2]
    end_to_end_latency = response.elapsed

    return get_message_single(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )


def send_spam(address: str):
    """Fire SPAM_N_REQUESTS asynchronous requests and aggregate the statistics."""
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
    assert SPAM_N_REQUESTS <= len(data)

    # Draw SPAM_N_REQUESTS random sentences from the SST-2 validation pool.
    inp = data.shuffle().select(range(SPAM_N_REQUESTS))

    resolution_time = 0
    mean_inference_latency = 0
    mean_peak_gpu_memory = 0
    n_pads = 0
    n_elems = 0
    sequence_length = 0

    # Send everything first so the requests are in flight concurrently...
    promises = []
    for i in range(SPAM_N_REQUESTS):
        input_data = inp[i]["sentence"].encode("utf-8")
        promises.append(SESSION.post(address, headers=HEADERS, data=input_data))

    # ...then collect the responses and accumulate per-request statistics.
    for promise in promises:
        response = promise.result()
        response_text = json.loads(response.text)

        # The slowest response bounds the resolution time of the whole batch.
        resolution_time = max(resolution_time, response.elapsed)
        mean_inference_latency += response_text[1]
        mean_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        sequence_length += response_text[5]

    mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
    mean_sequence_length = round(sequence_length / SPAM_N_REQUESTS, 2)
    resolution_time = round(resolution_time, 2)
    mean_inference_latency = round(mean_inference_latency / SPAM_N_REQUESTS, 2)
    mean_peak_gpu_memory = round(mean_peak_gpu_memory / SPAM_N_REQUESTS, 2)

    return get_message_spam(
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
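

# Example usage (a sketch, not part of the Space itself; assumes the two
# servers are up and this module is imported as part of its package):
#
#   print(send_single("This movie was great!", ADDRESS_VANILLA))
#   print(send_spam(ADDRESS_BETTERTRANSFORMER))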