Delete modal
- modal/config.py +0 -20
- modal/engine.py +0 -103
- modal/requirements.txt +0 -0
modal/config.py
DELETED
@@ -1,20 +0,0 @@
-MODEL_DIR = "/model"
-BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1"
-
-
-# Name the stub (it should be all lower case)
-STUB_NAME = f"{BASE_MODEL.lower()}-deployement"
-
-### Server-level default configs
-# keep warm: the warm-pool size, i.e. the minimum number of containers kept running for the serverless function (Modal scales up more containers from there on demand)
-
-KEEP_WARM = 1
-
-# num of concurrent requests: the number of concurrent requests a single container should handle
-NUM_CONCURRENT_REQUESTS = 10
-
-# timeout: the server timeout (in seconds) after which the container is shut down
-TIMEOUT = 600
-
-# Number of GPUs to use
-GPU_COUNT = 1
modal/engine.py
DELETED
@@ -1,103 +0,0 @@
-import os
-import asyncio
-
-from queue import Empty
-from typing import List, Union
-
-# Required Modal imports
-from modal import Image, Stub, gpu, method, enter, exit
-
-# Import the constants defined in config.py
-from config import (
-    MODEL_DIR,
-    BASE_MODEL,
-    STUB_NAME,
-    NUM_CONCURRENT_REQUESTS,
-    TIMEOUT,
-    GPU_COUNT
-)
-
-# Define our GPU Config
-
-if BASE_MODEL == "mistralai/Mistral-7B-Instruct-v0.1":
-    GPU_CONFIG = gpu.A100(count=GPU_COUNT, memory=80)
-else:
-    GPU_CONFIG = gpu.Any(count=GPU_COUNT)
-
-stub = Stub(name=STUB_NAME)
-
-def download_model_to_folder():
-    from transformers.utils import move_cache
-    from huggingface_hub import snapshot_download
-
-    os.makedirs(MODEL_DIR, exist_ok=True)
-
-    snapshot_download(
-        BASE_MODEL,
-        local_dir=MODEL_DIR,
-        ignore_patterns=["*.pt"],  # Using safetensors
-    )
-
-    move_cache()
-
-HF_DOCKER_IMAGE = (
-    Image.from_registry("nvidia/cuda:12.1.0-base-ubuntu22.04", add_python="3.10").pip_install_from_requirements("./requirements.txt")
-    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
-    .run_function(download_model_to_folder)
-)
-
-@stub.cls(
-    gpu=GPU_CONFIG,
-    timeout=TIMEOUT,
-    container_idle_timeout=TIMEOUT,
-    allow_concurrent_inputs=NUM_CONCURRENT_REQUESTS,
-    image=HF_DOCKER_IMAGE,
-)
-class HFEngine:
-    model_name_or_path: str = MODEL_DIR
-    device: str = "cuda"
-
-    @enter()
-    def start_engine(self):
-        from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-
-        self.model = AutoModelForCausalLM.from_pretrained(self.model_name_or_path, trust_remote_code=True).to(self.device)
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, trust_remote_code=True)
-        self.streamer = TextIteratorStreamer(self.tokenizer)
-        return self
-
-    @exit()
-    def terminate_engine(self):
-        import gc
-        import torch
-
-        del self.model
-        torch.cuda.synchronize()
-        gc.collect()
-
-    @method()
-    async def stream(self, chat_input: Union[str, List[dict]], generation_kwargs: dict):
-        from threading import Thread
-
-        if isinstance(chat_input, str):
-            chat_input = [{"role": "user", "content": chat_input}]
-        input_ids = self.tokenizer.apply_chat_template(
-            conversation=chat_input, tokenize=True, return_tensors="pt"
-        ).to(self.device)
-
-        gen_kwargs = dict(
-            input_ids=input_ids,
-            streamer=self.streamer,
-            pad_token_id=self.tokenizer.eos_token_id,
-            **generation_kwargs
-        )
-
-        thread = Thread(target=self.model.generate, kwargs=gen_kwargs)
-        thread.start()
-
-        for next_token in self.streamer:
-            try:
-                if next_token is not None:
-                    yield next_token
-            except Empty:
-                await asyncio.sleep(0.001)
modal/requirements.txt
DELETED
File without changes
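For reference, below is a minimal sketch of how the deleted HFEngine might have been driven from a local entrypoint. This driver is not part of the original files: the entrypoint name, prompt, and generation parameters are illustrative, and it assumes Modal's Stub-era client, where a streaming @method() generator is consumed with .remote_gen().

# Hypothetical local driver for the deleted engine (not part of this PR).
# Assumes the Stub-era Modal client; adjust the calls to the client version you run.
from engine import stub, HFEngine


@stub.local_entrypoint()
def main():
    engine = HFEngine()
    generation_kwargs = {"max_new_tokens": 256, "do_sample": True, "temperature": 0.7}

    # HFEngine.stream yields tokens as the model generates them in the container.
    for token in engine.stream.remote_gen("Tell me a joke.", generation_kwargs):
        print(token, end="", flush=True)

Under these assumptions, the sketch would be started with `modal run engine.py` from inside the modal/ directory.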