mashirong committed
Commit 1e90cb3
1 Parent(s): 935c70a
Files changed (1)
  1. README.md +8 -12
README.md CHANGED
@@ -217,6 +217,8 @@ We also provide OpenAI-Compatible API at DeepSeek Platform: [platform.deepseek.c
 
 ## 8. How to run locally
 **To utilize DeepSeek-V2 in BF16 format for inference, 80GB*8 GPUs are required.**
+
+**To utilize DeepSeek-V2-Lite in BF16 format for inference, 40GB*1 GPU is required.**
 ### Inference with Huggingface's Transformers
 You can directly employ [Huggingface's Transformers](https://github.com/huggingface/transformers) for model inference.
 
@@ -225,12 +227,9 @@ You can directly employ [Huggingface's Transformers](https://github.com/huggingf
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
 
-model_name = "deepseek-ai/DeepSeek-V2"
+model_name = "deepseek-ai/DeepSeek-V2-Lite"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-# `max_memory` should be set based on your devices
-max_memory = {i: "75GB" for i in range(8)}
-# `device_map` cannot be set to `auto`
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="sequential", torch_dtype=torch.bfloat16, max_memory=max_memory, attn_implementation="eager")
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
 model.generation_config = GenerationConfig.from_pretrained(model_name)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id
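The hunk above ends with the model loaded but not yet used; the `print(result)` context line in the next hunk indicates the README continues with a plain text-completion call. A minimal sketch of such a continuation, assuming the `deepseek-ai/DeepSeek-V2-Lite` checkpoint from the `+` lines and a purely illustrative prompt string:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "deepseek-ai/DeepSeek-V2-Lite"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Hypothetical prompt; any text works for plain completion.
text = "An attention function can be described as"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs.to(model.device), max_new_tokens=100)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)
```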
 
@@ -247,12 +246,9 @@ print(result)
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
 
-model_name = "deepseek-ai/DeepSeek-V2-Chat"
+model_name = "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-# `max_memory` should be set based on your devices
-max_memory = {i: "75GB" for i in range(8)}
-# `device_map` cannot be set to `auto`
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="sequential", torch_dtype=torch.bfloat16, max_memory=max_memory, attn_implementation="eager")
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
 model.generation_config = GenerationConfig.from_pretrained(model_name)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id
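As with the base model, the chat-model hunk stops at the generation config. A hedged sketch of a single-turn chat completion that could follow, using the tokenizer's chat template; the user message here is an assumption for illustration only:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "deepseek-ai/DeepSeek-V2-Lite-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Hypothetical single-turn conversation.
messages = [{"role": "user", "content": "Write a piece of quicksort code in C++"}]
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(input_tensor.to(model.device), max_new_tokens=100)
# Decode only the newly generated tokens, not the prompt.
result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
print(result)
```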
 
@@ -297,8 +293,8 @@ To utilize [vLLM](https://github.com/vllm-project/vllm) for model inference, ple
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 
-max_model_len, tp_size = 8192, 8
-model_name = "deepseek-ai/DeepSeek-V2-Chat"
+max_model_len, tp_size = 8192, 1
+model_name = "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 llm = LLM(model=model_name, tensor_parallel_size=tp_size, max_model_len=max_model_len, trust_remote_code=True, enforce_eager=True)
 sampling_params = SamplingParams(temperature=0.3, max_tokens=256, stop_token_ids=[tokenizer.eos_token_id])
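The vLLM hunk likewise ends at `sampling_params`. A sketch of how batched chat generation might proceed from there, assuming hypothetical user messages and a vLLM version whose `llm.generate` accepts token-id prompts via `prompt_token_ids`:

```python
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

max_model_len, tp_size = 8192, 1
model_name = "deepseek-ai/DeepSeek-V2-Lite-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = LLM(model=model_name, tensor_parallel_size=tp_size, max_model_len=max_model_len, trust_remote_code=True, enforce_eager=True)
sampling_params = SamplingParams(temperature=0.3, max_tokens=256, stop_token_ids=[tokenizer.eos_token_id])

# Hypothetical batch of single-turn conversations.
messages_list = [
    [{"role": "user", "content": "Who are you?"}],
    [{"role": "user", "content": "Write a piece of quicksort code in C++."}],
]
# Render each conversation through the chat template into prompt token ids.
prompt_token_ids = [tokenizer.apply_chat_template(messages, add_generation_prompt=True) for messages in messages_list]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)
print([output.outputs[0].text for output in outputs])
```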
 