# Remove any existing PyTorch stack, then install the CUDA 12.8 nightly builds
# (required by the vLLM nightly wheel installed below).
pip uninstall -y torch torchvision torchaudio
pip install --pre torch torchvision torchaudio \
  --index-url https://download.pytorch.org/whl/nightly/cu128

# Install a pinned vLLM nightly wheel directly from the vLLM S3 bucket.
# The wheel is abi3 (cp38+), manylinux1, x86_64 only.
export VLLM_VERSION=0.9.0
pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl

# Runtime helpers: hf_transfer for fast HF Hub downloads, FlashInfer attention
# kernels for vLLM, and requests for the chat client script below.
pip install hf_transfer
pip install flashinfer-python
pip install requests

# Launch an OpenAI-compatible API server on 0.0.0.0:8000 serving the FP4 model.
# NOTE(review): model id spelling here ("Deepseek-...-fp4") differs from the
# repo name seen later ("DeepSeek-...-a16-fp4") — confirm which is correct.
python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model miike-ai/Deepseek-R1-Distill-Llama-70B-fp4
import requests
import json
import sys
from typing import List, Dict

class ChatSession:
    """A running conversation against a local OpenAI-compatible endpoint.

    Accumulates chat messages and streams assistant replies to stdout via
    server-sent events (SSE), appending each completed reply to the history.
    """

    def __init__(self, model: str = "miike-ai/Deepseek-R1-Distill-Llama-70B-fp4"):
        # NOTE(review): this model id differs from the repo name
        # "DeepSeek-R1-Distill-Llama-70B-a16-fp4" seen elsewhere — confirm.
        self.url = "http://localhost:8000/v1/chat/completions"
        self.model = model
        self.messages: List[Dict[str, str]] = []
        self.headers = {
            "Content-Type": "application/json",
            "Accept": "text/event-stream"  # ask the server for SSE streaming
        }

    def add_message(self, role: str, content: str):
        """Append one message ({'role', 'content'}) to the conversation."""
        self.messages.append({"role": role, "content": content})

    def stream_response(self):
        """POST the conversation and print the reply as SSE chunks arrive.

        Each `data: {...}` line carries a JSON chunk; its delta content is
        printed immediately. On `data: [DONE]` the accumulated reply is
        stored in the history as an assistant message. Connection failures
        and malformed chunks are reported but never raised to the caller.
        """
        payload = {
            "model": self.model,
            "messages": self.messages,
            "temperature": 0.7,
            "stream": True
        }

        try:
            with requests.post(self.url, headers=self.headers, json=payload, stream=True) as response:
                if response.status_code != 200:
                    print(f"\nError: API request failed with status code {response.status_code}")
                    print("Response:", response.text)
                    return

                print("\nAssistant: ", end="", flush=True)
                pieces = []

                for raw in response.iter_lines():
                    if not raw:
                        continue  # SSE keep-alive / blank separator lines
                    try:
                        decoded = raw.decode('utf-8')
                        if not decoded.startswith('data: '):
                            continue
                        body = decoded[len('data: '):]
                        if body.strip() == '[DONE]':
                            break  # server signalled end of stream
                        try:
                            chunk = json.loads(body)
                        except json.JSONDecodeError:
                            continue  # skip malformed chunks silently
                        delta = chunk.get('choices', [{}])[0].get('delta', {})
                        piece = delta.get('content')
                        if piece:
                            print(piece, end="", flush=True)
                            pieces.append(piece)
                    except Exception as e:
                        print(f"\nError processing chunk: {str(e)}")
                        continue

                print()  # terminate the streamed line
                reply = "".join(pieces)
                if reply:
                    self.add_message("assistant", reply)

        except requests.exceptions.ConnectionError:
            print("\nError: Could not connect to the API. Make sure the server is running on localhost:8000")
        except Exception as e:
            print(f"\nUnexpected error: {str(e)}")

def run_chat_interface():
    """
    Run an interactive chat interface in the terminal
    """
    print("\nChat Interface for Local API Testing")
    print("=====================================")
    print("Endpoint: http://localhost:8000/v1/chat/completions")
    print("Type 'exit' or 'quit' to end the chat")
    print("Type 'clear' to start a new chat session")
    print("----------------------------------------\n")

    chat = ChatSession()

    while True:
        try:
            user_input = input("User: ").strip()

            if not user_input:
                continue

            if user_input.lower() in ['exit', 'quit']:
                print("\nGoodbye!")
                break

            if user_input.lower() == 'clear':
                chat = ChatSession()
                print("\nStarted new chat session")
                continue

            chat.add_message("user", user_input)
            chat.stream_response()

        except KeyboardInterrupt:
            print("\n\nGoodbye!")
            break
        except EOFError:
            print("\nGoodbye!")
            break

if __name__ == "__main__":
    run_chat_interface()
Downloads last month
54
Safetensors
Model size
40.6B params
Tensor types: BF16 · F8_E4M3 · U8
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for miike-ai/DeepSeek-R1-Distill-Llama-70B-a16-fp4

Quantized (58 models, including this model)

Collection including miike-ai/DeepSeek-R1-Distill-Llama-70B-a16-fp4