Llama FP4
FP4 compressed models (6 items)
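To serve one of these models locally with vLLM, install a PyTorch nightly built against CUDA 12.8 plus a vLLM nightly wheel, then launch an OpenAI-compatible server: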
# Remove any existing PyTorch packages, then install the CUDA 12.8 nightly build
pip uninstall -y torch torchvision torchaudio
pip install --pre torch torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/nightly/cu128

# Install a vLLM nightly wheel (pinned to 0.9.0 here)
export VLLM_VERSION=0.9.0
pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl

# Supporting packages: faster Hugging Face downloads, FlashInfer attention kernels,
# and requests for the client script below
pip install hf_transfer flashinfer-python requests

# Launch an OpenAI-compatible API server for the FP4 model on port 8000
python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model miike-ai/Llama-3.1-8B-Instruct-fp4
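Before wiring up a full chat loop, you can sanity-check the server with a single non-streaming request. This is a minimal sketch assuming the server above is already listening on localhost:8000; the endpoint and model name come from the serve command, while the prompt and timeout are just illustrative:

# Quick sanity check: one non-streaming chat completion.
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "miike-ai/Llama-3.1-8B-Instruct-fp4",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "temperature": 0.7,
    },
    timeout=120,  # generous timeout for the first request, which may include warmup
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])

Once that returns a completion, the interactive client below does the same thing with streaming enabled, printing tokens as they arrive: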
import requests
import json
from typing import List, Dict


class ChatSession:
    def __init__(self, model: str = "miike-ai/Llama-3.1-8B-Instruct-fp4"):
        self.url = "http://localhost:8000/v1/chat/completions"
        self.model = model
        self.messages: List[Dict[str, str]] = []
        self.headers = {
            "Content-Type": "application/json",
            "Accept": "text/event-stream",  # For streaming support
        }

    def add_message(self, role: str, content: str):
        self.messages.append({"role": role, "content": content})

    def stream_response(self):
        data = {
            "model": self.model,
            "messages": self.messages,
            "temperature": 0.7,
            "stream": True,
        }
        try:
            with requests.post(self.url, headers=self.headers, json=data, stream=True) as response:
                if response.status_code != 200:
                    print(f"\nError: API request failed with status code {response.status_code}")
                    print("Response:", response.text)
                    return
                print("\nAssistant: ", end="", flush=True)
                collected_content = []
                for line in response.iter_lines():
                    if line:
                        try:
                            line = line.decode('utf-8')
                            if line.startswith('data: '):
                                json_str = line[6:]  # Remove 'data: ' prefix
                                if json_str.strip() == '[DONE]':
                                    break
                                try:
                                    chunk = json.loads(json_str)
                                    if content := chunk.get('choices', [{}])[0].get('delta', {}).get('content'):
                                        print(content, end="", flush=True)
                                        collected_content.append(content)
                                except json.JSONDecodeError:
                                    continue
                        except Exception as e:
                            print(f"\nError processing chunk: {str(e)}")
                            continue
                print()  # New line after response
                full_content = "".join(collected_content)
                if full_content:
                    self.add_message("assistant", full_content)
        except requests.exceptions.ConnectionError:
            print("\nError: Could not connect to the API. Make sure the server is running on localhost:8000")
        except Exception as e:
            print(f"\nUnexpected error: {str(e)}")


def run_chat_interface():
    """Run an interactive chat interface in the terminal."""
    print("\nChat Interface for Local API Testing")
    print("=====================================")
    print("Endpoint: http://localhost:8000/v1/chat/completions")
    print("Type 'exit' or 'quit' to end the chat")
    print("Type 'clear' to start a new chat session")
    print("----------------------------------------\n")
    chat = ChatSession()
    while True:
        try:
            user_input = input("User: ").strip()
            if not user_input:
                continue
            if user_input.lower() in ['exit', 'quit']:
                print("\nGoodbye!")
                break
            if user_input.lower() == 'clear':
                chat = ChatSession()
                print("\nStarted new chat session")
                continue
            chat.add_message("user", user_input)
            chat.stream_response()
        except KeyboardInterrupt:
            print("\n\nGoodbye!")
            break
        except EOFError:
            print("\nGoodbye!")
            break


if __name__ == "__main__":
    run_chat_interface()
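To try it out, save the script (e.g. as chat.py; the filename is arbitrary) and run python3 chat.py in a second terminal while the server is up. Because every user and assistant turn is appended to self.messages, the model receives the full conversation history on each request; type 'clear' to reset it.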
Base model: meta-llama/Llama-3.1-8B