matt-bcny committed on
Commit
e8771ff
·
verified ·
1 Parent(s): 1b8d8e5

Create handler.py

Files changed (1)
  1. handler.py +460 -0
handler.py ADDED
@@ -0,0 +1,460 @@
from typing import Dict, List, Any, Optional, Union
import os
import json
import time
import torch
from threading import Thread
import logging
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    StoppingCriteriaList,
    StoppingCriteria,
    BitsAndBytesConfig
)
from peft import PeftModel

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("lora_inference.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

class ImprovedJSONStoppingCriteria(StoppingCriteria):
    """
    Stopping criteria that ensures JSON is complete before stopping.
    Only stops generation when a valid, complete JSON object is detected.
    """
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.generated = ""
        self.json_complete = False

    def __call__(self, input_ids, scores, **kwargs):
        # If we already found complete JSON, stop immediately
        if self.json_complete:
            return True

        # Decode current text
        text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)

        # Skip early if no JSON structure detected
        if '{' not in text:
            return False

        # Don't stop if we don't have at least one closing brace
        if '}' not in text:
            return False

        # Check for complete JSON structure
        try:
            # First, try to find a valid JSON object
            start_pos = text.find('{')

            # Progressively validate from the first opening brace
            stack = []
            end_pos = -1

            for i, char in enumerate(text[start_pos:], start_pos):
                if char == '{':
                    stack.append('{')
                elif char == '}':
                    if stack:
                        stack.pop()
                    if not stack:  # We have balanced braces
                        end_pos = i
                        potential_json = text[start_pos:end_pos + 1]

                        # Make sure this is actually valid JSON
                        # and not just balanced braces
                        try:
                            # Parse JSON to validate
                            parsed = json.loads(potential_json)

                            # We need to make sure we have all required fields
                            # For search_web or tool calls, verify arguments are complete
                            if "calls" in parsed:
                                for call in parsed.get("calls", []):
                                    # If we have a call with arguments, make sure they're complete
                                    if "arguments" in call:
                                        args = call.get("arguments", "")

                                        # If arguments is a string, it might be JSON itself
                                        if isinstance(args, str) and args.startswith("{"):
                                            # If the argument string starts with { but doesn't have a
                                            # closing }, it's incomplete
                                            if not args.endswith("}"):
                                                return False

                                            # Try to parse the arguments as JSON
                                            try:
                                                json.loads(args)
                                            except Exception:
                                                # If we can't parse, the JSON is incomplete
                                                return False

                            # All checks passed - we have valid, complete JSON
                            self.json_complete = True
                            return True
                        except Exception:
                            # Not valid JSON, continue looking
                            continue

            # Only stop with excessive braces if we already have a valid structure
            open_count = text.count('{')
            close_count = text.count('}')
            if close_count > open_count:
                # Check if we have a valid JSON by balancing
                fixed_text = text[start_pos:]
                stack = []
                for i, char in enumerate(fixed_text):
                    if char == '{':
                        stack.append('{')
                    elif char == '}':
                        if stack:
                            stack.pop()
                        if not stack:
                            try:
                                potential_json = fixed_text[:i + 1]
                                parsed = json.loads(potential_json)
                                self.json_complete = True
                                return True
                            except Exception:
                                pass
        except Exception:
            # Error in parsing or validation, don't stop
            pass

        return False

class ExcessBraceStoppingCriteria(StoppingCriteria):
    """Stop generation if we're generating excessive closing braces."""
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs):
        text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)

        # Only trigger if we have JSON content
        if '{' in text and '}' in text:
            # Check if we're generating excessive closing braces
            open_count = text.count('{')
            close_count = text.count('}')

            # If we have more closing than opening braces, stop generation
            if close_count > open_count + 3:  # Allow a small buffer
                return True

        return False

def fix_json_output(text):
    """Fix malformed JSON with excessive closing braces."""
    if '{' not in text or '}' not in text:
        return text

    # Count opening and closing braces
    open_count = text.count('{')
    close_count = text.count('}')

    # If balanced or too few closing braces, return as-is
    if open_count >= close_count:
        return text

    # Track JSON depth to find the valid JSON object
    start_pos = text.find('{')
    depth = 0
    for i, char in enumerate(text[start_pos:], start_pos):
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # Found balanced JSON, return up to this point
                return text[:i + 1]

    # If we can't balance it with depth tracking, simply truncate
    return text[:start_pos + text[start_pos:].find('}') + 1]
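
# Example of the truncation behaviour above (illustrative):
#   fix_json_output('{"a": 1}}}') -> '{"a": 1}'  (extra closing braces are dropped)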

def create_stopping_criteria(tokenizer, stop_tokens):
    """Create stopping criteria from tokens."""
    stop_token_ids = []
    for stop_token in stop_tokens:
        token_ids = tokenizer.encode(stop_token, add_special_tokens=False)
        if len(token_ids) > 0:
            stop_token_ids.append(token_ids[-1])

    return StoppingCriteriaList([StopOnTokens(tokenizer, stop_token_ids)])

class StopOnTokens(StoppingCriteria):
    """Custom stopping criteria for text generation."""
    def __init__(self, tokenizer, stop_token_ids):
        self.tokenizer = tokenizer
        self.stop_token_ids = stop_token_ids

    def __call__(self, input_ids, scores, **kwargs):
        for stop_id in self.stop_token_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False

class EndpointHandler:
    def __init__(self, path=""):
        """
        Initialize the handler by loading the model and tokenizer.

        Args:
            path (str): Path to the model directory (uses environment variable if not provided)
        """
        # Get model path from environment or from argument
        model_path = path if path else os.environ.get("MODEL_PATH", "")
        adapter_path = os.environ.get("ADAPTER_PATH", None)
        logger.info(f"Loading model from {model_path}")

        # Determine quantization settings from environment
        use_8bit = os.environ.get("USE_8BIT", "False").lower() == "true"
        use_4bit = os.environ.get("USE_4BIT", "False").lower() == "true"
        device = os.environ.get("DEVICE", "auto")

        # Load tokenizer
        logger.info(f"Loading tokenizer from {model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Load model with appropriate configuration
        if use_4bit:
            logger.info("Using 4-bit quantization for inference...")
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4"
            )
            base_model = AutoModelForCausalLM.from_pretrained(
                model_path,
                quantization_config=quantization_config,
                device_map=device,
                low_cpu_mem_usage=True
            )
        elif use_8bit:
            logger.info("Using 8-bit quantization for inference...")
            base_model = AutoModelForCausalLM.from_pretrained(
                model_path,
                load_in_8bit=True,
                device_map=device,
                low_cpu_mem_usage=True
            )
        else:
            logger.info("Loading model in float16 precision...")
            base_model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float16,
                device_map=device,
                low_cpu_mem_usage=True
            )

        # Apply adapter if specified
        if adapter_path:
            logger.info(f"Loading LoRA adapter from {adapter_path}")
            self.model = PeftModel.from_pretrained(base_model, adapter_path)
        else:
            self.model = base_model
            logger.info("No adapter path provided, using base model only")

        self.model.eval()

        # Try to use torch.compile for additional performance if available
        if torch.__version__ >= "2.0.0" and os.environ.get("USE_COMPILE", "False").lower() == "true":
            try:
                logger.info("Applying torch.compile for additional optimization...")
                self.model = torch.compile(self.model)
                logger.info("Model successfully compiled!")
            except Exception as e:
                logger.warning(f"Could not compile model: {e}")

        logger.info("Model and tokenizer loaded successfully!")
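
    # Environment variables read by __init__ (summary of the code above):
    #   MODEL_PATH   - base model directory, used when no path argument is given
    #   ADAPTER_PATH - optional LoRA adapter directory
    #   USE_4BIT / USE_8BIT - "true" enables bitsandbytes quantization
    #   DEVICE       - value passed to device_map (defaults to "auto")
    #   USE_COMPILE  - "true" attempts torch.compile on PyTorch >= 2.0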

    def format_conversation(self, messages, add_generation_prompt=True):
        """Format a conversation using the tokenizer's chat template."""
        return self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=add_generation_prompt
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Process an inference request.

        Args:
            data (Dict[str, Any]): Request data containing inputs and parameters

        Returns:
            List[Dict[str, Any]]: List of response dictionaries
        """
        start_time = time.time()

        # Extract input data and parameters
        inputs = data.get("inputs", [])
        parameters = data.get("parameters", {})

        # Parse generation parameters with defaults
        max_new_tokens = parameters.get("max_new_tokens", 512)
        temperature = parameters.get("temperature", 0.7)
        top_p = parameters.get("top_p", 0.95)
        do_sample = parameters.get("do_sample", temperature > 0.1)
        stream = parameters.get("stream", False)
        json_mode = parameters.get("json_mode", False)
        system_prompt = parameters.get("system_prompt", None)

        # Check if input is in various formats and normalize to messages format
        if isinstance(inputs, str):
            # Create simple chat with user message
            messages = [{"role": "user", "content": inputs}]
        elif isinstance(inputs, dict) and "messages" in inputs:
            # Input is already in chat format
            messages = inputs["messages"]
        elif isinstance(inputs, list):
            # Assume this is a list of message dicts
            messages = inputs
        else:
            # Invalid input format
            return [{"error": "Invalid input format. Please provide a string, a list of messages, or a dict with 'messages' key."}]
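
        # Accepted "inputs" shapes, per the branches above:
        #   "some prompt"                                -> wrapped as a single user message
        #   {"messages": [{"role": "user", ...}]}        -> messages list used as-is
        #   [{"role": "user", "content": "..."}, ...]    -> treated as a message list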
        # Prepare conversation with system prompt if provided
        conversation = []
        if system_prompt:
            conversation.append({"role": "system", "content": system_prompt})
        conversation.extend(messages)

        # Format the conversation
        prompt = self.format_conversation(conversation)

        # Tokenize the prompt
        inputs_dict = self.tokenizer(prompt, return_tensors="pt")
        inputs_dict = {k: v.to(self.model.device) for k, v in inputs_dict.items()}

        # Configure generation parameters
        generation_config = {
            "max_new_tokens": max_new_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "do_sample": do_sample,
            "pad_token_id": self.tokenizer.pad_token_id,
        }

        # Add JSON-specific settings if needed
        if json_mode:
            stop_tokens = ["\n\n", "\n}", "}\n", "}}", "} }", "}\n]", "}\n{"]
            stopping_criteria = create_stopping_criteria(self.tokenizer, stop_tokens)
            generation_config["stopping_criteria"] = stopping_criteria

            # Lower temperature for JSON mode to get more reliable outputs
            # but don't set to 0 as that might cause truncation issues
            temperature = min(temperature, 0.1)
            do_sample = False
            generation_config["do_sample"] = do_sample
            generation_config["temperature"] = temperature

        # Record input length for proper decoding
        input_length = inputs_dict["input_ids"].shape[1]

        generated_text = ""
        if stream:
            # Use streaming for interactive responses
            streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
            generation_config["streamer"] = streamer

            # Start generation in a thread
            thread = Thread(target=self.model.generate, kwargs={**inputs_dict, **generation_config})
            thread.start()

            # Stream the output (for local testing)
            for text in streamer:
                generated_text += text

            # Apply JSON cleaning if needed and json_mode is enabled
            if json_mode and '{' in generated_text and '}' in generated_text:
                if generated_text.count('}') > generated_text.count('{'):
                    fixed_text = fix_json_output(generated_text)
                    if fixed_text != generated_text:
                        logger.info("Fixed malformed JSON in response")
                        generated_text = fixed_text
        else:
            # Non-streaming generation
            with torch.no_grad():
                outputs = self.model.generate(**inputs_dict, **generation_config)

            # Decode the output
            generated_ids = outputs[0][input_length:]
            generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

            # Apply JSON cleaning if needed and json_mode is enabled
            if json_mode and '{' in generated_text and '}' in generated_text:
                if generated_text.count('}') > generated_text.count('{'):
                    fixed_text = fix_json_output(generated_text)
                    if fixed_text != generated_text:
                        logger.info("Fixed malformed JSON in response")
                        generated_text = fixed_text

        # Calculate processing time
        end_time = time.time()
        processing_time = end_time - start_time

        # Create response dictionary
        response = {
            "generated_text": generated_text,
            "processing_time": processing_time
        }

        # Include token counts if requested
        if parameters.get("return_token_count", False):
            response["input_token_count"] = input_length
            # Count output tokens with the tokenizer so this is a true token count,
            # not a whitespace word count
            response["output_token_count"] = len(self.tokenizer.encode(generated_text, add_special_tokens=False))

        return [response]

# For local testing
if __name__ == "__main__":
    # Test the handler
    model_path = os.environ.get("MODEL_PATH", "./model")
    handler = EndpointHandler(model_path)

    # Test with a simple query
    test_data = {
        "inputs": "Explain the concept of machine learning in simple terms.",
        "parameters": {
            "max_new_tokens": 100,
            "temperature": 0.7,
            "system_prompt": "You are a helpful AI assistant."
        }
    }

    response = handler(test_data)
    print("\nTest Response:")
    print(json.dumps(response, indent=2))

    # Test with chat format and JSON mode
    test_chat_data = {
        "inputs": {
            "messages": [
                {"role": "user", "content": "Create a JSON object with information about the solar system. Include at least 3 planets with their name, diameter, and distance from the sun."}
            ]
        },
        "parameters": {
            "max_new_tokens": 512,
            "temperature": 0.1,
            "json_mode": True,
            "system_prompt": "You are a helpful AI assistant that responds in JSON format."
        }
    }

    chat_response = handler(test_chat_data)
    print("\nJSON Format Response:")
    print(json.dumps(chat_response, indent=2))
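
For reference, a minimal client-side sketch of how a deployed endpoint backed by this handler could be called. The endpoint URL and token below are placeholders (not values from this repository); the payload shape mirrors what EndpointHandler.__call__ accepts.

import requests

# Placeholder values - substitute your own Inference Endpoint URL and access token.
ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
HF_TOKEN = "hf_xxx"

payload = {
    "inputs": {
        "messages": [
            {"role": "user", "content": "Summarize the benefits of LoRA fine-tuning in two sentences."}
        ]
    },
    "parameters": {
        "max_new_tokens": 128,
        "temperature": 0.7,
        "system_prompt": "You are a helpful AI assistant.",
        "return_token_count": True
    }
}

resp = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"},
    json=payload,
    timeout=300,
)
print(resp.json())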