khaledsayed1 committed
Commit 6e5a8e8 · verified · 1 Parent(s): 2881832

Upload 2 files

Files changed (2):
  1. handler.py +92 -49
  2. requirements.txt +8 -5
handler.py CHANGED
@@ -1,49 +1,92 @@
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- # Load the model and tokenizer from Hugging Face (without quantization)
- model_name = "khaledsayed1/llama_QA"  # Replace with your actual model name
- model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")  # Load model on GPU
- tokenizer = AutoTokenizer.from_pretrained(model_name)
-
- def predict(input_data):
-     """
-     Process the input data and generate an answer from the model.
-     Args:
-         input_data (dict): The input question.
-     Returns:
-         dict: The model's generated answer.
-     """
-     # Extract the question from input_data
-     question = input_data.get('question', '')
-
-     if not question:
-         return {"error": "No question provided."}
-
-     # Define the prompt with the user's question
-     alpaca_prompt = f"""
-     السؤال: {question}
-     الإجابة:
-     """
-     formatted_prompt = alpaca_prompt.strip()
-
-     # Tokenize the input and move it to GPU
-     inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
-
-     # Generate the output using the model
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=128,
-         temperature=0.7,
-         top_k=50,
-         top_p=0.95,
-         use_cache=True
-     )
-
-     # Decode the output
-     decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
-     # Clean up the output and remove the question itself
-     clean_output = decoded_output[0].replace("السؤال:", "").replace("الإجابة:", "").strip()
-
-     return {"answer": clean_output}
+ import torch
+ import os
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ class ModelHandler:
+     def __init__(self):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.model = None
+         self.tokenizer = None
+         self.initialized = False
+
+     def initialize(self):
+         """Initialize the model and tokenizer"""
+         if self.initialized:
+             return
+
+         try:
+             # Load model and tokenizer from the local path
+             model_path = os.path.dirname(os.path.abspath(__file__))
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 model_path,
+                 device_map="auto",
+                 torch_dtype=torch.float16  # Use float16 for T4 GPU optimization
+             )
+             self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+             self.initialized = True
+         except Exception as e:
+             raise RuntimeError(f"Error initializing model: {str(e)}")
+
+     def predict(self, input_data):
+         """
+         Process the input data and generate an answer from the model.
+         Args:
+             input_data (dict): The input question.
+         Returns:
+             dict: The model's generated answer.
+         """
+         if not self.initialized:
+             self.initialize()
+
+         try:
+             # Extract the question from input_data
+             question = input_data.get('question', '')
+             if not question:
+                 return {"error": "No question provided."}
+
+             # Define the prompt with the user's question
+             alpaca_prompt = f"""
+             السؤال: {question}
+             الإجابة:
+             """
+             formatted_prompt = alpaca_prompt.strip()
+
+             # Tokenize the input
+             inputs = self.tokenizer([formatted_prompt], return_tensors="pt")
+             inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+             # Generate with proper error handling and memory management
+             with torch.no_grad():
+                 outputs = self.model.generate(
+                     **inputs,
+                     max_new_tokens=128,
+                     temperature=0.7,
+                     top_k=50,
+                     top_p=0.95,
+                     use_cache=True,
+                     pad_token_id=self.tokenizer.eos_token_id
+                 )
+
+             # Decode the output
+             decoded_output = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+             # Clean up the output
+             clean_output = decoded_output[0].replace("السؤال:", "").replace("الإجابة:", "").strip()
+
+             # Clear CUDA cache if using GPU
+             if self.device == "cuda":
+                 torch.cuda.empty_cache()
+
+             return {"answer": clean_output}
+
+         except Exception as e:
+             return {"error": f"Prediction error: {str(e)}"}
+
+ # Create a global handler instance
+ handler = ModelHandler()
+
+ def predict(input_data):
+     """
+     Wrapper function for the handler's predict method
+     """
+     return handler.predict(input_data)
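
A minimal local smoke test of the new module-level predict() wrapper might look like the sketch below. It is not part of this commit: it assumes handler.py is importable from the working directory and that the model weights sit alongside it, and the example question is hypothetical.

# smoke_test.py (hypothetical, not part of this commit)
from handler import predict

if __name__ == "__main__":
    # The handler lazily loads the model and tokenizer on the first call.
    result = predict({"question": "ما هي عاصمة مصر؟"})  # "What is the capital of Egypt?"
    print(result.get("answer") or result.get("error"))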
requirements.txt CHANGED
@@ -1,5 +1,8 @@
- transformers==4.22.0
- torch==1.12.1
- bitsandbytes
- fastapi
- uvicorn
+ transformers>=4.36.0
+ torch>=2.1.0
+ accelerate>=0.25.0
+ bitsandbytes>=0.41.0
+ safetensors>=0.4.0
+ fastapi>=0.105.0
+ uvicorn>=0.24.0
+ numpy>=1.24.0
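
Given the fastapi and uvicorn entries above, one plausible way to serve the handler is a small FastAPI app like the sketch below. The file name server.py, the /predict route, and the request shape are assumptions, not something defined by this commit.

# server.py (hypothetical serving wrapper around handler.py)
from fastapi import FastAPI
from handler import predict

app = FastAPI()

@app.post("/predict")
def predict_endpoint(payload: dict):
    # Expected payload shape: {"question": "..."}
    return predict(payload)

# Run with: uvicorn server:app --host 0.0.0.0 --port 8000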