khaledsayed1 committed
Commit 6e5a8e8 · verified · 1 Parent(s): 2881832

Upload 2 files

Files changed (2):
  1. handler.py +92 -49
  2. requirements.txt +8 -5
handler.py CHANGED
@@ -1,49 +1,92 @@
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- # Load the model and tokenizer from Hugging Face (without quantization)
- model_name = "khaledsayed1/llama_QA"  # Replace with your actual model name
- model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")  # Load model on GPU
- tokenizer = AutoTokenizer.from_pretrained(model_name)
-
- def predict(input_data):
-     """
-     Process the input data and generate an answer from the model.
-     Args:
-         input_data (dict): The input question.
-     Returns:
-         dict: The model's generated answer.
-     """
-     # Extract the question from input_data
-     question = input_data.get('question', '')
-
-     if not question:
-         return {"error": "No question provided."}
-
-     # Define the prompt with the user's question
-     alpaca_prompt = f"""
-     السؤال: {question}
-     الإجابة:
-     """
-     formatted_prompt = alpaca_prompt.strip()
-
-     # Tokenize the input and move it to GPU
-     inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
-
-     # Generate the output using the model
-     outputs = model.generate(
-         **inputs,
-         max_new_tokens=128,
-         temperature=0.7,
-         top_k=50,
-         top_p=0.95,
-         use_cache=True
-     )
-
-     # Decode the output
-     decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
-     # Clean up the output and remove the question itself
-     clean_output = decoded_output[0].replace("السؤال:", "").replace("الإجابة:", "").strip()
-
-     return {"answer": clean_output}
+ import torch
+ import os
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ class ModelHandler:
+     def __init__(self):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.model = None
+         self.tokenizer = None
+         self.initialized = False
+
+     def initialize(self):
+         """Initialize the model and tokenizer"""
+         if self.initialized:
+             return
+
+         try:
+             # Load model and tokenizer from the local path
+             model_path = os.path.dirname(os.path.abspath(__file__))
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 model_path,
+                 device_map="auto",
+                 torch_dtype=torch.float16  # Use float16 for T4 GPU optimization
+             )
+             self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+             self.initialized = True
+         except Exception as e:
+             raise RuntimeError(f"Error initializing model: {str(e)}")
+
+     def predict(self, input_data):
+         """
+         Process the input data and generate an answer from the model.
+         Args:
+             input_data (dict): The input question.
+         Returns:
+             dict: The model's generated answer.
+         """
+         if not self.initialized:
+             self.initialize()
+
+         try:
+             # Extract the question from input_data
+             question = input_data.get('question', '')
+             if not question:
+                 return {"error": "No question provided."}
+
+             # Define the prompt with the user's question
+             alpaca_prompt = f"""
+             السؤال: {question}
+             الإجابة:
+             """
+             formatted_prompt = alpaca_prompt.strip()
+
+             # Tokenize the input
+             inputs = self.tokenizer([formatted_prompt], return_tensors="pt")
+             inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+             # Generate with proper error handling and memory management
+             with torch.no_grad():
+                 outputs = self.model.generate(
+                     **inputs,
+                     max_new_tokens=128,
+                     temperature=0.7,
+                     top_k=50,
+                     top_p=0.95,
+                     use_cache=True,
+                     pad_token_id=self.tokenizer.eos_token_id
+                 )
+
+             # Decode the output
+             decoded_output = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+
+             # Clean up the output
+             clean_output = decoded_output[0].replace("السؤال:", "").replace("الإجابة:", "").strip()
+
+             # Clear CUDA cache if using GPU
+             if self.device == "cuda":
+                 torch.cuda.empty_cache()
+
+             return {"answer": clean_output}
+
+         except Exception as e:
+             return {"error": f"Prediction error: {str(e)}"}
+
+ # Create a global handler instance
+ handler = ModelHandler()
+
+ def predict(input_data):
+     """
+     Wrapper function for the handler's predict method
+     """
+     return handler.predict(input_data)
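
A minimal local smoke test of the new module-level predict() wrapper might look like the sketch below. It is not part of this commit: it assumes handler.py is importable from the working directory and that the model weights sit alongside it, and the example question is hypothetical.

# smoke_test.py (hypothetical, not part of this commit)
from handler import predict

if __name__ == "__main__":
    # The handler lazily loads the model and tokenizer on the first call.
    result = predict({"question": "ما هي عاصمة مصر؟"})  # "What is the capital of Egypt?"
    print(result.get("answer") or result.get("error"))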
requirements.txt CHANGED
@@ -1,5 +1,8 @@
- transformers==4.22.0
- torch==1.12.1
- bitsandbytes
- fastapi
- uvicorn
+ transformers>=4.36.0
+ torch>=2.1.0
+ accelerate>=0.25.0
+ bitsandbytes>=0.41.0
+ safetensors>=0.4.0
+ fastapi>=0.105.0
+ uvicorn>=0.24.0
+ numpy>=1.24.0
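
Given the fastapi and uvicorn entries above, one plausible way to serve the handler is a small FastAPI app like the sketch below. The file name server.py, the /predict route, and the request shape are assumptions, not something defined by this commit.

# server.py (hypothetical serving wrapper around handler.py)
from fastapi import FastAPI
from handler import predict

app = FastAPI()

@app.post("/predict")
def predict_endpoint(payload: dict):
    # Expected payload shape: {"question": "..."}
    return predict(payload)

# Run with: uvicorn server:app --host 0.0.0.0 --port 8000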