Amir230703 committed on
Commit 2715ee7 · verified · 1 Parent(s): 377e612

Update app.py

Files changed (1)
  1. app.py +50 -23
app.py CHANGED
@@ -2,37 +2,64 @@ import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
-# Load the model and tokenizer
+# Load model and tokenizer with optimizations
 model_name = "Amir230703/phi3-medmcqa-finetuned"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,
+    device_map="auto",
+    attn_implementation="flash_attention_2"  # Faster attention
+).eval()
+
+# Use faster kernels if available
+if torch.cuda.is_available():
+    model = torch.compile(model)
 
 def generate_answer(question):
-    # Tokenize the input question
-    input_ids = tokenizer(question, return_tensors="pt").input_ids.to(model.device)
+    # Create structured prompt
+    prompt = f"""Instruction: Answer the following medical question concisely.
+Question: {question}
+Answer:"""
+
+    # Tokenize with optimized settings
+    inputs = tokenizer(
+        prompt,
+        return_tensors="pt",
+        max_length=512,
+        truncation=True,
+        padding=True
+    ).to(model.device)
 
-    # Generate the answer
-    output = model.generate(
-        input_ids,
-        max_length=100,  # Reduced max_length for faster response
-        temperature=0.7,
-        top_p=0.9,
-        do_sample=True,
-        num_return_sequences=1  # Only return one answer
-    )
+    # Generate with optimized parameters
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=150,  # Reduced from 200
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            repetition_penalty=1.1,  # Prevent repetition
+            num_return_sequences=1
+        )
 
-    # Decode the output
-    answer = tokenizer.decode(output[0], skip_special_tokens=True)
-    return answer
+    # Decode and clean output
+    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return answer.split("Answer:")[-1].strip()
 
-# Gradio Interface
+# Gradio interface with queueing
 demo = gr.Interface(
     fn=generate_answer,
-    inputs=gr.Textbox(placeholder="Enter a medical question here..."),
-    outputs=gr.Textbox(),
-    title="Medical QA Model",
-    description="Enter a medical question, and the AI will provide an answer."
+    inputs=gr.Textbox(placeholder="Enter your medical question...", lines=3),
+    outputs=gr.Textbox(label="Answer"),
+    title="Medical QA Assistant",
+    description="AI-powered medical question answering. Please be specific in your queries.",
+    allow_flagging="never"
 )
 
-# Launch the Gradio app
-demo.launch()
+# Launch with performance settings
+demo.launch(
+    server_name="0.0.0.0" if torch.cuda.is_available() else None,
+    share=False,
+    max_threads=2
+)
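
Note: the new loader hard-codes attn_implementation="flash_attention_2", which requires the flash-attn package and a CUDA GPU, so model loading will fail unless both are available on the Space. Below is a minimal sketch (not part of this commit) of one way to guard those options; it assumes the same model id and only requests FlashAttention-2 when it is actually usable.

import importlib.util

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Amir230703/phi3-medmcqa-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_name)

use_cuda = torch.cuda.is_available()
has_flash_attn = importlib.util.find_spec("flash_attn") is not None

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # fp16 is only practical on GPU; fall back to fp32 on CPU
    torch_dtype=torch.float16 if use_cuda else torch.float32,
    device_map="auto" if use_cuda else None,
    # Request FlashAttention-2 only when both a GPU and the flash-attn
    # package are present; otherwise use the default eager attention.
    attn_implementation="flash_attention_2" if (use_cuda and has_flash_attn) else "eager",
).eval()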