Roberta2024 committed (verified)
Commit 68d4493 · Parent(s): 15a2416

Create app.py

Files changed (1)
  1. app.py +85 -0
app.py ADDED
@@ -0,0 +1,85 @@
+ import gradio as gr
+ from gradio import Textbox, Image
+ from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPProcessor, CLIPModel
+ from PIL import Image as PILImage
+
+ # Load the models and the tokenizer (OpenELM reuses the Llama-2 tokenizer)
+ openelm_270m = AutoModelForCausalLM.from_pretrained("apple/OpenELM-270M", trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")
+ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ def generate_text(prompt, num_tokens):
+     tokenized_prompt = tokenizer(prompt, return_tensors="pt")
+
+     output_ids = openelm_270m.generate(
+         tokenized_prompt["input_ids"],
+         attention_mask=tokenized_prompt["attention_mask"],
+         max_new_tokens=int(num_tokens),
+         pad_token_id=0,
+     )
+
+     output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     return output_text
+
+ def process_image(image):
+     if isinstance(image, str):
+         image = PILImage.open(image).convert("RGB")
+     inputs = clip_processor(images=image, return_tensors="pt")
+     image_features = clip_model.get_image_features(**inputs)
+     return image_features
+
+ def generate_multimodal(text_prompt, image, num_tokens):
+     # Encode the image with CLIP
+     image_features = process_image(image)
+
+     # OpenELM is text-only, so the CLIP features are not fed to the model yet; the prompt only carries an [IMAGE] placeholder (see the fusion sketch below)
+     combined_input = f"{text_prompt} [IMAGE]"
+
+     # Generate text based on the combined input
+     output = generate_text(combined_input, num_tokens)
+
+     return output
+
+ def greet(text_input, image_input, num_tokens):
+     if image_input is not None:
+         return generate_multimodal(text_input, image_input, num_tokens)
+     else:
+         return generate_text(text_input, num_tokens)
+
+ developer_info = """
+ This space was developed by Ahmadreza Anaami.
+ Feel free to call it through the API as well.
+ Models used: apple/OpenELM-270M, openai/clip-vit-base-patch32
+ """
+
+ iface = gr.Interface(
+     fn=greet,
+     inputs=[
+         Textbox(label="Enter Text Here:", type="text"),
+         Image(label="Upload Image (optional):", type="pil"),
+         Textbox(label="Number of generated tokens:", type="text"),
+     ],
+     outputs=[Textbox(label="Generated answer:")],
+     title="OpenELM-270M Multimodal",
+     description=developer_info,
+     css="""
+     #dev-info {
+         font-size: 0.8rem;
+         color: #888;
+         margin-top: 1rem;
+         text-align: center;
+     }
+     .gr-input text {
+         padding: 10px;
+         border-radius: 5px;
+         font-size: 1rem;
+     }
+     .gr-output.gr-slider label {
+         font-weight: bold;
+     }
+     """,
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
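The placeholder comment in `generate_multimodal` leaves the actual text-image fusion open. Below is a hedged, illustrative sketch of one standard approach (LLaVA-style early fusion): project the CLIP features into the LM's token-embedding space and prepend them to the text embeddings. The projection layer here is untrained, and it is an assumption that OpenELM's remote modeling code implements `get_input_embeddings()` and accepts `inputs_embeds` in `generate()`; treat this as plumbing only, not what the committed app does.

```python
# Hedged sketch only: untrained projection, and it assumes OpenELM's remote
# code supports inputs_embeds in generate(); not part of the committed app.
import torch

embed_layer = openelm_270m.get_input_embeddings()
proj = torch.nn.Linear(
    clip_model.config.projection_dim,  # 512 for clip-vit-base-patch32
    embed_layer.embedding_dim,         # would need training in practice
)

def generate_with_image(text_prompt, image, num_tokens):
    image_features = process_image(image)             # shape (1, 512)
    image_embeds = proj(image_features).unsqueeze(1)  # shape (1, 1, embed_dim)

    tokens = tokenizer(text_prompt, return_tensors="pt")
    text_embeds = embed_layer(tokens["input_ids"])    # (1, seq_len, embed_dim)

    # Prepend the projected image "token" to the text embeddings
    inputs_embeds = torch.cat([image_embeds, text_embeds], dim=1)
    output_ids = openelm_270m.generate(
        inputs_embeds=inputs_embeds,
        max_new_tokens=int(num_tokens),
        pad_token_id=0,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
```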
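Since the description invites API use, here is a minimal client-side sketch using `gradio_client`. The Space id below is hypothetical (replace it with the actual `user/space-name`), and `/predict` is the default endpoint name a single `gr.Interface` exposes:

```python
# Minimal client sketch; the Space id is hypothetical
from gradio_client import Client

client = Client("Roberta2024/OpenELM-270M-Multimodal")  # hypothetical Space id

# Text-only call: the optional image input is None, and the token count is a
# string because the app reads it from a Textbox
result = client.predict(
    "Once upon a time",   # text_input
    None,                 # image_input (optional; on recent gradio_client
                          # versions, wrap a local path with handle_file())
    "64",                 # num_tokens
    api_name="/predict",
)
print(result)
```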