File size: 10,595 Bytes
cc63034
 
 
 
 
 
 
 
 
 
ffa0700
 
cc63034
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffa0700
cc63034
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffa0700
cc63034
ffa0700
 
 
cc63034
 
 
 
 
 
 
 
ffa0700
cc63034
 
 
 
 
 
 
 
ffa0700
 
 
cc63034
ffa0700
 
 
 
 
 
 
 
 
 
cc63034
 
ffa0700
 
cc63034
 
 
 
 
6946f01
 
cc63034
 
 
 
 
 
 
 
 
 
 
 
 
9d46d12
 
cc63034
 
 
 
 
 
 
 
 
 
 
 
 
c13527d
 
ffa0700
cc63034
 
 
 
 
 
 
 
 
 
 
 
ffa0700
cc63034
 
 
ffa0700
cc63034
 
 
ffa0700
cc63034
 
 
 
ffa0700
cc63034
 
 
 
 
 
 
 
 
ffa0700
 
 
cc63034
 
 
 
 
 
 
 
 
ffa0700
cc63034
 
6946f01
cc63034
 
 
ffa0700
 
 
 
 
 
 
 
 
 
 
 
 
cc63034
ffa0700
cc63034
 
 
ffa0700
 
cc63034
 
 
ffa0700
 
cc63034
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import streamlit as st
import extra_streamlit_components as stx
import requests
from PIL import Image
from io import BytesIO
from llama_index.llms.palm import PaLM
from llama_index import ServiceContext, VectorStoreIndex, Document, StorageContext, load_index_from_storage
from llama_index.memory import ChatMemoryBuffer
import os
import datetime
from llama_index.llms import Cohere
from llama_index.query_engine import CitationQueryEngine

#imports for resnet
from transformers import AutoFeatureExtractor, ResNetForImageClassification
import torch
from io import BytesIO

# Page header: application title followed by a one-line description of what the app does.
st.title("AInimal Go!")
# Wide layout is currently disabled (kept here as a commented-out switch).
#st.set_page_config(layout="wide")
st.write("My Pokemon Go inspired 'AInimal Go!' app. You can upload an image or snap a picture of an animal and start chatting with it")

# Sidebar: author credits, links to other projects, and a legal disclaimer,
# each rendered as a markdown heading followed by a markdown body.
st.sidebar.markdown('## Created By')
st.sidebar.markdown("""
Harshad Suryawanshi 
- [Linkedin](https://www.linkedin.com/in/harshadsuryawanshi/)
- [Medium](https://harshadsuryawanshi.medium.com/)
""")


st.sidebar.markdown('## Other Projects')
st.sidebar.markdown("""
- [Building My Own GPT4-V with PaLM and Kosmos](https://lnkd.in/dawgKZBP)
- [AI Equity Research Analyst](https://ai-eqty-rsrch-anlyst.streamlit.app/)
- [Recasting "The Office" Scene](https://blackmirroroffice.streamlit.app/)
- [Story Generator](https://appstorycombined-agaf9j4ceit.streamlit.app/)
""")

st.sidebar.markdown('## Disclaimer')
st.sidebar.markdown("""
This application, titled 'AInimal Go!', is a conceptual prototype designed to demonstrate the innovative use of Large Language Models (LLMs) in enabling interactive conversations with animals through images. While the concept is vaguely inspired by the interactive and augmented reality elements popularized by games like Pokemon Go, it does not use any assets, characters, or intellectual property from the Pokemon franchise. The interactions and conversations generated by this application are entirely fictional and created for entertainment and educational purposes. They should not be regarded as factual or accurate representations of animal behavior or communication. The author and the application do not hold any affiliation with the Pokemon brand or its creators, and no endorsement from them is implied. Users are encouraged to use this application responsibly and with an understanding of its purely illustrative nature.
""")

# Initialize the cookie manager; it is used further down to read and update
# the 'message_count' cookie.
cookie_manager = stx.CookieManager()

#Function to init resnet
@st.cache_resource(show_spinner="Initializing ResNet model for image classification. Please wait...")
def load_model_and_labels():
    """Load the ResNet-18 classifier, its feature extractor and the animal label map.

    The label map is read from ``imagenet_animal_labels_subset.txt``, where each
    entry is expected to look like ``<class id>: '<label name>'``.

    Returns:
        A ``(feature_extractor, model, animal_labels_dict)`` tuple, where
        ``animal_labels_dict`` maps the integer class id to the label name.

    Raises:
        ValueError: if the text before the first ``:`` on a line is not an integer.
        OSError: if the label file cannot be opened.
    """
    # Load animal labels as a dictionary
    animal_labels_dict = {}
    with open('imagenet_animal_labels_subset.txt', 'r', encoding='utf-8') as file:
        for line in file:
            # Split on the FIRST colon only, so a label that itself contains
            # ':' is kept whole instead of being truncated.
            class_id_text, separator, label_text = line.strip().partition(':')
            if not separator:
                # Blank line (or a line without an "id: label" pair): skip it
                # rather than crashing on int('').
                continue
            class_id = int(class_id_text.strip())
            animal_labels_dict[class_id] = label_text.strip().strip("'")

    # Initialize feature extractor and model
    feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-18")
    model = ResNetForImageClassification.from_pretrained("microsoft/resnet-18")

    return feature_extractor, model, animal_labels_dict

# Module-level handles to the classifier objects. `feature_extractor` and
# `model` are read as globals by get_image_caption(); `animal_labels_dict`
# is not referenced anywhere else in the visible part of this file.
feature_extractor, model, animal_labels_dict = load_model_and_labels()

# Function to predict image label
@st.cache_data
def get_image_caption(image_data):
    """Classify an image with the ResNet model and return its top-1 label.

    Args:
        image_data: a file-like object (the caller passes a ``BytesIO``) holding
            the encoded image bytes.

    Returns:
        A ``(predicted_label_name, predicted_label_id)`` tuple: the label string
        from ``model.config.id2label`` and the integer class index.

    Side effects:
        Writes the predicted label name to the Streamlit page.
    """
    # The uploader accepts PNG files, which are frequently RGBA, palette or
    # grayscale. The ResNet preprocessing works on three colour channels, so
    # normalise every input to RGB before extracting features.
    image = Image.open(image_data).convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_label_id = logits.argmax(-1).item()
    predicted_label_name = model.config.id2label[predicted_label_id]
    st.write(predicted_label_name)
    # Return the predicted animal name
    return predicted_label_name, predicted_label_id


@st.cache_resource(show_spinner="Initializing LLM and setting up service context. Please wait...")
def init_llm(api_key):
    """Create the LLM, service context, persisted index and chat memory.

    Args:
        api_key: accepted for interface compatibility; the body does not read
            it. The Cohere token is taken from ``st.secrets['COHERE_API_TOKEN']``.

    Returns:
        A ``(llm, service_context, storage_context, index, chatmemory)`` tuple.
        The index is loaded from the ``storage`` directory under id ``index``.
    """
    # Cohere "command" model (an earlier PaLM-based setup is no longer used).
    cohere_llm = Cohere(model="command", api_key=st.secrets['COHERE_API_TOKEN'])

    # Service context pairs the LLM with a locally run embedding model.
    context = ServiceContext.from_defaults(llm=cohere_llm, embed_model="local")

    # Re-open the index that was persisted on disk.
    storage = StorageContext.from_defaults(persist_dir="storage")
    loaded_index = load_index_from_storage(
        storage,
        index_id="index",
        service_context=context,
    )

    memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

    return (cohere_llm, context, storage, loaded_index, memory)

# init_llm() does not read its api_key argument (the Cohere token comes from
# st.secrets), so a missing GOOGLE_API_KEY environment variable must not abort
# startup with a KeyError; pass None when it is unset.
llm, service_context, storage_context, index, chatmemory = init_llm(os.environ.get("GOOGLE_API_KEY"))

def is_animal(predicted_label_id):
    """Return True when an ImageNet-1k class index denotes an animal.

    In the ImageNet-1k label ordering the animal classes occupy indices
    0 through 397; index 398 is "abacus", the first non-animal class.

    Args:
        predicted_label_id: integer class index predicted by the classifier.

    Returns:
        ``True`` if the index lies in the animal range, ``False`` otherwise.
    """
    # Upper bound is 397 (inclusive): 398 would let "abacus" pass as an animal.
    return 0 <= predicted_label_id <= 397

# Function to create the chat engine.
@st.cache_resource
def create_chat_engine(img_desc, api_key):
    """Build a citation query engine over the module-level ``index``.

    Args:
        img_desc: label predicted for the current image. The body does not
            read it; it only distinguishes calls made for different images.
        api_key: accepted for interface compatibility; not read by the body.

    Returns:
        A ``CitationQueryEngine`` that retrieves the 3 most similar chunks
        and cites sources in chunks of 512.
    """
    return CitationQueryEngine.from_args(
        index,
        similarity_top_k=3,
        # here we can control how granular citation sources are, the default is 512
        citation_chunk_size=512,
        verbose=True,
    )
    
    
# Clear chat function
def clear_chat():
    """Remove the chat history and any stored image data from the session state.

    Keys that are not present are left alone, so calling this on a fresh
    session is a no-op.
    """
    for state_key in ("messages", "image_data"):
        if state_key in st.session_state:
            del st.session_state[state_key]

# Callback function to clear the chat when a new image is uploaded
def on_image_upload():
    """Reset the conversation.

    Registered as the ``on_change`` callback of both the file uploader and
    the camera input, so a new image starts with an empty chat.
    """
    clear_chat()        

# Retrieve the message count from cookies
message_count = cookie_manager.get(cookie='message_count')
# The cookie lives in the user's browser, so it may be missing (None) or hold
# a value that is not a number. Start from zero in both cases instead of
# letting int() crash the whole page.
try:
    message_count = int(message_count)
except (TypeError, ValueError):
    message_count = 0

# Main page flow: image input, classification, chat history and chat input.
#
# If the message limit has been reached, disable the inputs.
# NOTE: the limit check is currently switched off -- `if 0:` keeps the first
# branch dead, so the `else` branch below always runs.
if 0:
    st.error("Notice: The maximum message limit for this demo version has been reached.")
    # Disabling the uploader and input by not displaying them
    image_uploader_placeholder = st.empty()  # Placeholder for the uploader
    chat_input_placeholder = st.empty()      # Placeholder for the chat input
    st.stop()
else:
    # Add a clear chat button
    if st.button("Clear Chat"):
        clear_chat()

    # Image upload section. Both widgets reset the chat when their value changes.
    image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"], key="uploaded_image", on_change=on_image_upload)

    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:  # Camera input will be in the middle column
        camera_image = st.camera_input("Take a picture", on_change=on_image_upload)

    # Determine the source of the image (upload takes precedence over camera)
    if image_file is not None:
        image_data = BytesIO(image_file.getvalue())
    elif camera_image is not None:
        image_data = BytesIO(camera_image.getvalue())
    else:
        image_data = None

    if image_data is not None:
        # Remember the image so assistant messages can use it as their avatar,
        # then display it at a standard width.
        st.session_state['assistant_avatar'] = image_data
        st.image(image_data, caption='Uploaded Image.', width=200)

        # Classify the image: label name and class index.
        img_desc, label_id = get_image_caption(image_data)

        if not is_animal(label_id):
            st.error("Please upload image of an animal!")
            st.stop()

        # Initialize the chat engine with the image description.
        # create_chat_engine() does not read its api_key argument, so a missing
        # GOOGLE_API_KEY must not raise KeyError here.
        chat_engine = create_chat_engine(img_desc, os.environ.get("GOOGLE_API_KEY"))
        st.write("Image Uploaded Successfully. Ask me anything about it.")

    # Initialize session state for messages if it doesn't exist
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Display previous messages. `.get` avoids a KeyError if no avatar has been
    # stored in this session yet.
    for message in st.session_state.messages:
        avatar = st.session_state.get('assistant_avatar') if message["role"] == "assistant" else None
        with st.chat_message(message["role"], avatar=avatar):
            st.write(message["content"])

    # Handle new user input
    user_input = st.chat_input("Ask me about the image:", key="chat_input")
    if user_input:
        # Append user message to the session state
        st.session_state.messages.append({"role": "user", "content": user_input})

        # Display user message immediately
        with st.chat_message("user"):
            st.write(user_input)

        # Call the chat engine to get the response if an image has been uploaded
        if image_data is not None:
            try:
                with st.spinner('Waiting for the chat engine to respond...'):
                    # The role-play instructions are prepended to every query.
                    system_prompt=f"""
                    You are a chatbot, able to have normal interactions. Do not make up information.
                    You always answer in great detail and are polite. Your job is to roleplay as an {img_desc}. 
                    Remember to make {img_desc} sounds while talking but dont overdo it.
                    """

                    response = chat_engine.query(f"{system_prompt}. {user_input}")

                # Append assistant message to the session state
                st.session_state.messages.append({"role": "assistant", "content": response.response})

                # Display the assistant message with the same avatar that is
                # used when the history is replayed on the next rerun.
                with st.chat_message("assistant", avatar=st.session_state.get('assistant_avatar')):
                    st.write(response.response)
                    # Show the cited source chunks, when the engine returned any.
                    sources = getattr(response, "source_nodes", None) or []
                    if sources:
                        with st.expander("Sources"):
                            for source in sources:
                                st.write(source.node.get_text())

            except Exception as e:
                # Top-level UI boundary: keep the page alive and show a generic
                # message, but record the cause in the server log.
                print(f"Chat engine request failed: {e!r}")
                st.error('An error occurred.')

            # Increment the message count and update the cookie
            message_count += 1
            cookie_manager.set('message_count', str(message_count), expires_at=datetime.datetime.now() + datetime.timedelta(days=30))
        else:
            # Without an image there is no chat engine to answer the question.
            st.warning("Please upload or capture an image of an animal first.")