"""Visual question answering backed by the MiniCPM-V-2_6 checkpoint."""

import torch
from transformers import AutoModel, AutoTokenizer
import spaces

# Device the model weights are moved to.
device = "cuda"

# Hub identifier shared by the model and its tokenizer.
_MODEL_ID = "openbmb/MiniCPM-V-2_6"

# Instruction passed as the system prompt on every chat call.
_SYSTEM_PROMPT = (
    "You are an AI assistant specialized in visual content analysis. "
    "Given an image and a related question, analyze the image thoroughly "
    "and provide a precise and informative answer based on the visible "
    "content. Ensure your response is clear, accurate, and directly "
    "addresses the question."
)

# Load the weights in bfloat16 with SDPA attention, then move them to the
# target device. trust_remote_code is needed because the checkpoint ships
# its own modelling code.
model = AutoModel.from_pretrained(
    _MODEL_ID,
    trust_remote_code=True,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
).to(device=device)

tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID, trust_remote_code=True)

# Inference only: switch off training-time behaviour such as dropout.
model.eval()


@spaces.GPU(duration=120)
def answer_question(image, question):
    """Answer a question about an image using the loaded chat model.

    Args:
        image: The image to analyse. It is forwarded unchanged as the first
            element of the user message content.
            NOTE(review): an earlier docstring described this as a path
            string; confirm what the caller actually passes.
        question: The question text, sent alongside the image.

    Returns:
        str: The full answer, built by concatenating every piece produced
        by the streaming chat call.
    """
    # Single-turn conversation: one user message carrying image + question.
    conversation = [{"role": "user", "content": [image, question]}]

    # The image travels inside the message content, so the dedicated
    # `image` argument is left as None. With stream=True the result is
    # consumed as an iterable of text pieces.
    pieces = model.chat(
        image=None,
        msgs=conversation,
        tokenizer=tokenizer,
        sampling=True,
        temperature=0.7,
        stream=True,
        system_prompt=_SYSTEM_PROMPT,
    )

    # Collapse the streamed pieces into a single string for the caller.
    return "".join(pieces)