toilaluan committed
Commit 25bd6f8 · 1 Parent(s): 83f4a0c
Files changed (3)
  1. app.py +70 -4
  2. requirements.txt +9 -0
  3. scorer.py +405 -0
app.py CHANGED
@@ -1,7 +1,73 @@
 import gradio as gr
+import io
+import spaces
+import matplotlib.pyplot as plt
+import networkx as nx
+from PIL import Image
+
+from scorer import DSGPromptProcessor
 
-def greet(name):
-    return "Hello " + name + "!!"
 
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+def draw_colored_graph(dependencies, questions, answers):
+    # Create a directed graph
+    G = nx.DiGraph()
+
+    # Add nodes with labels, colored by whether the question was answered "yes"
+    for node, question in questions.items():
+        color = 'green' if answers[node] else 'red'
+        G.add_node(int(node), label=question, color=color)
+
+    # Add edges based on dependencies
+    for node, deps in dependencies.items():
+        for dep in deps:
+            G.add_edge(int(dep), int(node))
+
+    # Set node positions using a layout (shell_layout or circular_layout also work)
+    pos = nx.spring_layout(G)
+
+    # Draw nodes with custom colors and labels
+    node_colors = [G.nodes[node]['color'] for node in G.nodes()]
+    nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=2000, edgecolors='black')
+
+    # Draw edges with arrows
+    nx.draw_networkx_edges(G, pos, arrowstyle='-|>', arrows=True, arrowsize=20, connectionstyle='arc3,rad=0.1')
+
+    # Draw labels
+    labels = nx.get_node_attributes(G, 'label')
+    nx.draw_networkx_labels(G, pos, labels, font_size=10, font_color='black')
+
+    # Render the figure into a Pillow image
+    buf = io.BytesIO()
+    plt.axis('off')
+    plt.savefig(buf, format='png')
+    plt.close()
+    buf.seek(0)
+    img = Image.open(buf)
+    return img
+
+
+dsg_scorer = DSGPromptProcessor("mistralai/Mixtral-8x7B-Instruct-v0.1")
+
+
+def process_image(image, prompt):
+    tuples, _ = dsg_scorer.generate_tuples(prompt)
+    dependencies, _ = dsg_scorer.generate_dependencies(tuples)
+    questions, _ = dsg_scorer.generate_questions(prompt, tuples.tuples, dependencies)
+    # reward maps question id -> probability that the VQA model answers "yes"
+    reward = dsg_scorer.get_reward(prompt, questions, dependencies, [image])[0]
+    answers = {k: reward.get(k, 0) > 0.5 for k in questions}
+    graph_img = draw_colored_graph(dependencies, questions, answers)
+    return graph_img, f"Questions: {questions}\nReward per question: {reward}"
+
+
+# Define the Gradio interface
+interface = gr.Interface(
+    fn=process_image,
+    inputs=[gr.Image(type="pil"), gr.Textbox(label="Enter your prompt")],
+    outputs=[gr.Image(type="pil"), gr.Textbox(label="Output text")],
+    title="Image and Prompt Interface",
+    description="Upload an image and enter a prompt. The output is the question graph and the per-question rewards.",
+)
+
+# Launch the Gradio app
+interface.launch()
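Note: a toy sketch (values hypothetical) of the data shapes process_image wires together — questions and dependencies are keyed by string ids, as the JSON-mode LLM calls in scorer.py return them, and get_reward(...)[0] maps each question id to a "yes" probability:

    questions = {"1": "Is there a motorcycle?", "2": "Is the motorcycle blue?"}
    dependencies = {"1": [], "2": [1]}
    reward = {"1": 0.93, "2": 0.21}  # hypothetical output of get_reward(...)[0]
    answers = {k: reward.get(k, 0) > 0.5 for k in questions}  # {"1": True, "2": False}
    # draw_colored_graph(dependencies, questions, answers) renders node 1 green and node 2 red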
requirements.txt ADDED
@@ -0,0 +1,9 @@
+openai
+pydantic
+transformers
+torch
+pillow
+timm
+einops
+matplotlib  # imported by app.py
+networkx  # imported by app.py
scorer.py ADDED
@@ -0,0 +1,405 @@
+import openai
+import json
+from pydantic import BaseModel, Field
+from PIL import Image
+from tqdm import tqdm
+from transformers import AutoProcessor, AutoModelForCausalLM
+import torch
+import requests
+import spaces
+
+class PromptTuple(BaseModel):
+    class Tuple(BaseModel):
+        type: str = Field(
+            description="The type of the tuple. One of entity, attribute, relation",
+            example="attribute",
+        )
+        type_detail: str = Field(
+            description="""The detail of the type. For example:
+            - Entity: whole (entire entity, e.g., chair), part (part of entity, e.g., back of chair).
+            - Attribute: color (e.g., red book), type (e.g., aviator goggles), material (e.g., wooden chair), count (e.g., 5 geese), texture (e.g., rough surface), text rendering (e.g., letters “Macaroni”), shape (e.g., triangle block), size (e.g., large fence).
+            - Relation: spatial (e.g., A next to B); action (A kicks B).""",
+            example="color",
+        )
+        semantics: list = Field(
+            description="List of strings that explain the existence of type and type_detail in the tuple",
+            example=["motorcycle", "blue"],
+        )
+
+    tuples: list[Tuple] = Field(
+        description="List of tuples. Maximum 8 tuples.",
+        example=[
+            {
+                "type": "attribute",
+                "type_detail": "color",
+                "semantics": ["motorcycle", "blue"],
+            }
+        ],
+    )
+
+
+class DSGPromptProcessor:
+    def __init__(self, model_name="gpt-4o-mini"):
+        self.client = openai.OpenAI()
+        self.model_name = model_name
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.binary_vqa = AutoModelForCausalLM.from_pretrained("toilaluan/Florence-2-base-Yes-No-VQA", trust_remote_code=True).to(self.device, torch.float16)
+        self.binary_vqa_processor = AutoProcessor.from_pretrained("toilaluan/Florence-2-base-Yes-No-VQA", trust_remote_code=True)
+
+
+    def generate_tuples(self, input_text: str) -> tuple[PromptTuple, int]:
+        system_message = """
+        Given an image caption, extract the relevant entities, attributes, and relations present in the caption, and structure them into JSON format according to the following schema:
+        Each tuple contains the following information:
+        - Id: A unique identifier for the tuple.
+        - Type: The category of the tuple. Choose from "entity," "attribute," or "relation."
+        - Type Detail: Provide additional details based on the selected type:
+            - Entity: Specify whether it refers to the whole entity (e.g., "chair") or a part of the entity (e.g., "back of chair").
+            - Attribute: Specify the attribute type, such as "color", "type", "material", "count", "style", "texture", "text rendering", "shape" or "size".
+            - Relation: Specify the relation type, such as "spatial" (e.g., "A next to B") or "action" (e.g., "A kicks B").
+        - Semantics: A list of strings that represent the words or phrases from the caption that correspond to the tuple.
+        Example Input: "A blue motorcycle parked next to a red car."
+        Example output:
+        {
+            "tuples": [
+                {
+                    "type": "entity",
+                    "type_detail": "whole",
+                    "semantics": ["motorcycle"]
+                },
+                {
+                    "type": "attribute",
+                    "type_detail": "color",
+                    "semantics": ["motorcycle", "blue"]
+                },
+                {
+                    "type": "entity",
+                    "type_detail": "whole",
+                    "semantics": ["car"]
+                },
+                {
+                    "type": "attribute",
+                    "type_detail": "color",
+                    "semantics": ["car", "red"]
+                },
+                {
+                    "type": "relation",
+                    "type_detail": "spatial",
+                    "semantics": ["motorcycle", "next to", "car"]
+                }
+            ]
+        }
+        The final JSON should contain a list of tuples, each describing a unique entity, attribute, or relation from the image caption. Each JSON should contain a maximum of 8 tuples.
+        """
+        messages = [
+            {
+                "role": "system",
+                "content": system_message,
+            },
+            {
+                "role": "user",
+                "content": input_text,
+            },
+        ]
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            response_format={"type": "json_object"},
+            max_tokens=512,
+        )
+        output = json.loads(response.choices[0].message.content)
+        return PromptTuple(**output), response.usage.total_tokens
+
+    def generate_dependencies(self, tuples: PromptTuple) -> tuple[dict, int]:
+        DEPENDENCY_PROMPT = """
+        Given the following tuples extracted from an image caption, determine the dependencies between the entities, attributes, and relations, and output them in JSON format.
+        Each tuple contains the following information:
+        - Id: A unique identifier for the tuple.
+        - Type: The category of the tuple. Choose from "entity," "attribute," or "relation."
+        - Type Detail: Provide additional details based on the selected type:
+            - Entity: Specify whether it refers to the whole entity (e.g., "chair") or a part of the entity (e.g., "back of chair").
+            - Attribute: Specify the attribute type, such as "color," "type," "material," "count," "texture," "text rendering," "shape," or "size."
+            - Relation: Specify the relation type, such as "spatial" (e.g., "A next to B") or "action" (e.g., "A kicks B").
+        - Semantics: A list of strings that represent the words or phrases from the caption that correspond to the tuple.
+        Output is a dictionary where the key is the id of the tuple and the value is a list of ids that the tuple depends on.
+        Example input:
+        [
+            {
+                "id": 1,
+                "type": "entity",
+                "type_detail": "whole",
+                "semantics": ["motorcycle"]
+            },
+            {
+                "id": 2,
+                "type": "attribute",
+                "type_detail": "color",
+                "semantics": ["motorcycle", "blue"]
+            },
+            {
+                "id": 3,
+                "type": "entity",
+                "type_detail": "whole",
+                "semantics": ["car"]
+            },
+            {
+                "id": 4,
+                "type": "attribute",
+                "type_detail": "color",
+                "semantics": ["car", "red"]
+            },
+            {
+                "id": 5,
+                "type": "relation",
+                "type_detail": "spatial",
+                "semantics": ["motorcycle", "next to", "car"]
+            }
+        ]
+
+        Example output:
+        {
+            "1": [],
+            "2": [1],
+            "3": [],
+            "4": [3],
+            "5": [1, 3]
+        }
+
+        """
+        input_obj = [{"id": i, **t.dict()} for i, t in enumerate(tuples.tuples, start=1)]  # 1-based ids, matching the example
+
+        messages = [
+            {
+                "role": "system",
+                "content": DEPENDENCY_PROMPT,
+            },
+            {
+                "role": "user",
+                "content": json.dumps(input_obj),
+            },
+        ]
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            response_format={"type": "json_object"},
+        )
+        return (
+            json.loads(response.choices[0].message.content),
+            response.usage.total_tokens,
+        )
+
+    def generate_questions(
+        self, prompt: str, tuples: list[dict], dependencies: dict
+    ) -> tuple[dict, int]:
+        """Generate validation questions based on the tuples and dependencies.
+
+        Args:
+            prompt (str): a prompt describing the image
+            tuples (list[dict]): each tuple is a unit of information extracted from the prompt
+            dependencies (dict): the dependencies between tuples
+        """
+        system_message = """
+        Task: Given a prompt that describes the image and a list of tuples extracted from the prompt, generate a question for each tuple in natural language.
+        Each tuple contains the following information:
+        - Id: A unique identifier for the tuple.
+        - Type: The category of the tuple. Choose from "entity," "attribute," or "relation."
+        - Type Detail: Provide additional details based on the selected type:
+            - Entity: Specify whether it refers to the whole entity (e.g., "chair") or a part of the entity (e.g., "back of chair").
+            - Attribute: Specify the attribute type, such as "color", "type", "material", "count", "style", "texture", "text rendering", "shape" or "size".
+            - Relation: Specify the relation type, such as "spatial" (e.g., "A next to B") or "action" (e.g., "A kicks B").
+        - Semantics: A list of strings that represent the words or phrases from the caption that correspond to the tuple.
+        Output is a list of questions, one question per tuple. The number of questions must be the same as the number of tuples.
+        Example input:
+        Prompt: "A traffic light and a signpost at a crossroads intersection near a waterway"
+        Tuples:
+        [
+            {
+                "id": 1,
+                "type": "entity",
+                "type_detail": "whole",
+                "semantics": ["traffic light"]
+            },
+            {
+                "id": 2,
+                "type": "entity",
+                "type_detail": "whole",
+                "semantics": ["signpost"]
+            },
+            {
+                "id": 3,
+                "type": "relation",
+                "type_detail": "spatial",
+                "semantics": ["traffic light", "at", "crossroads intersection"]
+            },
+            {
+                "id": 4,
+                "type": "relation",
+                "type_detail": "spatial",
+                "semantics": ["crossroads intersection", "near", "waterway"]
+            }
+        ]
+        Dependencies:
+        {
+            "1": [],
+            "2": [],
+            "3": [1, 2],
+            "4": [3]
+        }
+        Example output is a JSON object. Each question asks about the existence of the tuple in the prompt, and the answer should always be yes.
+        {
+            "1": "Is there a traffic light?",
+            "2": "Is there a signpost?",
+            "3": "Is the traffic light at a crossroads intersection?",
+            "4": "Is the crossroads intersection near a waterway?"
+        }
+        """
+
+        user_str = f"""
+        Prompt: {prompt}
+        Tuples: {tuples}
+        Dependencies: {dependencies}
+        """
+        messages = [
+            {
+                "role": "system",
+                "content": system_message,
+            },
+            {
+                "role": "user",
+                "content": user_str,
+            },
+        ]
+
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            response_format={"type": "json_object"},
+        )
+        return (
+            json.loads(response.choices[0].message.content),
+            response.usage.total_tokens,
+        )
+
+    def find_layers(self, dep_dict):
+        layers = []
+        remaining_keys = set(dep_dict.keys())
+
+        while remaining_keys:
+            current_layer = []
+            for key in list(remaining_keys):
+                # If all dependencies of the key are in previous layers
+                if all(
+                    str(dep) in [k for layer in layers for k in layer]
+                    for dep in dep_dict[key]
+                ):
+                    current_layer.append(key)
+
+            # If no new layer is formed, break to avoid an infinite loop
+            if not current_layer:
+                break
+
+            # Add the current layer to the list of layers
+            layers.append(current_layer)
+            # Remove the keys that are now layered
+            remaining_keys -= set(current_layer)
+
+            # Cap the graph at three layers; deeper questions are dropped
+            if len(layers) == 3:
+                break
+
+        ordered_indexes = [item for sublist in layers for item in sublist]
+        return ordered_indexes
+
+    def _create_graph_questions(self, questions: dict, dependencies: dict) -> tuple[list, list]:
+        # Order question ids so that every question comes after its dependencies
+        layered_indexes = self.find_layers(dependencies)
+        print(layered_indexes)
+        sorted_questions = [questions[i] for i in layered_indexes]
+
+        return layered_indexes, sorted_questions
+
+    @spaces.GPU(duration=120)
+    def get_reward(
+        self,
+        prompt: str,
+        questions: dict,
+        dependencies: dict,
+        images: list[Image.Image],
+        mode="hybrid",
+    ):
+        """Get rewards for the generated questions using the structured question graph.
+
+        Args:
+            prompt (str): a prompt describing the image
+            questions (dict): maps each question id to its question text
+            dependencies (dict): maps each question id to the ids it depends on
+            images (list[Image.Image]): the images to score
+        """
+        # Order ids so that every question is scored after its dependencies
+        ordered_ids, sorted_questions = self._create_graph_questions(questions, dependencies)
+        print(sorted_questions)
+
+        # scores[j] maps question id -> probability of a "yes" answer for image j
+        scores = {}
+        for j in range(len(images)):
+            scores[j] = {i: 0.0 for i in ordered_ids}
+
+        def get_reward_for_a_question(
+            question: str,
+            question_dependencies: list[int],
+            image: Image.Image,
+            prev_scores: dict,
+        ) -> float:
+            # Skip the question if any question it depends on was answered as No
+            if any(prev_scores.get(str(dep), 0.0) <= 0.5 for dep in question_dependencies):
+                print(
+                    f"Skipping question: {question}. A dependency in {question_dependencies} was answered as No."
+                )
+                return 0.0
+            if not isinstance(image, Image.Image):
+                raise ValueError("Invalid image type")
+
+            inputs = self.binary_vqa_processor(text=question, images=image, return_tensors="pt").to(self.device, torch.float16)
+            decoder_input_ids = torch.LongTensor([[self.binary_vqa.language_model.config.pad_token_id, self.binary_vqa.language_model.config.decoder_start_token_id]]).to(self.device)
+            outputs = self.binary_vqa(
+                input_ids=inputs["input_ids"],
+                pixel_values=inputs["pixel_values"],
+                decoder_input_ids=decoder_input_ids,
+            )
+            # The fine-tuned binary head gives the probability of a "Yes" answer
+            logits = outputs.logits[:, -1]
+            score = logits[0].sigmoid().item()
+            print(f"The answer Yes has probability {score}")
+            return score
+
+        pbar = tqdm(
+            total=len(sorted_questions) * len(images),
+            desc=f"Calculating reward over {len(images)} images and {len(sorted_questions)} questions",
+        )
+        for qid, question in zip(ordered_ids, sorted_questions):
+            for j, image in enumerate(images):
+                scores[j][qid] = get_reward_for_a_question(
+                    question, dependencies[qid], image, scores[j]
+                )
+                pbar.update(1)
+
+        return scores
+
+
+if __name__ == "__main__":
+    processor = DSGPromptProcessor(model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
+    url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
+    image = Image.open(requests.get(url, stream=True).raw)
+    input_text = "ghibli style image of a cat"
+    tuples, tokens = processor.generate_tuples(input_text)
+    print(tuples)
+    dependencies, tokens = processor.generate_dependencies(tuples)
+    print(dependencies)
+    questions, tokens = processor.generate_questions(
+        input_text, tuples.tuples, dependencies
+    )
+    print(questions)
+
+    reward = processor.get_reward(input_text, questions, dependencies, [image])
+    print(reward)
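Note: find_layers implements a Kahn-style topological layering over the question graph (with a three-layer cap). A minimal standalone sketch of the same pass, run on the example dependency dict from the generate_dependencies prompt:

    def layer_order(dep_dict):
        # Repeatedly peel off the keys whose dependencies are all already layered
        layers, remaining = [], set(dep_dict)
        while remaining:
            placed = {k for layer in layers for k in layer}
            current = [k for k in remaining if all(str(d) in placed for d in dep_dict[k])]
            if not current:  # a cycle or unresolved dependency: stop rather than loop forever
                break
            layers.append(current)
            remaining -= set(current)
        return [k for layer in layers for k in layer]

    deps = {"1": [], "2": [1], "3": [], "4": [3], "5": [1, 3]}
    print(layer_order(deps))  # e.g. ['1', '3', '2', '4', '5']; order within a layer may vary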