Daemontatox committed on
Commit 4fa7ac8 · verified · 1 Parent(s): 2ea23a7

Update app.py

Files changed (1): app.py +189 -122
app.py CHANGED
@@ -8,12 +8,12 @@ from PIL import Image
 import gradio as gr
 from openai import OpenAI  # Use the OpenAI client that supports multimodal messages
 
-# Load API key from environment variable
+# Load API key from environment variable (secrets)
 HF_API_KEY = os.getenv("OPENAI_TOKEN")
 if not HF_API_KEY:
-    raise ValueError("OPENAI_TOKEN environment variable not set")
+    raise ValueError("HF_API_KEY environment variable not set")
 
-# Create the client pointing to the inference endpoint (e.g., OpenRouter)
+# Create the client pointing to the Hugging Face Inference endpoint
 client = OpenAI(
     base_url="https://openrouter.ai/api/v1",
     api_key=HF_API_KEY
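The client above simply points the standard OpenAI SDK at OpenRouter's OpenAI-compatible endpoint. For reference, a minimal sketch of exercising a client configured this way (the model id is the one used elsewhere in this file; the prompt string is a made-up placeholder):

```python
import os

from openai import OpenAI

# Assumes OPENAI_TOKEN holds an OpenRouter API key, as in the code above.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENAI_TOKEN"],
)

# Hypothetical smoke test: a single plain-text completion.
resp = client.chat.completions.create(
    model="qwen/qwen-vl-plus:free",
    messages=[{"role": "user", "content": "Reply with the word 'ok'."}],
    max_tokens=10,
)
print(resp.choices[0].message.content)
```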
@@ -50,13 +50,15 @@ def process_pdf_file(file_path):
         page = doc[page_num]
         page_text = page.get_text("text")
         if page_text.strip():
-            text += f"Page {page_num+1}:\n{page_text}\n\n"
+            text += f"Page {page_num + 1}:\n{page_text}\n\n"
+
         # Render page as an image with a zoom factor
         zoom = 3
         mat = fitz.Matrix(zoom, zoom)
         pix = page.get_pixmap(matrix=mat, alpha=False)
         img_data = pix.tobytes("png")
         img = Image.open(io.BytesIO(img_data)).convert("RGB")
+
         # Resize if image is too large
         max_size = 1600
         if max(img.size) > max_size:
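The rendering loop touched above follows the standard PyMuPDF pattern: rasterize each page with a zoom matrix, round-trip the PNG bytes through PIL, then downscale. A self-contained sketch of the same idea, assuming `pymupdf` and `Pillow` are installed and `sample.pdf` is a placeholder path:

```python
import io

import fitz  # PyMuPDF
from PIL import Image

doc = fitz.open("sample.pdf")  # placeholder path
pages = []
for page_num in range(len(doc)):
    page = doc[page_num]
    # A 3x zoom matrix renders at roughly 216 DPI, matching the zoom used in app.py.
    pix = page.get_pixmap(matrix=fitz.Matrix(3, 3), alpha=False)
    img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
    # Cap the longest side at 1600 px, as the app does before sending images to the model.
    img.thumbnail((1600, 1600), Image.Resampling.LANCZOS)
    pages.append(img)
```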
@@ -82,7 +84,7 @@ def process_uploaded_file(file):
         if file is None:
             return "No file uploaded. Please upload a file."
 
-        # Gradio may pass a dict or a file-like object
+        # Get the file path from the Gradio upload (may be a dict or file-like object)
         if isinstance(file, dict):
             file_path = file["name"]
         else:
@@ -94,7 +96,7 @@ def process_uploaded_file(file):
             doc_state.doc_type = 'pdf'
             try:
                 doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
-                return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now chat with the bot."
+                return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
             except Exception as e:
                 return f"Error processing PDF: {str(e)}. Please try a different PDF file."
         elif file_ext in image_extensions:
@@ -107,7 +109,7 @@ def process_uploaded_file(file):
                     new_size = tuple(int(dim * ratio) for dim in img.size)
                     img = img.resize(new_size, Image.Resampling.LANCZOS)
                 doc_state.current_doc_images = [img]
-                return "Image loaded successfully. You can now chat with the bot."
+                return "Image loaded successfully. You can now ask questions about the content."
             except Exception as e:
                 return f"Error processing image: {str(e)}. Please try a different image file."
         else:
@@ -116,109 +118,196 @@ def process_uploaded_file(file):
         logger.error(f"Error in process_uploaded_file: {str(e)}")
         return "An error occurred while processing the file. Please try again."
 
-def clear_context():
-    """Clear the current document context and chat history."""
-    doc_state.clear()
-    return "Document context cleared. You can upload a new document.", []
-
 # -------------------------------
-# Predetermined Prompts
+# Bot Streaming Function Using the Multimodal API
 # -------------------------------
-predetermined_prompts = {
-    "NOC Timesheet": (
-        "Extract structured information from the provided timesheet. The extracted details should include:\n"
-        "Name, Position Title, Work Location, Contractor, NOC ID, Month and Year, Regular Service Days, "
-        "Standby Days, Offshore Days, Extended Hitch Days, and approvals. Format the output as valid JSON."
-    ),
-    "Aramco Full structured": (
-        "You are a document parsing assistant designed to extract structured data from various documents such as "
-        "invoices, timesheets, purchase orders, and travel bookings. Return only valid JSON with no extra text."
-    ),
-    "Aramco Timesheet only": (
-        "Extract time tracking, work details, and approvals. Return a JSON object following the specified structure."
-    ),
-    "NOC Invoice": (
-        "You are a highly accurate data extraction system. Analyze the provided invoice image and extract all data "
-        "into the following JSON format:\n"
-        "{\n 'invoiceDetails': { ... },\n 'from': { ... },\n 'to': { ... },\n 'services': [ ... ],\n "
-        "'totals': { ... },\n 'bankDetails': { ... }\n}"
-    ),
-    "Software Tester": (
-        "Act as a software tester. Analyze the uploaded image of a software interface and generate comprehensive "
-        "test cases for its features. For each feature, provide test steps, expected results, and any necessary "
-        "preconditions. Be as detailed as possible."
-    )
-}
-
-# -------------------------------
-# Chat Function (Non-streaming Version)
-# -------------------------------
-def chat_respond(user_message, history, prompt_option):
+def bot_streaming(prompt_option, max_new_tokens=500):
     """
-    Append the user message to the conversation history, call the API,
-    and return the full response.
-
-    Each message passed to the API is now a dictionary with a string value for 'content'.
-    If an image was uploaded, its data URI is appended to the first user message.
-    The conversation history is a list of [user_text, assistant_text] pairs.
+    Build a multimodal message payload and call the inference API.
+    The payload includes:
+      - A text segment (the selected prompt and any document context).
+      - If available, an image as a data URI (using a base64-encoded PNG).
     """
-    # On the first message, if none is provided, use the predetermined prompt.
-    if history == []:
-        if not user_message.strip():
-            user_message = predetermined_prompts.get(prompt_option, "Hello")
-        else:
-            user_message = predetermined_prompts.get(prompt_option, "") + "\n" + user_message
+    try:
+        # Predetermined prompts (you can adjust these as needed)
+        prompts = {
+            "Software Tester": (
+                """
+You are TestCraft AI, a specialized large language model designed to be the ultimate software testing expert. Your primary function is to generate comprehensive, effective, and insightful test cases based on provided input, primarily in the form of images (screenshots, UI mockups, diagrams) and PDF documents (requirements specifications, user stories, design documents). You are not a general-purpose chatbot; your focus is exclusively on software testing.
+
+**Your Capabilities:**
+
+* **Input Interpretation:** You can accurately interpret the content of images and PDFs. This includes:
+    * **OCR (Optical Character Recognition):** Extract text from images and PDFs.
+    * **Object Detection:** Identify UI elements (buttons, text fields, dropdowns, checkboxes, images, tables, etc.) in images.
+    * **Layout Analysis:** Understand the structure and relationships between elements in images and documents (e.g., hierarchical relationships, proximity, alignment).
+    * **Document Structure Understanding:** Identify sections, headings, paragraphs, lists, tables, and figures within PDFs.
+    * **Requirement Extraction:** Identify explicit and implicit requirements, user stories, and acceptance criteria from textual content.
+    * **Diagram Interpretation:** If the image or PDF contains diagrams (flowcharts, state diagrams, etc.), understand their logic and transitions.
+
+* **Test Case Generation:** You can generate a wide variety of test cases, including but not limited to:
+    * **Functional Tests:** Verify that features work as expected based on the requirements and UI.
+    * **UI/UX Tests:** Assess the usability, accessibility, and visual correctness of the user interface.
+    * **Boundary Value Tests:** Test input fields with values at the minimum, maximum, and just inside/outside the valid range.
+    * **Equivalence Partitioning Tests:** Group similar inputs and test one representative value from each group.
+    * **Error Handling Tests:** Verify how the application handles invalid input, unexpected conditions, and errors.
+    * **Accessibility Tests:** Check compliance with accessibility guidelines (e.g., WCAG) regarding text alternatives, keyboard navigation, color contrast, etc.
+    * **Performance Tests (Basic):** Generate basic performance-related test ideas (e.g., "Verify response time for button click is less than 2 seconds"). *Note: You cannot execute performance tests, only suggest them.*
+    * **Security Tests (Basic):** Generate basic security-related test ideas (e.g., "Verify input fields are sanitized against XSS attacks"). *Note: You cannot execute security tests, only suggest them.*
+    * **Compatibility Tests (Basic):** Generate basic compatibility testing ideas, if information about target platforms is available (e.g., browsers, OS).
+
+* **Test Case Format:** Output test cases in a clear, structured, and consistent format. Each test case MUST include:
+    * **Test Case ID:** A unique identifier (e.g., TC-001, TC-002).
+    * **Test Case Title:** A brief, descriptive name for the test case.
+    * **Test Steps:** A numbered sequence of actions to perform. Be precise and unambiguous. Use user-centric language (e.g., "Click the 'Submit' button," not "Interact with element ID XYZ").
+    * **Expected Result:** The anticipated outcome of each step and the overall test case. Be specific.
+    * **Test Data (if applicable):** Specific input values or data to be used.
+    * **Priority (Optional):** High, Medium, or Low, based on your assessment of the criticality of the feature being tested.
+    * **Type (Optional):** Functional, UI, Accessibility, Performance, etc.
+    * **Requirement/User Story Reference (if applicable):** Link the test case back to a specific requirement or user story extracted from the input.
+
+* **Prioritization and Rationale:** You should be able to prioritize test cases based on risk, importance, and likelihood of finding defects. Explain *why* you assigned a particular priority. If you make any assumptions, state them clearly.
+
+* **Contextual Understanding:** You strive to understand the *purpose* of the software being tested. If the input provides clues about the application's domain (e.g., e-commerce, banking, healthcare), tailor your test cases accordingly.
+
+* **Continuous Learning (Hypothetical):** *While you cannot truly learn in the traditional sense, state that you are designed to improve your test case generation over time based on feedback and new information.* This sets the expectation of ongoing refinement.
+
+**Instructions for Interaction:**
+
+1. **Provide Input:** The user will provide one or more images (PNG, JPG, etc.) or PDF documents.
+2. **Specify Test Scope (Optional):** The user may optionally specify the scope of testing (e.g., "Focus on the login functionality," "Generate UI tests only," "Test accessibility"). If no scope is provided, generate a comprehensive set of test cases.
+3. **Generate Test Cases:** You will generate test cases based on the input and any specified scope.
+4. **Provide Explanations:** Explain your reasoning behind the generated test cases, including any assumptions made, prioritization logic, and references to the input.
+5. **Handle Ambiguity:** If the input is ambiguous or incomplete, you will:
+    * **Make Reasonable Assumptions:** State your assumptions clearly.
+    * **Ask Clarifying Questions:** Present the user with specific, concise questions to resolve ambiguities. *Format these as a separate section labeled "Clarifying Questions."* Do *not* proceed with test case generation until the questions are answered.
+6. **Error Handling:** If you encounter an error (e.g., unable to process an image), provide a clear and informative error message.
+
+**Example Output (Illustrative):**
+
+**(Assuming input is a screenshot of a login form)**
+
+**Test Cases:**
+
+| Test Case ID | Test Case Title | Test Steps | Expected Result | Test Data | Priority | Type | Requirement Reference |
+|--------------|--------------------------|-----------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------|----------------------|----------|-------------|-----------------------|
+| TC-001 | Valid Login | 1. Enter valid username. 2. Enter valid password. 3. Click the 'Login' button. | User is successfully logged in and redirected to the dashboard. | Username: testuser | High | Functional | Login-001 |
+| | | | | Password: password123 | | | |
+| TC-002 | Invalid Username | 1. Enter invalid username. 2. Enter valid password. 3. Click the 'Login' button. | Error message displayed: "Invalid username or password." User remains on the login page. | Username: invaliduser | High | Functional | Login-001 |
+| | | | | Password: password123 | | | |
+| TC-003 | Empty Username Field | 1. Leave the username field blank. 2. Enter valid password. 3. Click 'Login'. | Error message displayed: "Username is required." User remains on the login page. | Password: password123 | High | Functional | Login-001 |
+| TC-004 | Password Field Masking | 1. Enter characters into the password field. | Characters are masked (e.g., displayed as dots or asterisks). | Any characters | Medium | UI | Login-002 |
+| TC-005 | Forgot Password Link | 1. Click the "Forgot Password" link. | User is redirected to the "Forgot Password" page. | N/A | Medium | Functional | Login-003 |
+| TC-006 | Check color contrast | 1. Inspect the text and background colors. | Text meets WCAG AA standard for color contrast. | N/A | High | Accessibility | Login-004 |
+
+**Assumptions:**
+
+* The dashboard is the expected landing page after successful login.
+* The "Forgot Password" link exists (it might be present in the provided image).
+* The system is using the most current WCAG standards.
+
+**Rationale:**
+
+* TC-001 and TC-002 are high priority because they test the core login functionality.
+* TC-003 checks for required field validation.
+* TC-004 is a UI test to ensure password security.
+* TC-006 ensures that the text is readable by users.
 
-    history = history + [[user_message, ""]]
+**Clarifying Questions:**
 
-    messages = []
-    # Build the messages list with each message as a dictionary containing role and a string content.
-    for i, (user_msg, assistant_msg) in enumerate(history):
-        # For the very first user message, attach the image (if available) by appending its data URI.
-        if i == 0 and doc_state.current_doc_images:
+* None at this time.
+
+---
+
+**Key Design Choices and Explanations:**
+
+* **TestCraft AI Persona:** Giving the model a specific name and role helps to reinforce its purpose and limit its responses to the testing domain.
+* **Comprehensive Capabilities:** The prompt explicitly lists the required skills (OCR, object detection, etc.) to ensure the model is capable of handling the input.
+* **Structured Output:** The required test case format is clearly defined, promoting consistency and readability.
+* **Prioritization and Rationale:** The model is explicitly instructed to prioritize and explain its reasoning, making the output more useful and insightful.
+* **Contextual Understanding:** The model is encouraged to understand the *purpose* of the software, leading to more relevant test cases.
+* **Ambiguity Handling:** The model is instructed to handle incomplete or ambiguous input gracefully by making assumptions and asking clarifying questions.
+* **Optional Fields:** Priority and type fields are added in the test case structure.
+* **Basic Testing Types:** Includes basic Performance and Security Testing.
+
+**Potential Limitations and Mitigation Strategies:**
+
+* **Limited "Real-World" Interaction:** The model cannot interact with a live application. It can only generate test cases based on static input. *Mitigation:* Clearly state this limitation.
+* **Performance and Security Testing:** The model's capabilities in these areas are limited to generating basic test *ideas*. It cannot execute these tests. *Mitigation:* Explicitly state this limitation.
+* **OCR and Object Detection Accuracy:** The accuracy of OCR and object detection may vary depending on the quality of the input images. *Mitigation:* Provide clear error messages if processing fails. Encourage users to provide high-quality images.
+* **Complex Logic:** Interpreting complex business logic from images and PDFs may be challenging. *Mitigation:* The model should ask clarifying questions when necessary. Focus on clear and well-structured input documents.
+* **"Hallucination":** Like all LLMs, there's a risk of the model generating incorrect or nonsensical information. *Mitigation:* Thorough testing and validation of the model's output are crucial. Encourage user feedback to identify and correct errors.
+
+This comprehensive system prompt provides a strong foundation for building a powerful and effective software testing model. Remember to thoroughly test and refine the model's output based on real-world usage and feedback.
+
+                """
+            )
+        }
+
+        # Select the appropriate prompt
+        selected_prompt = prompts.get(prompt_option, "Invalid prompt selected.")
+        context = ""
+        if doc_state.current_doc_images and doc_state.current_doc_text:
+            context = "\nDocument context:\n" + doc_state.current_doc_text
+        full_prompt = selected_prompt + context
+
+        # Build the message payload in the expected format.
+        # The content field is a list of objects—one for text, and (if an image is available) one for the image.
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": full_prompt
+                    }
+                ]
+            }
+        ]
+
+        # If an image is available, encode it as a data URI and append it as an image_url message.
+        if doc_state.current_doc_images:
             buffered = io.BytesIO()
             doc_state.current_doc_images[0].save(buffered, format="PNG")
             img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+            # Create a data URI (many APIs accept this format in place of a public URL)
             data_uri = f"data:image/png;base64,{img_b64}"
-            text_to_send = user_msg + "\n[Attached Image: " + data_uri + "]"
-        else:
-            text_to_send = user_msg
-        messages.append({"role": "user", "content": text_to_send})
-        if assistant_msg:
-            messages.append({"role": "assistant", "content": assistant_msg})
-
-    try:
-        # Call the API without streaming. The messages are now standard dictionaries.
-        response = client.chat.completions.create(
-            model="qwen/qwen-vl-plus:free",
+            messages[0]["content"].append({
+                "type": "image_url",
+                "image_url": {"url": data_uri}
+            })
+
+        # Call the inference API with streaming enabled.
+        stream = client.chat.completions.create(
+            model="google/gemini-2.0-pro-exp-02-05:free",
             messages=messages,
-            max_tokens=500
+            max_tokens=max_new_tokens,
+            stream=True
         )
-    except Exception as e:
-        logger.error(f"Error calling the API: {str(e)}")
-        history[-1][1] = "An error occurred while processing your request. Please check your API credentials."
-        return history, history
+
+        buffer = ""
+        for chunk in stream:
+            # Each chunk carries a delta; guard against None content (e.g., on the final chunk).
+            delta = chunk.choices[0].delta.content or ""
+            buffer += delta
+            time.sleep(0.01)
+            yield buffer
 
-    # Assuming the API returns a standard completion response, extract the assistant's reply.
-    try:
-        full_response = response.choices[0].message["content"]
     except Exception as e:
-        logger.error(f"Error extracting API response: {str(e)}")
-        full_response = "An error occurred while processing the API response."
+        logger.error(f"Error in bot_streaming: {str(e)}")
+        yield "An error occurred while processing your request. Please try again."
 
-    history[-1][1] = full_response
-    return history, history
+def clear_context():
+    """Clear the current document context."""
+    doc_state.clear()
+    return "Document context cleared. You can upload a new document."
 
 # -------------------------------
 # Create the Gradio Interface
 # -------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# Document Analyzer & Software Testing Chatbot")
-    gr.Markdown(
-        "Upload a PDF or an image (PNG, JPG, JPEG, GIF, BMP, WEBP). Then choose a prompt from the dropdown. "
-        "For example, select **Software Tester** to have the bot analyze an image of a software interface "
-        "and generate test cases. You can also chat with the model—the conversation history is preserved."
-    )
+    gr.Markdown("# Document Analyzer with Predetermined Prompts")
+    gr.Markdown("Upload a PDF or image (PNG, JPG, JPEG, GIF, BMP, WEBP) and select a prompt to analyze its contents.")
 
     with gr.Row():
         file_upload = gr.File(
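The core of the new `bot_streaming` above is an OpenAI-style multimodal payload (text part plus a base64 data-URI image part) consumed as a stream. A stripped-down sketch of that same pattern, assuming the `client` defined earlier and a hypothetical `pil_image`; note that `delta.content` can be `None` on some chunks, hence the guard:

```python
import base64
import io

def stream_reply(client, prompt_text, pil_image=None,
                 model="google/gemini-2.0-pro-exp-02-05:free"):
    """Yield the growing reply text, mirroring the pattern used in bot_streaming."""
    content = [{"type": "text", "text": prompt_text}]
    if pil_image is not None:
        buf = io.BytesIO()
        pil_image.save(buf, format="PNG")
        b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
        # Inline data URI instead of a public image URL.
        content.append({"type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{b64}"}})

    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": content}],
        max_tokens=500,
        stream=True,
    )
    buffer = ""
    for chunk in stream:
        buffer += chunk.choices[0].delta.content or ""  # delta may be None
        yield buffer
```

Iterating `stream_reply(...)` yields progressively longer strings, which is exactly the shape a streaming Gradio callback expects.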
@@ -230,38 +319,16 @@ with gr.Blocks() as demo:
     with gr.Row():
         prompt_dropdown = gr.Dropdown(
             label="Select Prompt",
-            choices=[
-                "NOC Timesheet",
-                "Aramco Full structured",
-                "Aramco Timesheet only",
-                "NOC Invoice",
-                "Software Tester"
-            ],
-            value="Software Tester"
+            choices=["Software Tester"],
+            value="Software Tester"
         )
-        clear_btn = gr.Button("Clear Document Context & Chat History")
-
-        # Set type='messages' to avoid deprecation warnings.
-        chatbot = gr.Chatbot(label="Chat History", type="messages", elem_id="chatbot")
+        generate_btn = gr.Button("Generate")
 
-    with gr.Row():
-        user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...", show_label=False)
-        send_btn = gr.Button("Send")
-
-    # State to hold the conversation history
-    chat_state = gr.State([])
-
-    # When a file is uploaded, process it.
-    file_upload.change(fn=process_uploaded_file, inputs=file_upload, outputs=upload_status)
+    clear_btn = gr.Button("Clear Document Context")
+    output_text = gr.Textbox(label="Output", interactive=False)
 
-    # Clear document context and chat history.
-    clear_btn.click(fn=clear_context, outputs=[upload_status, chat_state])
-
-    # When the user clicks Send, process the message and update the chat.
-    send_btn.click(
-        fn=chat_respond,
-        inputs=[user_input, chat_state, prompt_dropdown],
-        outputs=[chatbot, chat_state]
-    )
-
-demo.launch(debug=True)
+    file_upload.change(fn=process_uploaded_file, inputs=[file_upload], outputs=[upload_status])
+    generate_btn.click(fn=bot_streaming, inputs=[prompt_dropdown], outputs=[output_text])
+    clear_btn.click(fn=clear_context, outputs=[upload_status])
+
+demo.launch(debug=True)
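On the Gradio side, wiring a generator function to a button is what makes the output stream: Gradio re-renders the output component on every `yield`. A minimal sketch of that wiring, with a hypothetical `fake_stream` standing in for `bot_streaming`:

```python
import time

import gradio as gr

def fake_stream(prompt_option):
    # Stand-in for bot_streaming: yields a growing string.
    text = f"Running prompt: {prompt_option}"
    for i in range(1, len(text) + 1):
        time.sleep(0.02)
        yield text[:i]

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(label="Select Prompt",
                           choices=["Software Tester"], value="Software Tester")
    btn = gr.Button("Generate")
    out = gr.Textbox(label="Output", interactive=False)
    # Because fake_stream is a generator, the textbox updates on each yield.
    btn.click(fn=fake_stream, inputs=[dropdown], outputs=[out])

demo.launch()
```

Note that the dropdown's default must be one of its `choices`; the commit's `value="NOC Timesheet"` against `choices=["Software Tester"]` would trigger a Gradio warning, which is why the hunk above normalizes it.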
 