Spaces:

lisabdunlap
/

Whatever-this-is

Sleeping

App Files Files Community

Lisa Dunlap committed on Aug 9

Commit

43f99d0

1 Parent(s): b369b7e

added text localization

Browse files

Files changed (3) hide show

lmmvibes/examples_helpers.py +214 -0
lmmvibes/vis_gradio/conversation_display.py +1 -1
lmmvibes/vis_gradio/utils.py +7 -0

lmmvibes/examples_helpers.py ADDED Viewed

	@@ -0,0 +1,214 @@

+"""Evidence localization & highlighting helpers.
+Public API (see ``__all__``):
+    localize_evidence(text: str, evidence: str, *, n: int = 3,
+                      overlap_threshold: float = 0.5) -> List[Tuple[int, int]]
+    highlight_html_with_evidence(html_text: str, evidence: str, *, n: int = 3,
+                                 overlap_threshold: float = 0.5) -> str
+The highlighter will:
+1. Extract quoted segments from *evidence* – if any exist they are searched for
+   verbatim in *html_text* (case–insensitive).  When the evidence has no
+   quotes, the whole (stripped) evidence string is the search target.
+2. When no exact match is found, fall back to an n-gram overlap heuristic
+   (default n = 3) on the first target.  The best window in *html_text* that
+   contains ≥ *overlap_threshold* of the target's n-grams is considered a
+   match.
+3. All matched character spans are wrapped in `<mark>` tags.
+The helper is HTML-agnostic – it simply operates on the raw string.  That means
+it may occasionally highlight inside an HTML attribute if the evidence happens
+to occur there, but for our Gradio viewer the relevant text lives in normal
+content nodes so this trade-off keeps the implementation lightweight.
+No try/excepts are used in accordance with user guidelines; we prefer clear
+errors.
+"""
+# The docstring must be the first statement to become the module ``__doc__``;
+# ``from __future__`` imports are allowed to follow it.
+from __future__ import annotations
+from typing import List, Tuple
+import re
+import html
+__all__ = [
+    "localize_evidence",
+    "highlight_html_with_evidence",
+]
+# ---------------------------------------------------------------------------
+# Internal helpers -----------------------------------------------------------
+# ---------------------------------------------------------------------------
+def _extract_targets(evidence: str) -> List[str]:
+    """Return the substrings of *evidence* that should be searched for.
+    1. If there are double-quoted regions – e.g. `"foo"` – each non-blank
+       quoted region is returned separately **without** the quotes.  Backslash
+       escapes inside a quoted region are kept exactly as written.
+    2. Otherwise the full evidence string (stripped of surrounding whitespace)
+       is returned as the single target.
+    Empty or whitespace-only evidence yields an empty list.
+    """
+    stripped = evidence.strip() if evidence else ""
+    if not stripped:
+        return []
+    # Pull out "quoted" substrings (the pattern tolerates \" inside the quotes)
+    quoted = re.findall(r'"([^"\\]*(?:\\.[^"\\]*)*)"', evidence)
+    # A blank quoted region ("" or "  ") carries no evidence; an empty search
+    # string would match at every position of the text, so drop those.
+    targets = [segment for segment in quoted if segment.strip()]
+    return targets if targets else [stripped]
+def _tokenize(text: str) -> List[str]:
+    """Lower-case *text* and return its runs of word characters, in order."""
+    lowered = text.lower()
+    return [hit.group() for hit in re.finditer(r"\b\w+\b", lowered)]
+def _ngrams(tokens: List[str], n: int) -> List[str]:
+    """Return every run of *n* consecutive tokens, each joined by one space."""
+    window_count = len(tokens) - n + 1
+    grams: List[str] = []
+    for begin in range(window_count):
+        grams.append(" ".join(tokens[begin : begin + n]))
+    return grams
+# ---------------------------------------------------------------------------
+# Public localisation logic --------------------------------------------------
+# ---------------------------------------------------------------------------
+def localize_evidence(
+    text: str,
+    evidence: str,
+    *,
+    n: int = 3,
+    overlap_threshold: float = 0.5,
+) -> List[Tuple[int, int]]:
+    """Return sorted, merged (start, end) character spans where *evidence* occurs in *text*.
+    Search targets are the quoted substrings of *evidence* if it has any,
+    otherwise the whole stripped evidence string.  Every case-insensitive
+    exact occurrence of every target is returned.
+    When no target matches exactly, the first target is located approximately:
+    a window of as many tokens as the target slides over *text*, and each
+    window is scored by the fraction of the target's n-grams it contains.  The
+    first window with the highest score ≥ *overlap_threshold* is returned.
+    Only a single window is selected in that fuzzy path to keep things
+    deterministic.
+    Args:
+        text: The string to search; returned offsets index into this string.
+        evidence: Evidence description; empty, "N/A", "None" and
+            whitespace-only values yield no spans.
+        n: n-gram size for the fuzzy path (capped at the target's token count).
+        overlap_threshold: Minimum n-gram share for a fuzzy match.
+    Returns:
+        A list of non-overlapping (start, end) spans in ascending order.
+    Raises:
+        ValueError: If *n* is smaller than 1.
+    """
+    if not evidence or evidence in {"N/A", "None"}:
+        return []
+    if n < 1:
+        raise ValueError(f"n must be >= 1, got {n}")
+    # Whitespace-only evidence produces no targets, and an empty target would
+    # match at every position of *text*, so both are filtered out here.
+    targets = [target for target in _extract_targets(evidence) if target]
+    if not targets:
+        return []
+    # ------------------------------------------------------------------
+    # 1. Exact search for each target (quoted or the raw evidence string)
+    # ------------------------------------------------------------------
+    matches: List[Tuple[int, int]] = []
+    for target in targets:
+        # Search the original text with IGNORECASE rather than a lowered copy:
+        # str.lower() can change the length of some strings, which would make
+        # the offsets point at the wrong characters of *text*.
+        pattern = re.compile(re.escape(target), re.IGNORECASE)
+        matches.extend(m.span() for m in pattern.finditer(text))
+    if matches:
+        return _merge_overlaps(matches)
+    # ---------------------------------------------------------------
+    # 2. Fuzzy n-gram overlap on the first target when no exact
+    #    substring match was detected above.
+    # ---------------------------------------------------------------
+    evid_tokens = _tokenize(targets[0])
+    if not evid_tokens:
+        return []
+    n = min(n, len(evid_tokens))
+    target_ngrams = set(_ngrams(evid_tokens, n))
+    if not target_ngrams:
+        return []
+    # Tokenise *text* and keep the original start/end offsets of each token
+    token_spans: List[Tuple[str, int, int]] = [
+        (m.group().lower(), m.start(), m.end())
+        for m in re.finditer(r"\b\w+\b", text)
+    ]
+    if not token_spans:
+        return []
+    tokens_only = [tok for tok, _, _ in token_spans]
+    window_size = len(evid_tokens)
+    best_overlap = 0.0
+    best_span: Tuple[int, int] | None = None
+    for i in range(len(tokens_only) - window_size + 1):
+        window_ngrams = set(_ngrams(tokens_only[i : i + window_size], n))
+        overlap = len(window_ngrams & target_ngrams) / len(target_ngrams)
+        # Strict ">" keeps the earliest window among equally good ones
+        if overlap >= overlap_threshold and overlap > best_overlap:
+            best_overlap = overlap
+            best_span = (token_spans[i][1], token_spans[i + window_size - 1][2])
+    return [best_span] if best_span is not None else []
+# ---------------------------------------------------------------------------
+# Highlighting ---------------------------------------------------------------
+# ---------------------------------------------------------------------------
+def _merge_overlaps(spans: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
+    """Collapse spans that overlap or touch into single spans, ordered by start."""
+    if not spans:
+        return []
+    ordered = sorted(spans, key=lambda pair: pair[0])
+    result = [ordered[0]]
+    for begin, finish in ordered[1:]:
+        prev_begin, prev_finish = result[-1]
+        if begin > prev_finish:
+            # Gap before this span – it starts a new merged region
+            result.append((begin, finish))
+            continue
+        # Overlapping or touching – extend the current region if needed
+        result[-1] = (prev_begin, max(prev_finish, finish))
+    return result
+def _insert_tags(text: str, spans: List[Tuple[int, int]], tag: str = "mark") -> str:
+    """Wrap every span of *text* in an opening and closing *tag* element.
+    The caller must supply *spans* sorted ascending and free of overlaps.
+    """
+    if not spans:
+        return text
+    opener = f"<{tag}>"
+    closer = f"</{tag}>"
+    pieces: List[str] = []
+    cursor = 0
+    for begin, finish in spans:
+        # Untouched text before the span, then the wrapped span itself
+        pieces.extend((text[cursor:begin], opener + text[begin:finish] + closer))
+        cursor = finish
+    pieces.append(text[cursor:])
+    return "".join(pieces)
+def highlight_html_with_evidence(
+    html_text: str,
+    evidence: str,
+    *,
+    n: int = 3,
+    overlap_threshold: float = 0.5,
+) -> str:
+    """Return *html_text* with occurrences of *evidence* wrapped in `<mark>` tags.
+    The raw HTML string is searched directly.  If the evidence as given cannot
+    be located, a second attempt is made with the evidence HTML-escaped, so
+    that evidence containing `&`, `<` or `>` can still be found in markup whose
+    text content was escaped (e.g. via ``html.escape(..., quote=False)``).
+    Args:
+        html_text: Markup to highlight.
+        evidence: Evidence description; empty, "N/A" and "None" leave
+            *html_text* unchanged.
+        n: n-gram size forwarded to ``localize_evidence``.
+        overlap_threshold: Fuzzy-match threshold forwarded to
+            ``localize_evidence``.
+    Returns:
+        *html_text* unchanged when nothing matched; otherwise the highlighted
+        markup, prefixed with a small `<style>` block unless *html_text*
+        already contains the "evidence-highlight-style" id.
+    """
+    if not evidence or evidence in {"N/A", "None"}:
+        return html_text
+    # Working on the raw HTML string directly – obtain spans first
+    spans = localize_evidence(html_text, evidence, n=n, overlap_threshold=overlap_threshold)
+    if not spans:
+        # Retry with the escaped form; skipped when escaping changes nothing
+        escaped_evidence = html.escape(evidence, quote=False)
+        if escaped_evidence != evidence:
+            spans = localize_evidence(
+                html_text, escaped_evidence, n=n, overlap_threshold=overlap_threshold
+            )
+    if not spans:
+        return html_text  # nothing to highlight
+    highlighted = _insert_tags(html_text, spans, tag="mark")
+    # Inject tiny CSS once – id attribute prevents duplicates
+    if "evidence-highlight-style" in html_text:
+        return highlighted
+    style_block = (
+        "<style id=\"evidence-highlight-style\">\n"
+        "mark { background: #fffd6b; font-weight: 600; padding: 0 2px; border-radius: 2px; }\n"
+        "</style>\n"
+    )
+    return style_block + highlighted

lmmvibes/vis_gradio/conversation_display.py CHANGED Viewed

@@ -121,7 +121,7 @@ def pretty_print_embedded_dicts(text: str) -> str:
             new_parts.append(html.escape(text[last_idx:start], quote=False))
             pretty = json.dumps(parsed, indent=2, ensure_ascii=False)
             new_parts.append(
-                f"<pre style='background:#f8f9fa;padding:10px;border-radius:4px;overflow-x:auto;'>{pretty}</pre>"
             )
             last_idx = end
     new_parts.append(html.escape(text[last_idx:], quote=False))

             new_parts.append(html.escape(text[last_idx:start], quote=False))
             pretty = json.dumps(parsed, indent=2, ensure_ascii=False)
             new_parts.append(
+                f"<span style='white-space: pre-wrap; font-family: monospace;'>{html.escape(pretty, quote=False)}</span>"
             )
             last_idx = end
     new_parts.append(html.escape(text[last_idx:], quote=False))

lmmvibes/vis_gradio/utils.py CHANGED Viewed

@@ -14,6 +14,9 @@ from typing import Dict, List, Any, Optional, Tuple
 import html
 import ast
 # Conversation rendering helpers are now in a dedicated module for clarity
 from . import conversation_display as _convdisp
 from .conversation_display import (
@@ -1524,6 +1527,10 @@ def format_examples_display(examples: List[Dict[str, Any]],
             else:
                 conversation_html = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"
         # Determine cluster info
         cluster_info = ""
         if example['fine_cluster_label'] != 'N/A':

 import html
 import ast
+# Evidence highlighter
+from lmmvibes.examples_helpers import highlight_html_with_evidence
 # Conversation rendering helpers are now in a dedicated module for clarity
 from . import conversation_display as _convdisp
 from .conversation_display import (
             else:
                 conversation_html = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"
+        # Highlight evidence (if any) inside the conversation HTML
+        if example.get("evidence") not in ["N/A", None, "None"]:
+            conversation_html = highlight_html_with_evidence(conversation_html, example["evidence"])
         # Determine cluster info
         cluster_info = ""
         if example['fine_cluster_label'] != 'N/A':