Spaces:
Runtime error
Runtime error
| import re | |
| import gradio as gr | |
| from gliner import GLiNER | |
| from cerberus import Validator | |
# ----------------------------------------------------------------------------
# Load model + labels
# ----------------------------------------------------------------------------
# The GLiNER checkpoint is loaded once at import time and shared by all calls.
model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

# labels.txt is read line by line, each non-blank line becoming one label.
# Blank or whitespace-only lines are skipped so that an empty string is never
# handed to the model as a label.
with open("labels.txt", "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f if line.strip()]
# ----------------------------------------------------------------------------
# Simple Cerberus validation for incoming data
# ----------------------------------------------------------------------------
# Incoming data must be a dict carrying a non-empty string under "text".
schema = {
    "text": {
        "type": "string",
        # Cerberus treats fields as optional unless told otherwise.  Without
        # this rule a document that lacks "text" validates successfully and
        # the later data["text"] lookup fails with KeyError instead of a
        # validation error.
        "required": True,
        "empty": False,
    }
}
# Single validator instance, built from the schema, reused for every request.
validator = Validator(schema)


def validate_input(data: dict) -> str:
    """Check *data* against the schema and return its ``"text"`` value.

    Args:
        data: The document to validate.

    Returns:
        The value stored under ``data["text"]``.

    Raises:
        ValueError: If the validator rejects *data*.  The message carries
            the validator's error report.
    """
    if validator.validate(data):
        return data["text"]
    # Validation failed: report the validator's own findings to the caller.
    raise ValueError(f"Invalid input data. Errors: {validator.errors}")
# ----------------------------------------------------------------------------
# Core anonymize / de-anonymize logic
# ----------------------------------------------------------------------------
def anonymize_text(text, threshold=0.2):
    """Replace detected PII in *text* with ``<PII_LABEL_INDEX>`` placeholders.

    Args:
        text: The text to anonymize.
        threshold: Confidence threshold forwarded to
            ``model.predict_entities``.  Defaults to 0.2.

    Returns:
        A tuple ``(anonymized_text, entity_map)``.  ``entity_map`` maps each
        normalised label to the list of distinct original strings replaced
        under that label, e.g. ``{'PERSON': ['Alice', 'Bob']}``.  The 1-based
        position of a string in its list is the index used in its
        placeholder, so a repeated string always gets the same placeholder.
    """
    entities = model.predict_entities(text, labels=labels, threshold=threshold)

    entity_map = {}
    pieces = []
    cursor = 0  # End offset of the last span that was replaced.

    # Walk the spans left to right so the untouched text between them can be
    # copied across in order.  sorted() leaves the model's list unmodified.
    for entity in sorted(entities, key=lambda e: e['start']):
        start_idx, end_idx = entity['start'], entity['end']
        if start_idx < cursor:
            # This span overlaps one that has already been replaced.
            # Processing it could move the cursor backwards and duplicate
            # text, so it is skipped.
            continue

        # Every non-word character (spaces, apostrophes, hyphens, ...) becomes
        # "_" so the placeholder consists of word characters only and can
        # always be parsed back when de-anonymizing.
        label = re.sub(r"\W", "_", entity['label']).upper()
        original_text = entity['text']

        # Reuse the existing index when this exact string was seen before.
        originals = entity_map.setdefault(label, [])
        if original_text not in originals:
            originals.append(original_text)
        idx = originals.index(original_text) + 1

        pieces.append(text[cursor:start_idx])  # Text before this entity.
        pieces.append(f"<PII_{label}_{idx}>")
        cursor = end_idx

    # Remainder of the text after the last replaced entity.
    pieces.append(text[cursor:])
    return "".join(pieces), entity_map
# Placeholder syntax: <PII_LABEL_INDEX>.  The label may contain any character
# except angle brackets, so labels with punctuation (for example an
# apostrophe) are recognised too.  The trailing digits are the 1-based index.
_PLACEHOLDER_RE = re.compile(r"<PII_([^<>]+)_(\d+)>")


def deanonymize_text(anonymized_response, entity_map):
    """Restore the original strings behind ``<PII_LABEL_INDEX>`` placeholders.

    Args:
        anonymized_response: Text that may contain placeholders.
        entity_map: Mapping of label to the list of original strings, where
            placeholder index ``N`` refers to list position ``N - 1``.  May be
            ``None`` or empty, in which case the text is returned unchanged.

    Returns:
        The text with every resolvable placeholder replaced.  Placeholders
        whose label is unknown or whose index is out of range are left as-is.
    """
    if not entity_map:
        return anonymized_response

    def replace_match(match):
        label = match.group(1)           # e.g. "PERSON"
        idx = int(match.group(2)) - 1    # 1-based index -> 0-based position
        originals = entity_map.get(label)
        if originals is not None and 0 <= idx < len(originals):
            return originals[idx]
        # Unknown label or index out of range: keep the placeholder verbatim.
        return match.group(0)

    # A callable replacement is inserted literally, so backslashes inside the
    # original strings are not interpreted as regex escapes.
    return _PLACEHOLDER_RE.sub(replace_match, anonymized_response)
# ----------------------------------------------------------------------------
# Gradio Interface
# ----------------------------------------------------------------------------
def anonymize_fn(original_text):
    """Gradio callback: validate the input text and anonymize it.

    Returns:
        A 3-tuple ``(anonymized_text, entity_map, status_message)``.  When
        validation fails, the first two items are ``""`` and ``{}`` and the
        status message describes the validation error.
    """
    try:
        # The validator works on documents, so wrap the raw text in a dict.
        user_text = validate_input({"text": original_text})
    except ValueError as err:
        # Show the validation problem in the status output.
        return "", {}, f"Validation error: {err!s}"
    else:
        anonymized, entities = anonymize_text(user_text)
        return anonymized, entities, "Anonymized successfully!"
def deanonymize_fn(anonymized_llm_response, entity_map):
    """Gradio callback: restore original strings in an anonymized response.

    Args:
        anonymized_llm_response: Text containing placeholders.  ``None`` is
            treated like an empty string.
        entity_map: The entity map produced when the text was anonymized.

    Returns:
        A 2-tuple ``(deanonymized_text, status_message)``.  The text is ``""``
        when the response is missing/blank or when no entity map is available.
    """
    # Guard against None as well as blank input; calling .strip() on None
    # would raise AttributeError.
    if not (anonymized_llm_response or "").strip():
        return "", "Please provide an anonymized LLM response."
    if not entity_map:
        return "", "No entity map found; anonymize some text first."
    result = deanonymize_text(anonymized_llm_response, entity_map)
    return result, "De-anonymized successfully!"
# Markdown shown at the top of the page.
md_text = """# Анонимизатор психотерапевтических сессий
Вставьте текст в раздел "Исходный текст", чтобы анонимизировать сензитивные данные.
"""

# NOTE(review): the nesting of the layout blocks below was reconstructed from
# the order of the statements; confirm it against the intended page layout.
with gr.Blocks() as demo:
    gr.Markdown(md_text)
    with gr.Row():
        # First column: anonymize the source text.
        with gr.Column():
            original_text = gr.Textbox(
                lines=6, label="Исходный текст (анонимизировать)"
            )
            anonymized_text = gr.Textbox(
                lines=6, label="Анонимизированный текст", interactive=False
            )
            button_anon = gr.Button("Анонимизировать")
            # Hidden state to store the entity map between the two actions.
            entity_map_state = gr.State()
            message_out = gr.Textbox(label="Status", interactive=False)
            button_anon.click(
                anonymize_fn,
                inputs=[original_text],
                outputs=[anonymized_text, entity_map_state, message_out],
            )
        # Second column: de-anonymize a pasted response using the stored map.
        with gr.Column():
            # Label fixed: the original had an unbalanced doubled ")".
            anonymized_llm_response = gr.Textbox(
                lines=6, label="Анонимизированная сессия (вставить)"
            )
            deanonymized_text = gr.Textbox(
                lines=6, label="Де-анонимизированная сессия", interactive=False
            )
            button_deanon = gr.Button("Деанонимизировать")
            message_out_de = gr.Textbox(label="Status", interactive=False)
            button_deanon.click(
                deanonymize_fn,
                inputs=[anonymized_llm_response, entity_map_state],
                outputs=[deanonymized_text, message_out_de],
            )

if __name__ == "__main__":
    demo.launch()