# -*- coding: utf-8 -*-
"""Pibit.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Lug_ARdBOTu2e87sThol1luVksA0986T
"""

import re
import json
import datetime
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set, Optional
import unicodedata
import math
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

class PibitInsuranceTokenizer:
    """
    Specialized tokenizer for insurance domain documents, designed for Pibit.ai's
    underwriting automation platform. Handles loss run documents, policy documents,
    claims data, and other insurance-specific text processing needs.
    """

    def __init__(self, vocab_size=15000, model_type="insurance_bpe"):
        self.vocab_size = vocab_size
        self.model_type = model_type

        self.special_tokens = [
            "<PAD>", "<UNK>", "<START>", "<END>", "<MASK>",
            "<CLAIM>", "<POLICY>", "<AMOUNT>", "<DATE>", "<RISK>",
            "<COVERAGE>", "<DEDUCTIBLE>", "<PREMIUM>", "<LOSS>",
            "<UNDERWRITER>", "<CARRIER>", "<INSURED>", "<PERCENTAGE>"
        ]

        self.vocab = {}
        self.id_to_token = {}
        self.token_frequencies = Counter()
        self.merges = []
        self.bpe_ranks = {}

        # COMPUTATION-LIGHT: Compiling regex patterns once at initialization is highly efficient.
        # This avoids re-compiling the same pattern for every function call.
        self.insurance_patterns = self._load_insurance_patterns()
        self.financial_pattern = re.compile(r'\$[\d,]+(?:\.\d{2})?')
        self.policy_pattern = re.compile(r'[A-Z]{2,4}[\d\-]{6,12}')
        self.date_pattern = re.compile(r'\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}|\d{4}[\/\-]\d{1,2}[\/\-]\d{1,2}')
        self.percentage_pattern = re.compile(r'\d+(?:\.\d+)?%')

        self._initialize_special_tokens()

    def _load_insurance_patterns(self) -> Dict[str, List[str]]:
        """Load insurance domain-specific patterns and terminology."""
        return {
            'coverage_types': ['general liability', 'workers compensation', 'property coverage', 'commercial auto', 'cyber liability', 'professional liability', 'directors officers', 'employment practices', 'umbrella coverage', 'commercial crime', 'boiler machinery', 'builders risk'],
            'claim_types': ['bodily injury', 'property damage', 'medical payments', 'personal injury', 'products liability', 'completed operations', 'fire damage', 'theft', 'vandalism', 'water damage', 'slip and fall', 'motor vehicle accident', 'workplace injury'],
            'risk_factors': ['hazardous materials', 'high risk activity', 'prior claims', 'safety violations', 'regulatory issues', 'financial distress', 'industry classification', 'geographic risk', 'seasonal business', 'new venture', 'construction defects', 'product recall'],
            'financial_terms': ['deductible', 'premium', 'limit', 'retention', 'aggregate', 'occurrence', 'claims made', 'prior acts', 'extended reporting', 'loss ratio', 'experience modification', 'rate', 'exposure'],
            'underwriting_terms': ['risk assessment', 'loss run', 'acord forms', 'submission', 'renewal', 'policy period', 'effective date', 'expiration', 'carrier', 'excess', 'reinsurance', 'facultative', 'treaty', 'reserve', 'incurred', 'paid', 'outstanding', 'ibnr']
        }

    def _initialize_special_tokens(self):
        """Initialize special tokens in vocabulary."""
        for i, token in enumerate(self.special_tokens):
            self.vocab[token] = i
            self.id_to_token[i] = token

    def _preprocess_text(self, text: str) -> str:
        """
        Insurance-specific text preprocessing.
        Normalizes financial amounts, dates, policy numbers, and other entities.
        """
        # COMPUTATION-LIGHT: Unicode normalization and regex substitutions are very fast C-level operations.
        text = unicodedata.normalize('NFKC', text)
        text = self.financial_pattern.sub('<AMOUNT>', text)
        text = self.date_pattern.sub('<DATE>', text)
        text = self.policy_pattern.sub('<POLICY>', text)
        text = self.percentage_pattern.sub('<PERCENTAGE>', text)
        text = self._normalize_insurance_terms(text)
        return text.strip()

    def _normalize_insurance_terms(self, text: str) -> str:
        """Normalize insurance-specific terminology."""
        abbreviations = {
            r'\bGL\b': 'general liability', r'\bWC\b': 'workers compensation', r'\bAuto\b': 'automobile',
            r'\bD&O\b': 'directors officers', r'\bE&O\b': 'errors omissions', r'\bEPLI\b': 'employment practices liability',
            r'\bBI\b': 'bodily injury', r'\bPD\b': 'property damage', r'\bMP\b': 'medical payments',
            r'\bTPA\b': 'third party administrator', r'\bMGA\b': 'managing general agent', r'\bACV\b': 'actual cash value',
            r'\bRCV\b': 'replacement cost value'
        }
        for abbrev, full_form in abbreviations.items():
            text = re.sub(abbrev, full_form, text, flags=re.IGNORECASE)
        return text

    def _extract_insurance_entities(self, text: str) -> List[Tuple[str, str]]:
        """Extract insurance-specific entities from text."""
        # COMPUTATION-LIGHT: Finding all matches with `finditer` is highly optimized.
        entities = []
        for match in self.financial_pattern.finditer(text): entities.append(('AMOUNT', match.group()))
        for match in self.date_pattern.finditer(text): entities.append(('DATE', match.group()))
        for match in self.policy_pattern.finditer(text): entities.append(('POLICY', match.group()))
        for match in self.percentage_pattern.finditer(text): entities.append(('PERCENTAGE', match.group()))
        return entities

    def _tokenize_with_domain_awareness(self, text: str) -> List[str]:
        """
        Domain-aware tokenization that preserves insurance terminology.
        """
        # COMPUTATION-LIGHT: A single pass with regex to get initial tokens.
        # Placeholders such as <AMOUNT> inserted during preprocessing are kept intact
        # instead of being split into '<', 'amount', '>'.
        word_pattern = r"<[A-Z]+>|[a-zA-Z]+(?:'[a-zA-Z]+)?|[0-9]+(?:\.[0-9]+)?|[^\w\s]"
        tokens = [t if t.startswith('<') and t.endswith('>') else t.lower()
                  for t in re.findall(word_pattern, text)]

        # COMPUTATION-LIGHT: A single while loop to merge compound terms. Its complexity is linear O(n) with respect to the number of tokens.
        merged_tokens = []
        i = 0
        while i < len(tokens):
            found_compound = False
            for length in [3, 2]:
                if i + length <= len(tokens):
                    candidate = ' '.join(tokens[i:i+length])
                    for category, terms in self.insurance_patterns.items():
                        if candidate in terms:
                            merged_tokens.append(candidate.replace(' ', '_'))
                            i += length
                            found_compound = True
                            break
                    if found_compound: break
            if not found_compound:
                merged_tokens.append(tokens[i])
                i += 1
        return merged_tokens

    def _get_word_frequencies_insurance(self, texts: List[str]) -> Dict[str, int]:
        """Get word frequencies with insurance domain emphasis."""
        # COMPUTATION-LIGHT: Dictionary lookups and updates are very fast, close to constant time O(1) on average.
        word_freqs = defaultdict(int)
        for text in texts:
            preprocessed_text = self._preprocess_text(text)
            tokens = self._tokenize_with_domain_awareness(preprocessed_text)
            for token in tokens:
                token_chars = ' '.join(list(token)) + ' </w>'
                word_freqs[token_chars] += 1
                if self._is_insurance_term(token):
                    word_freqs[token_chars] += 2
        return word_freqs

    def _is_insurance_term(self, token: str) -> bool:
        """Check if token is an insurance-specific term."""
        token_lower = token.lower().replace('_', ' ')
        for category, terms in self.insurance_patterns.items():
            if token_lower in terms: return True
        insurance_keywords = {'claim', 'policy', 'premium', 'deductible', 'coverage', 'liability', 'underwrite', 'insured', 'carrier', 'risk', 'loss', 'damage', 'accident', 'incident', 'hazard', 'peril', 'exposure', 'limit'}
        return token_lower in insurance_keywords

    def _get_pairs(self, word: List[str]) -> Set[Tuple[str, str]]:
        """Get all adjacent symbol pairs from a word given as a list of symbols."""
        pairs = set()
        prev_char = word[0]
        for char in word[1:]:
            pairs.add((prev_char, char))
            prev_char = char
        return pairs

    def _merge_word(self, word: str, pair: Tuple[str, str]) -> str:
        """Merge every occurrence of a symbol pair in a space-separated word."""
        # Lookarounds restrict the merge to whole adjacent symbols; a plain str.replace
        # could otherwise glue the tail of one symbol onto the head of the next.
        pattern = re.compile(r'(?<!\S)' + re.escape(pair[0]) + r' ' + re.escape(pair[1]) + r'(?!\S)')
        return pattern.sub(lambda _: pair[0] + pair[1], word)
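    # Illustrative merge (assumed example):
    #   _merge_word('c l a i m </w>', ('a', 'i')) -> 'c l ai m </w>'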

    def _train_insurance_bpe(self, texts: List[str]) -> None:
        """
        # COMPUTATION-HEAVY: This is the most intensive part of the code.
        # BPE training involves multiple loops over the vocabulary and pairs, which can be slow,
        # especially as the vocabulary and number of merges grow.
        # This should only be run ONCE during setup, not during user interaction.
        """
        word_freqs = self._get_word_frequencies_insurance(texts)
        vocab = set()
        for word in word_freqs.keys(): vocab.update(word.split())
        for category, terms in self.insurance_patterns.items():
            for term in terms: vocab.add(term.replace(' ', '_'))

        num_merges = self.vocab_size - len(self.vocab) - len(vocab)

        for merge_idx in range(num_merges):
            pairs = defaultdict(int)
            for word, freq in word_freqs.items():
                word_pairs = self._get_pairs(word.split())
                for pair in word_pairs:
                    pairs[pair] += freq
            if not pairs: break
            best_pair = max(pairs, key=pairs.get)

            new_word_freqs = {}
            for word, freq in word_freqs.items():
                new_word = self._merge_word(word, best_pair)
                new_word_freqs[new_word] = freq
            word_freqs = new_word_freqs

            self.merges.append(best_pair)
            self.bpe_ranks[best_pair] = merge_idx
            merged_token = best_pair[0] + best_pair[1]
            vocab.add(merged_token)

    def _apply_bpe(self, word: str) -> List[str]:
        """Apply BPE merges to a word."""
        if len(word) == 1: return [word]
        word_tokens = list(word)
        word = ' '.join(word_tokens) + ' </w>'

        while True:
            pairs = self._get_pairs(word.split())
            if not pairs: break

            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks: break

            word = self._merge_word(word, bigram)

        return word.split()

    def train(self, texts: List[str]) -> None:
        """Train the insurance domain tokenizer."""
        if self.model_type == "insurance_bpe":
            self._train_insurance_bpe(texts)
            all_tokens = set()
            for text in texts:
                preprocessed = self._preprocess_text(text)
                tokens = self._tokenize_with_domain_awareness(preprocessed)
                for token in tokens:
                    bpe_tokens = self._apply_bpe(token)
                    all_tokens.update(bpe_tokens)
        else:
            all_tokens = set()
            for text in texts:
                preprocessed = self._preprocess_text(text)
                tokens = self._tokenize_with_domain_awareness(preprocessed)
                all_tokens.update(tokens)
                for token in tokens:
                    self.token_frequencies[token] += 3 if self._is_insurance_term(token) else 1

        if len(all_tokens) > self.vocab_size - len(self.special_tokens):
            if self.model_type != "insurance_bpe":
                insurance_terms = [t for t in all_tokens if self._is_insurance_term(t)]
                other_terms = sorted([t for t in all_tokens if not self._is_insurance_term(t)], key=lambda x: self.token_frequencies[x], reverse=True)
                max_others = self.vocab_size - len(self.special_tokens) - len(insurance_terms)
                all_tokens = insurance_terms + other_terms[:max_others]

        start_idx = len(self.special_tokens)
        for i, token in enumerate(sorted(list(all_tokens))):
            token_id = start_idx + i
            self.vocab[token] = token_id
            self.id_to_token[token_id] = token

    def tokenize(self, text: str) -> List[str]:
        """Tokenize insurance document text."""
        preprocessed = self._preprocess_text(text)
        tokens = self._tokenize_with_domain_awareness(preprocessed)
        if self.model_type == "insurance_bpe":
            result = []
            for token in tokens:
                if token in self.vocab: result.append(token)
                else: result.extend(self._apply_bpe(token))
            return result
        return tokens

    def encode(self, text: str) -> List[int]:
        """Encode text to token IDs."""
        tokens = self.tokenize(text)
        return [self.vocab.get(token, self.vocab["<UNK>"]) for token in tokens]

    def decode(self, token_ids: List[int]) -> str:
        """Decode token IDs back to text."""
        tokens = [self.id_to_token[tid] for tid in token_ids if tid in self.id_to_token]
        text = ' '.join(tokens).replace(' </w>', '').replace('_', ' ')
        # Clean up special tokens that shouldn't be in the final text
        for special in self.special_tokens:
            if special not in ["<AMOUNT>", "<DATE>", "<POLICY>", "<PERCENTAGE>"]:
                text = text.replace(special, '')
        return text.strip()

    def analyze_document(self, text: str) -> Dict:
        """Analyze insurance document and extract key information."""
        # COMPUTATION-LIGHT: Analysis is fast as it reuses efficient tokenization and regex methods.
        entities = self._extract_insurance_entities(text)
        tokens = self.tokenize(text)
        if not tokens: return {'document_type': 'Unknown', 'total_tokens': 0, 'insurance_terms': 0, 'insurance_term_ratio': 0, 'entities': [], 'key_terms': [], 'risk_score': 0, 'confidence': 0}

        insurance_term_count = sum(1 for token in tokens if self._is_insurance_term(token))
        doc_type = self._identify_document_type(text, tokens)
        risk_score = self._calculate_risk_score(text, tokens, entities)

        return {
            'document_type': doc_type, 'total_tokens': len(tokens),
            'insurance_terms': insurance_term_count, 'insurance_term_ratio': insurance_term_count / len(tokens),
            'entities': entities, 'key_terms': list(set([t for t in tokens if self._is_insurance_term(t)]))[:20],
            'risk_score': risk_score, 'confidence': min(0.95, insurance_term_count / len(tokens) * 2)
        }

    def _calculate_risk_score(self, text: str, tokens: List[str], entities: List[Tuple[str, str]]) -> float:
        """Calculate risk score based on document content."""
        risk_score = 0.5
        high_risk_terms = ['prior claims', 'safety violations', 'hazardous materials', 'high risk activity', 'regulatory issues', 'financial distress']
        for term in high_risk_terms:
            if term.replace(' ', '_') in tokens or term in text.lower(): risk_score += 0.1

        amounts = [float(re.sub(r'[\$,]', '', entity[1])) for entity in entities if entity[0] == 'AMOUNT' and re.sub(r'[\$,]', '', entity[1]).replace('.','',1).isdigit()]
        if amounts:
            max_amount = max(amounts)
            if max_amount > 1000000: risk_score += 0.2
            elif max_amount > 100000: risk_score += 0.1
        return min(1.0, max(0.0, risk_score))

    def _identify_document_type(self, text: str, tokens: List[str]) -> str:
        """Identify the type of insurance document."""
        doc_indicators = {
            'loss_run': ['loss_run', 'claims_history', 'paid_losses', 'incurred', 'reserves', 'loss_ratio'],
            'policy': ['policy_number', 'effective_date', 'expiration_date', 'coverage_limit', 'policy_period'],
            'claim': ['claim_number', 'date_of_loss', 'claimant', 'adjuster', 'claim_details'],
            'submission': ['submission', 'application', 'proposal', 'quote_request', 'underwriting'],
            'certificate': ['certificate', 'evidence_of_coverage', 'additional_insured']
        }
        scores = {doc_type: sum(2 if ind.replace(' ', '_') in tokens else 1 if ind.replace('_', ' ') in text.lower() else 0 for ind in indicators) for doc_type, indicators in doc_indicators.items()}
        if not scores or max(scores.values()) == 0: return 'general_insurance'
        return max(scores, key=scores.get)

    def get_vocab_size(self) -> int:
        return len(self.vocab)

# --- SINGLE GLOBAL INSTANCE ---
# The tokenizer is created and trained only ONCE when the script starts.
# All functions will now use this single, pre-trained instance.
print("Initializing and training the Pibit.ai Insurance Tokenizer... Please wait.")
tokenizer = PibitInsuranceTokenizer(vocab_size=8000, model_type="insurance_bpe")

# Default training documents
default_training_docs = [
    "Loss Run Report for Policy GL2024-001234. Effective Date: 01/01/2024 to 12/31/2024. Insured: ABC Manufacturing Company. Coverage: General Liability, $1,000,000 per occurrence, $2,000,000 aggregate. Claims History: Claim 1: Slip and fall incident, Date of Loss: 03/15/2024, Paid: $25,000, Incurred: $45,000, Status: Closed. Claim 2: Product liability claim, Date of Loss: 07/22/2024, Paid: $0, Incurred: $75,000, Status: Open, Reserves: $75,000. Total Paid Losses: $25,000. Total Incurred Losses: $120,000. Loss Ratio: 12%. Experience Modification Factor: 0.85",
    "Workers Compensation Submission. Applicant: XYZ Construction LLC. Policy Period: 01/01/2025 to 01/01/2026. Industry Code: 5645 - Residential Construction. Annual Payroll: $2,500,000. Prior Coverage: Carrier ABC, Premium: $125,000, Deductible: $5,000. Claims Experience: 3 claims in past 3 years. Workplace injury, back strain: $15,000 paid. Fall from height accident: $85,000 incurred. Repetitive motion injury: $12,000 paid. Risk Factors: High-risk construction activities, prior OSHA violations. Underwriting Notes: Requires safety program implementation.",
    "Commercial Property Loss Notice. Policy Number: CP2024-567890. Insured: Downtown Office Building LLC. Date of Loss: 11/08/2024. Cause of Loss: Water damage from burst pipe. Coverage Details: Building Coverage: $5,000,000. Business Personal Property: $500,000. Deductible: $10,000. Claim Details: Estimated Repair Cost: $125,000. Business Interruption Loss: $25,000. Adjuster: John Smith, License #12345. Initial Reserve: $150,000.",
    "Underwriting Risk Assessment Report. Account: Tech Startup Solutions Inc. Line of Business: Cyber Liability Insurance. Requested Limits: $5,000,000 per claim, $10,000,000 aggregate. Risk Factors Analysis: Industry: Technology/Software Development. Revenue: $15,000,000 annually. Security Measures: Multi-factor authentication: Yes. Encryption protocols: AES-256. Employee training: Quarterly. Incident response plan: In place. Prior Claims: None reported. Competitive Premium Quote: $45,000 annually. Recommended Deductible: $25,000."
]
tokenizer.train(default_training_docs)
print("Tokenizer is ready!")

# --- Gradio App Functions ---

def create_analysis_plots(analysis_data):
    """Create visualization plots for document analysis."""
    fig_gauge = go.Figure(go.Indicator(
        mode = "gauge+number", value = analysis_data['risk_score'] * 100,
        domain = {'x': [0, 1], 'y': [0, 1]}, title = {'text': "Risk Score"},
        gauge = {'axis': {'range': [None, 100]}, 'bar': {'color': "#2E86AB"},
                 'steps': [{'range': [0, 40], 'color': "lightgreen"}, {'range': [40, 70], 'color': "yellow"}, {'range': [70, 100], 'color': "lightcoral"}]}))
    fig_gauge.update_layout(height=300, margin=dict(l=20, r=20, t=50, b=20))

    insurance_tokens = analysis_data['insurance_terms']
    other_tokens = analysis_data['total_tokens'] - insurance_tokens
    fig_pie = px.pie(values=[insurance_tokens, other_tokens], names=['Insurance Terms', 'Other Terms'], title='Token Distribution', color_discrete_sequence=['#FF6B6B', '#4ECDC4'])
    fig_pie.update_layout(height=300, margin=dict(l=20, r=20, t=50, b=20))
    return fig_gauge, fig_pie

def analyze_insurance_document(text):
    """
    Main function to analyze insurance documents.
    This now uses the single, globally-trained tokenizer and is very fast.
    """
    if not text.strip():
        return "Please enter some text to analyze.", go.Figure(), go.Figure(), pd.DataFrame(), ""

    # The core change: No more retraining! Just analyze.
    analysis = tokenizer.analyze_document(text)

    summary = f"""
## 📊 Pibit.ai Insurance Document Analysis Report
### 🏒 Document Classification
- **Document Type**: {analysis['document_type'].title().replace('_', ' ')}
- **Analysis Confidence**: {analysis['confidence']:.1%}
### 📈 Token Analysis
- **Total Tokens**: {analysis['total_tokens']:,}
- **Insurance-Specific Terms**: {analysis['insurance_terms']:,}
- **Domain Relevance**: {analysis['insurance_term_ratio']:.1%}
### ⚠️ Risk Assessment
- **Risk Score**: {analysis['risk_score']:.2f} / 1.00
- **Risk Level**: {"🔴 HIGH" if analysis['risk_score'] > 0.7 else "🟡 MEDIUM" if analysis['risk_score'] > 0.4 else "🟢 LOW"}
### 🏷️ Entities Detected
{len(analysis['entities'])} entities found:
"""
    for entity_type, entity_value in analysis['entities'][:10]:
        summary += f"- **{entity_type}**: {entity_value}\n"
    if len(analysis['entities']) > 10:
        summary += f"- ... and {len(analysis['entities']) - 10} more\n"

    summary += f"\n### πŸ”‘ Key Insurance Terms\n"
    summary += ", ".join([f"`{term.replace('_', ' ')}`" for term in analysis['key_terms']])

    fig_gauge, fig_pie = create_analysis_plots(analysis)
    entities_df = pd.DataFrame(analysis['entities'], columns=['Entity Type', 'Value'])
    tokens = tokenizer.tokenize(text[:500])
    tokenization_example = f"**Sample Tokenization** (first 500 characters):\n\n{' | '.join(tokens[:20])}"
    if len(tokens) > 20:
        tokenization_example += f" | ... ({len(tokens)} total tokens)"

    return summary, fig_gauge, fig_pie, entities_df, tokenization_example

def tokenize_text(text):
    """Tokenize text and return tokens."""
    if not text.strip(): return "Please enter some text to tokenize."
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.encode(text)
    result = f"**Tokens ({len(tokens)}):**\n{' | '.join(tokens)}\n\n**Token IDs:**\n{' '.join(map(str, token_ids[:50]))}"
    if len(token_ids) > 50: result += f" ... ({len(token_ids)} total IDs)"
    return result

def get_tokenizer_stats():
    """Get tokenizer statistics."""
    vocab_size = tokenizer.get_vocab_size()
    insurance_terms = sum(1 for token in tokenizer.vocab.keys() if tokenizer._is_insurance_term(token))
    return f"""
## 🔧 Pibit.ai Insurance Tokenizer Statistics
- **Total Vocabulary Size**: {vocab_size:,}
- **Insurance-Specific Terms**: {insurance_terms:,}
- **Special Tokens**: {len(tokenizer.special_tokens)}
- **Model Type**: {tokenizer.model_type}
"""

# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="Pibit.ai Insurance Tokenizer") as demo:
    gr.HTML("""<div style="text-align: center; padding: 20px;"><h1 style="color: #2E86AB;">🏒 Pibit.ai Insurance Tokenizer</h1><p style="font-size: 18px; color: #666;">Specialized NLP Tokenizer for Property & Casualty Insurance Documents</p></div>""")

    with gr.Tabs():
        with gr.Tab("πŸ“Š Document Analysis"):
            with gr.Row():
                with gr.Column(scale=2):
                    input_text = gr.Textbox(lines=15, placeholder="Paste your insurance document here...", label="📄 Insurance Document Text")
                    analyze_btn = gr.Button("🔍 Analyze Document", variant="primary", size="lg")
                with gr.Column(scale=3):
                    analysis_output = gr.Markdown(label="📋 Analysis Report")
                    with gr.Row():
                        risk_gauge = gr.Plot(label="⚠️ Risk Assessment")
                        token_pie = gr.Plot(label="🥧 Token Distribution")
                    entities_table = gr.DataFrame(label="🏷️ Detected Entities")
                    tokenization_sample = gr.Markdown(label="🔧 Tokenization Sample")

            # The custom_training input has been removed to fix the performance issue.
            analyze_btn.click(analyze_insurance_document, inputs=[input_text], outputs=[analysis_output, risk_gauge, token_pie, entities_table, tokenization_sample])

            gr.Examples(
                examples=[
                    ["Loss Run Report for Policy GL2024-001234\nEffective Date: 01/01/2024 to 12/31/2024\nInsured: ABC Manufacturing Company\nCoverage: General Liability, $1,000,000 per occurrence, $2,000,000 aggregate\nClaims History:\nClaim 1: Slip and fall incident, Date of Loss: 03/15/2024, Paid: $25,000, Incurred: $45,000, Status: Closed"],
                    ["Workers Compensation Submission\nApplicant: XYZ Construction LLC\nPolicy Period: 01/01/2025 to 01/01/2026\nAnnual Payroll: $2,500,000\nRisk Factors: High-risk construction activities, prior OSHA violations"],
                    ["Underwriting Risk Assessment Report\nAccount: Tech Startup Solutions Inc.\nLine of Business: Cyber Liability Insurance\nRequested Limits: $5,000,000 per claim\nSecurity Measures:\n- Multi-factor authentication: Yes\n- Incident response plan: In place\nPrior Claims: None reported"],
                ],
                inputs=input_text
            )

        with gr.Tab("πŸ”§ Tokenization Tool"):
            with gr.Row():
                with gr.Column():
                    tokenize_input = gr.Textbox(lines=8, placeholder="Enter text to tokenize...", label="📝 Text to Tokenize")
                    tokenize_btn = gr.Button("🔧 Tokenize", variant="primary")
                with gr.Column():
                    tokenize_output = gr.Markdown(label="🎯 Tokenization Results")
            tokenize_btn.click(tokenize_text, inputs=tokenize_input, outputs=tokenize_output)

        with gr.Tab("ℹ️ Tokenizer Info"):
            tokenizer_info = gr.Markdown()
            demo.load(get_tokenizer_stats, inputs=None, outputs=tokenizer_info)

if __name__ == "__main__":
    demo.launch(debug=True)