from transformers import pipeline
from keybert import KeyBERT
from summarizer import Summarizer
from nltk.corpus import stopwords
import re
import nltk

nltk.download('punkt')
nltk.download('stopwords')  # needed for the Indonesian stop word list below


class TextProcessor:
    def __init__(self):
        # Initialize summarization model
        self.summarizer = Summarizer('bert-base-multilingual-cased')

        # Initialize KeyBERT for keyword extraction
        self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')

        # Initialize NER for action item detection
        self.ner_pipeline = pipeline(
            "ner",
            model="cahya/bert-base-indonesian-NER",
            aggregation_strategy="simple"
        )

        # Indonesian stop words for KeyBERT. scikit-learn's vectorizer only
        # ships an English list, so stop_words='indonesian' would raise a
        # ValueError; pass NLTK's Indonesian list explicitly instead.
        self.stop_words = stopwords.words('indonesian')

        # Action item patterns
        self.action_patterns = [
            r"akan\s+(\w+)",    # "will ..."
            r"harus\s+(\w+)",   # "must ..."
            r"perlu\s+(\w+)",   # "needs to ..."
            r"mohon\s+(\w+)",   # "please ..."
            r"tolong\s+(\w+)",  # "please (help) ..."
            r"segera\s+(\w+)",  # "immediately ..."
            r"follow\s*up",
            r"action\s*item",
            r"to\s*do",
            r"deadline"
        ]

        # Decision patterns
        self.decision_patterns = [
            r"(diputuskan|memutuskan)\s+(.+)",   # "(it was) decided ..."
            r"(disepakati|menyepakati)\s+(.+)",  # "(it was) agreed ..."
            r"(setuju|persetujuan)\s+(.+)",      # "agree / agreement ..."
            r"keputusan(?:nya)?\s+(.+)",         # "the decision (is) ..."
            r"final(?:isasi)?\s+(.+)"            # "final(ize) ..."
        ]

    def summarize_transcript(self, transcript_segments, ratio=0.3):
        """
        Hierarchical summarization for long transcripts.
        """
        # Concatenate the text of all segments
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        # Chunk long documents
        chunks = self._create_chunks(full_text)

        if len(chunks) == 1:
            # Direct summarization for short documents.
            # Note: num_sentences takes precedence over ratio when both
            # are passed to bert-extractive-summarizer.
            return self.summarizer(
                chunks[0],
                ratio=ratio,
                num_sentences=5
            )
        else:
            # Hierarchical summarization
            return self._hierarchical_summarization(chunks, ratio)

    def extract_key_information(self, transcript_segments):
        """
        Extract action items, decisions, and key topics.
        """
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        # Extract keywords/topics
        keywords = self.kw_model.extract_keywords(
            full_text,
            keyphrase_ngram_range=(1, 3),
            stop_words=self.stop_words,
            top_n=10,
            use_mmr=True,
            diversity=0.5
        )

        # Extract action items and decisions
        action_items = []
        decisions = []

        for segment in transcript_segments:
            # Check for action items
            if self._is_action_item(segment['text']):
                action_items.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s",
                    'entities': self._extract_entities(segment['text'])
                })

            # Check for decisions
            if self._is_decision(segment['text']):
                decisions.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })

        return {
            'keywords': keywords,
            'action_items': action_items,
            'decisions': decisions
        }

    def _create_chunks(self, text, max_length=3000):
        """
        Create overlapping chunks for long documents.
        """
        sentences = nltk.sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence)

            if current_length + sentence_length > max_length and current_chunk:
                chunks.append(' '.join(current_chunk))
                # Keep the last 2 sentences for overlap
                current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
                current_length = sum(len(s) for s in current_chunk)

            current_chunk.append(sentence)
            current_length += sentence_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _hierarchical_summarization(self, chunks, ratio):
        """
        Two-level summarization for long documents.
        """
        # Level 1: Summarize each chunk
        chunk_summaries = []
        for chunk in chunks:
            summary = self.summarizer(
                chunk,
                ratio=0.4,  # Higher ratio for the first level
                num_sentences=4
            )
            chunk_summaries.append(summary)

        # Level 2: Summarize the summaries
        combined_summary = ' '.join(chunk_summaries)
        final_summary = self.summarizer(
            combined_summary,
            ratio=ratio,
            num_sentences=6
        )

        return final_summary

    def _is_action_item(self, text):
        """
        Detect whether the text contains an action item.
        """
        text_lower = text.lower()

        # Check patterns
        for pattern in self.action_patterns:
            if re.search(pattern, text_lower):
                return True

        # Check for imperative sentences
        first_word = text.split()[0].lower() if text.split() else ""
        imperative_verbs = [
            'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
            'follow', 'prepare', 'send', 'contact', 'create'
        ]

        return first_word in imperative_verbs

    def _is_decision(self, text):
        """
        Detect whether the text contains a decision.
        """
        text_lower = text.lower()

        for pattern in self.decision_patterns:
            if re.search(pattern, text_lower):
                return True

        return False

    def _extract_entities(self, text):
        """
        Extract named entities (persons, organizations, dates).
        """
        entities = self.ner_pipeline(text)

        return {
            'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
            'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
            'dates': self._extract_dates(text)
        }

    def _extract_dates(self, text):
        """
        Extract date mentions.
        """
        date_patterns = [
            r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',                 # e.g. 12/31/2024
            r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',  # weekday names
            r'(besok|lusa|minggu\s+depan|bulan\s+depan)',     # relative dates: "tomorrow", "next week", ...
            r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'  # month names
        ]

        dates = []
        for pattern in date_patterns:
            matches = re.findall(pattern, text.lower())
            dates.extend(matches)

        return dates
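

# --- Usage sketch (not part of the module above) ---
# A minimal example of driving TextProcessor end to end. It assumes transcript
# segments are dicts with 'text', 'speaker', and 'start' keys, e.g. as produced
# by an upstream ASR/diarization step; the sample segments and speaker labels
# below are invented for illustration.
if __name__ == "__main__":
    segments = [
        # "It was decided that the product launch moves to next month."
        {'text': 'Diputuskan bahwa peluncuran produk digeser ke bulan depan.',
         'speaker': 'SPEAKER_00', 'start': 12.4},
        # "Please send the sales report to Budi before Friday."
        {'text': 'Tolong kirim laporan penjualan ke Budi sebelum hari jumat.',
         'speaker': 'SPEAKER_01', 'start': 47.9},
    ]

    processor = TextProcessor()  # downloads the models on first run
    info = processor.extract_key_information(segments)

    print(info['keywords'])      # top KeyBERT keyphrases
    print(info['decisions'])     # segments matched by the decision patterns
    print(info['action_items'])  # action segments, each with NER entities and dates
    print(processor.summarize_transcript(segments, ratio=0.3))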