# from transformers import (
#     AutoTokenizer,
#     AutoModelForSeq2SeqLM,
#     AutoModelForTokenClassification,
#     pipeline
# )
# from keybert import KeyBERT
# from summarizer import Summarizer
# import re
# import nltk
#
# nltk.download('punkt')
#
#
# class TextProcessor:
#     def __init__(self):
#         # Initialize summarization model
#         self.summarizer = Summarizer('bert-base-multilingual-cased')
#
#         # Initialize KeyBERT for keyword extraction
#         self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
#
#         # Initialize NER for action item detection
#         self.ner_pipeline = pipeline(
#             "ner",
#             model="cahya/bert-base-indonesian-NER",
#             aggregation_strategy="simple"
#         )
#
#         # Action item patterns
#         self.action_patterns = [
#             r"akan\s+(\w+)",
#             r"harus\s+(\w+)",
#             r"perlu\s+(\w+)",
#             r"mohon\s+(\w+)",
#             r"tolong\s+(\w+)",
#             r"segera\s+(\w+)",
#             r"follow\s*up",
#             r"action\s*item",
#             r"to\s*do",
#             r"deadline"
#         ]
#
#         # Decision patterns
#         self.decision_patterns = [
#             r"(diputuskan|memutuskan)\s+(.+)",
#             r"(disepakati|menyepakati)\s+(.+)",
#             r"(setuju|persetujuan)\s+(.+)",
#             r"keputusan(?:nya)?\s+(.+)",
#             r"final(?:isasi)?\s+(.+)"
#         ]
#
#     def summarize_transcript(self, transcript_segments, ratio=0.3):
#         """
#         Hierarchical summarization for long transcripts
#         """
#         # Combine text from all segments
#         full_text = ' '.join([seg['text'] for seg in transcript_segments])
#
#         # Chunk long documents
#         chunks = self._create_chunks(full_text)
#
#         if len(chunks) == 1:
#             # Direct summarization for short documents
#             return self.summarizer(
#                 chunks[0],
#                 ratio=ratio,
#                 num_sentences=5
#             )
#         else:
#             # Hierarchical summarization
#             return self._hierarchical_summarization(chunks, ratio)
#
#     def extract_key_information(self, transcript_segments):
#         """
#         Extract action items, decisions, and key topics
#         """
#         full_text = ' '.join([seg['text'] for seg in transcript_segments])
#
#         # Extract keywords/topics
#         keywords = self.kw_model.extract_keywords(
#             full_text,
#             keyphrase_ngram_range=(1, 3),
#             stop_words='indonesian',
#             top_n=10,
#             use_mmr=True,
#             diversity=0.5
#         )
#
#         # Extract action items and decisions
#         action_items = []
#         decisions = []
#
#         for segment in transcript_segments:
#             # Check for action items
#             if self._is_action_item(segment['text']):
#                 action_items.append({
#                     'text': segment['text'],
#                     'speaker': segment['speaker'],
#                     'timestamp': f"{segment['start']:.1f}s",
#                     'entities': self._extract_entities(segment['text'])
#                 })
#
#             # Check for decisions
#             if self._is_decision(segment['text']):
#                 decisions.append({
#                     'text': segment['text'],
#                     'speaker': segment['speaker'],
#                     'timestamp': f"{segment['start']:.1f}s"
#                 })
#
#         return {
#             'keywords': keywords,
#             'action_items': action_items,
#             'decisions': decisions
#         }
#
#     def _create_chunks(self, text, max_length=3000):
#         """
#         Create overlapping chunks for long documents
#         """
#         sentences = nltk.sent_tokenize(text)
#         chunks = []
#         current_chunk = []
#         current_length = 0
#
#         for sentence in sentences:
#             sentence_length = len(sentence)
#             if current_length + sentence_length > max_length and current_chunk:
#                 chunks.append(' '.join(current_chunk))
#                 # Keep last 2 sentences for overlap
#                 current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
#                 current_length = sum(len(s) for s in current_chunk)
#             current_chunk.append(sentence)
#             current_length += sentence_length
#
#         if current_chunk:
#             chunks.append(' '.join(current_chunk))
#
#         return chunks
#
#     def _hierarchical_summarization(self, chunks, ratio):
#         """
#         Two-level summarization for long documents
#         """
#         # Level 1: Summarize each chunk
#         chunk_summaries = []
#         for chunk in chunks:
#             summary = self.summarizer(
#                 chunk,
#                 ratio=0.4,  # Higher ratio for first level
#                 num_sentences=4
#             )
#             chunk_summaries.append(summary)
#
#         # Level 2: Summarize the summaries
#         combined_summary = ' '.join(chunk_summaries)
#         final_summary = self.summarizer(
#             combined_summary,
#             ratio=ratio,
#             num_sentences=6
#         )
#
#         return final_summary
#
#     def _is_action_item(self, text):
#         """
#         Detect if text contains an action item
#         """
#         text_lower = text.lower()
#
#         # Check patterns
#         for pattern in self.action_patterns:
#             if re.search(pattern, text_lower):
#                 return True
#
#         # Check for imperative sentences
#         first_word = text.split()[0].lower() if text.split() else ""
#         imperative_verbs = [
#             'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
#             'follow', 'prepare', 'send', 'contact', 'create'
#         ]
#
#         return first_word in imperative_verbs
#
#     def _is_decision(self, text):
#         """
#         Detect if text contains a decision
#         """
#         text_lower = text.lower()
#
#         for pattern in self.decision_patterns:
#             if re.search(pattern, text_lower):
#                 return True
#
#         return False
#
#     def _extract_entities(self, text):
#         """
#         Extract named entities (persons, organizations, dates, etc.)
#         """
#         entities = self.ner_pipeline(text)
#
#         return {
#             'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
#             'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
#             'dates': self._extract_dates(text)
#         }
#
#     def _extract_dates(self, text):
#         """
#         Extract date mentions
#         """
#         date_patterns = [
#             r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
#             r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
#             r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
#             r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
#         ]
#
#         dates = []
#         for pattern in date_patterns:
#             matches = re.findall(pattern, text.lower())
#             dates.extend(matches)
#
#         return dates
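
# NOTE: The commented-out block above is an earlier revision of this class; the
# active implementation below replaces the heavy extractive summarizer and NER
# pipeline with a lighter transformers summarization pipeline plus simple fallbacks.
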
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline
)
from keybert import KeyBERT
import re
import nltk
from typing import List, Dict

# Make sure the sentence tokenizer used by the extractive fallback is available
nltk.download('punkt', quiet=True)

class TextProcessor:
    def __init__(self):
        print("Initializing Text Processor...")

        # Use a transformers summarization pipeline instead of the extractive summarizer above
        try:
            self.summarizer = pipeline(
                "summarization",
                model="sshleifer/distilbart-cnn-12-6",
                device=-1  # CPU
            )
        except Exception:
            # Fall back to simple extractive summarization
            self.summarizer = None
            print("Warning: Summarization model not loaded, using fallback")

        # Initialize KeyBERT for keyword extraction
        try:
            self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
        except Exception:
            self.kw_model = None
            print("Warning: KeyBERT not loaded")

        # Action item patterns (Indonesian and English cues)
        self.action_patterns = [
            r"akan\s+(\w+)", r"harus\s+(\w+)", r"perlu\s+(\w+)",
            r"mohon\s+(\w+)", r"tolong\s+(\w+)", r"segera\s+(\w+)",
            r"follow\s*up", r"action\s*item", r"to\s*do", r"deadline"
        ]

        # Decision patterns
        self.decision_patterns = [
            r"(diputuskan|memutuskan)\s+(.+)",
            r"(disepakati|menyepakati)\s+(.+)",
            r"(setuju|persetujuan)\s+(.+)",
            r"keputusan(?:nya)?\s+(.+)",
            r"final(?:isasi)?\s+(.+)"
        ]

        print("Text Processor ready!")

    def summarize_transcript(self, transcript_segments, ratio=0.3):
        """Summarize a transcript, falling back to extractive summarization if needed."""
        # Combine text from all segments
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        if not full_text.strip():
            return "No content to summarize."

        # Try the summarization pipeline first
        if self.summarizer:
            try:
                # Split into chunks if the text is too long for the model
                max_chunk_length = 1024
                if len(full_text) > max_chunk_length:
                    chunks = self._split_into_chunks(full_text, max_chunk_length)
                    summaries = []
                    for chunk in chunks[:3]:  # Limit to the first 3 chunks
                        summary = self.summarizer(
                            chunk,
                            max_length=130,
                            min_length=30,
                            do_sample=False
                        )[0]['summary_text']
                        summaries.append(summary)
                    return ' '.join(summaries)
                else:
                    return self.summarizer(
                        full_text,
                        max_length=150,
                        min_length=30,
                        do_sample=False
                    )[0]['summary_text']
            except Exception:
                pass  # Fall through to the extractive fallback

        # Fallback: simple extractive summarization
        return self._simple_extractive_summary(full_text, ratio)

    def extract_key_information(self, transcript_segments):
        """Extract action items, decisions, and key topics."""
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        # Extract keywords/topics with KeyBERT
        keywords = []
        if self.kw_model:
            try:
                keywords = self.kw_model.extract_keywords(
                    full_text,
                    keyphrase_ngram_range=(1, 3),
                    stop_words=None,
                    top_n=10,
                    use_mmr=True,
                    diversity=0.5
                )
            except Exception:
                pass

        # If KeyBERT fails, use simple frequency-based extraction
        if not keywords:
            keywords = self._extract_keywords_simple(full_text)

        # Extract action items and decisions
        action_items = []
        decisions = []

        for segment in transcript_segments:
            # Check for action items
            if self._is_action_item(segment['text']):
                action_items.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })

            # Check for decisions
            if self._is_decision(segment['text']):
                decisions.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })

        return {
            'keywords': keywords,
            'action_items': action_items,
            'decisions': decisions
        }

    def _split_into_chunks(self, text, max_length):
        """Split text into chunks"""
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            current_chunk.append(word)
            current_length += len(word) + 1
            if current_length >= max_length:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _simple_extractive_summary(self, text, ratio=0.3):
        """Simple extractive summarization fallback"""
        sentences = nltk.sent_tokenize(text)

        if len(sentences) <= 3:
            return text

        # Calculate number of sentences to include
        num_sentences = max(3, int(len(sentences) * ratio))

        # Simple scoring: prefer sentences with more content words
        scored_sentences = []
        for i, sent in enumerate(sentences):
            # Score based on length and position
            score = len(sent.split())
            if i < 3:  # Boost first sentences
                score *= 1.5
            if i >= len(sentences) - 2:  # Boost last sentences
                score *= 1.2
            scored_sentences.append((score, sent))

        # Sort by score and select top sentences
        scored_sentences.sort(reverse=True)
        selected = [sent for _, sent in scored_sentences[:num_sentences]]

        # Return in original order
        return ' '.join([s for s in sentences if s in selected])

    def _extract_keywords_simple(self, text):
        """Simple keyword extraction fallback"""
        # Common Indonesian and English stopwords to ignore
        stopwords = {
            'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'adalah',
            'ini', 'itu', 'dengan', 'tersebut', 'dalam', 'dapat', 'akan',
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'as', 'is', 'was', 'are', 'were'
        }

        # Count word frequency
        words = re.findall(r'\b\w+\b', text.lower())
        word_freq = {}
        for word in words:
            if len(word) > 3 and word not in stopwords:
                word_freq[word] = word_freq.get(word, 0) + 1

        # Get top keywords
        keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]

        # Format like KeyBERT output: (keyword, score) pairs
        return [(word, freq / len(words)) for word, freq in keywords]

    def _is_action_item(self, text):
        """Detect if text contains an action item"""
        text_lower = text.lower()

        # Check action-item patterns
        for pattern in self.action_patterns:
            if re.search(pattern, text_lower):
                return True

        # Check for imperative sentences
        first_word = text.split()[0].lower() if text.split() else ""
        imperative_verbs = [
            'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
            'follow', 'prepare', 'send', 'contact', 'create'
        ]

        return first_word in imperative_verbs

    def _is_decision(self, text):
        """Detect if text contains a decision"""
        text_lower = text.lower()

        for pattern in self.decision_patterns:
            if re.search(pattern, text_lower):
                return True

        return False
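
# Usage sketch (illustrative only; not part of the original file). It assumes
# transcript segments shaped like the dicts the methods above read: 'text',
# 'speaker', and 'start'. The sample segments below are hypothetical.
if __name__ == "__main__":
    processor = TextProcessor()

    segments = [
        {'text': 'Kita harus kirim laporan sebelum Jumat.', 'speaker': 'SPEAKER_00', 'start': 12.4},
        {'text': 'Diputuskan bahwa peluncuran ditunda ke bulan depan.', 'speaker': 'SPEAKER_01', 'start': 45.9},
    ]

    # Summarize and extract structured information from the segments
    print(processor.summarize_transcript(segments, ratio=0.3))

    info = processor.extract_key_information(segments)
    print(info['keywords'])
    print(info['action_items'])
    print(info['decisions'])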