from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForTokenClassification,
    pipeline
)
from keybert import KeyBERT
from summarizer import Summarizer
import re
import nltk

nltk.download('punkt')
class TextProcessor:
    def __init__(self):
        # Initialize summarization model
        self.summarizer = Summarizer('bert-base-multilingual-cased')

        # Initialize KeyBERT for keyword extraction
        self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')

        # Initialize NER for action item detection
        self.ner_pipeline = pipeline(
            "ner",
            model="cahya/bert-base-indonesian-NER",
            aggregation_strategy="simple"
        )

        # Action item patterns
        self.action_patterns = [
            r"akan\s+(\w+)",
            r"harus\s+(\w+)",
            r"perlu\s+(\w+)",
            r"mohon\s+(\w+)",
            r"tolong\s+(\w+)",
            r"segera\s+(\w+)",
            r"follow\s*up",
            r"action\s*item",
            r"to\s*do",
            r"deadline"
        ]

        # Decision patterns
        self.decision_patterns = [
            r"(diputuskan|memutuskan)\s+(.+)",
            r"(disepakati|menyepakati)\s+(.+)",
            r"(setuju|persetujuan)\s+(.+)",
            r"keputusan(?:nya)?\s+(.+)",
            r"final(?:isasi)?\s+(.+)"
        ]
    def summarize_transcript(self, transcript_segments, ratio=0.3):
        """
        Hierarchical summarization for long transcripts.
        """
        # Combine the text from all segments
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        # Chunk long documents
        chunks = self._create_chunks(full_text)

        if len(chunks) == 1:
            # Direct summarization for short documents
            return self.summarizer(
                chunks[0],
                ratio=ratio,
                num_sentences=5
            )
        else:
            # Hierarchical summarization
            return self._hierarchical_summarization(chunks, ratio)
    def extract_key_information(self, transcript_segments):
        """
        Extract action items, decisions, and key topics.
        """
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        # Extract keywords/topics.
        # Note: scikit-learn has no built-in Indonesian stop-word list, so pass
        # a custom list of Indonesian stop words here (or None to disable).
        keywords = self.kw_model.extract_keywords(
            full_text,
            keyphrase_ngram_range=(1, 3),
            stop_words=None,
            top_n=10,
            use_mmr=True,
            diversity=0.5
        )

        # Extract action items and decisions
        action_items = []
        decisions = []

        for segment in transcript_segments:
            # Check for action items
            if self._is_action_item(segment['text']):
                action_items.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s",
                    'entities': self._extract_entities(segment['text'])
                })

            # Check for decisions
            if self._is_decision(segment['text']):
                decisions.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })

        return {
            'keywords': keywords,
            'action_items': action_items,
            'decisions': decisions
        }
    def _create_chunks(self, text, max_length=3000):
        """
        Create overlapping chunks for long documents
        """
        sentences = nltk.sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence)

            if current_length + sentence_length > max_length and current_chunk:
                chunks.append(' '.join(current_chunk))
                # Keep the last 2 sentences for overlap
                current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
                current_length = sum(len(s) for s in current_chunk)

            current_chunk.append(sentence)
            current_length += sentence_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks
    def _hierarchical_summarization(self, chunks, ratio):
        """
        Two-level summarization for long documents
        """
        # Level 1: Summarize each chunk
        chunk_summaries = []
        for chunk in chunks:
            summary = self.summarizer(
                chunk,
                ratio=0.4,  # Higher ratio for the first level
                num_sentences=4
            )
            chunk_summaries.append(summary)

        # Level 2: Summarize the summaries
        combined_summary = ' '.join(chunk_summaries)
        final_summary = self.summarizer(
            combined_summary,
            ratio=ratio,
            num_sentences=6
        )

        return final_summary
    def _is_action_item(self, text):
        """
        Detect if text contains an action item
        """
        text_lower = text.lower()

        # Check patterns
        for pattern in self.action_patterns:
            if re.search(pattern, text_lower):
                return True

        # Check for imperative sentences
        words = text_lower.split()
        first_word = words[0] if words else ""
        imperative_verbs = [
            'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
            'follow', 'prepare', 'send', 'contact', 'create'
        ]
        return first_word in imperative_verbs
    def _is_decision(self, text):
        """
        Detect if text contains a decision
        """
        text_lower = text.lower()

        for pattern in self.decision_patterns:
            if re.search(pattern, text_lower):
                return True

        return False
    def _extract_entities(self, text):
        """
        Extract named entities (person, organization, date, etc.)
        """
        entities = self.ner_pipeline(text)

        return {
            'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
            'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
            'dates': self._extract_dates(text)
        }
    def _extract_dates(self, text):
        """
        Extract date mentions
        """
        date_patterns = [
            r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
            r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
            r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
            r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
        ]

        dates = []
        for pattern in date_patterns:
            matches = re.findall(pattern, text.lower())
            dates.extend(matches)

        return dates
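

# --- Illustrative usage sketch (not part of the original listing) ---
# Assumes diarized ASR output: a list of segment dicts with 'text', 'speaker',
# and 'start' keys, which is what the methods above already expect.
# Note: constructing TextProcessor downloads several large models on first run.
if __name__ == "__main__":
    processor = TextProcessor()

    segments = [
        {'text': 'Tolong kirim laporan penjualan sebelum hari Jumat.',
         'speaker': 'SPEAKER_00', 'start': 12.4},
        {'text': 'Diputuskan bahwa anggaran marketing naik 10 persen.',
         'speaker': 'SPEAKER_01', 'start': 45.0},
    ]

    summary = processor.summarize_transcript(segments, ratio=0.3)
    info = processor.extract_key_information(segments)

    print("Summary:", summary)
    print("Action items:", info['action_items'])
    print("Decisions:", info['decisions'])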