# meeting-minutes-ai/utils/text_processor.py
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForTokenClassification,
    pipeline
)
from keybert import KeyBERT
from summarizer import Summarizer
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK resources used below (sentence tokenizer and Indonesian stop words)
nltk.download('punkt')
nltk.download('stopwords')

class TextProcessor:
    def __init__(self):
        # Initialize summarization model
        self.summarizer = Summarizer('bert-base-multilingual-cased')

        # Initialize KeyBERT for keyword extraction
        self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')

        # Initialize NER for action item detection
        self.ner_pipeline = pipeline(
            "ner",
            model="cahya/bert-base-indonesian-NER",
            aggregation_strategy="simple"
        )

        # Action item patterns
        self.action_patterns = [
            r"akan\s+(\w+)",
            r"harus\s+(\w+)",
            r"perlu\s+(\w+)",
            r"mohon\s+(\w+)",
            r"tolong\s+(\w+)",
            r"segera\s+(\w+)",
            r"follow\s*up",
            r"action\s*item",
            r"to\s*do",
            r"deadline"
        ]

        # Decision patterns
        self.decision_patterns = [
            r"(diputuskan|memutuskan)\s+(.+)",
            r"(disepakati|menyepakati)\s+(.+)",
            r"(setuju|persetujuan)\s+(.+)",
            r"keputusan(?:nya)?\s+(.+)",
            r"final(?:isasi)?\s+(.+)"
        ]

    def summarize_transcript(self, transcript_segments, ratio=0.3):
        """
        Hierarchical summarization for long transcripts
        """
        # Combine text from all segments
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        # Chunk long documents
        chunks = self._create_chunks(full_text)

        if len(chunks) == 1:
            # Direct summarization for short documents
            return self.summarizer(
                chunks[0],
                ratio=ratio,
                num_sentences=5
            )
        else:
            # Hierarchical summarization
            return self._hierarchical_summarization(chunks, ratio)

    def extract_key_information(self, transcript_segments):
        """
        Extract action items, decisions, and key topics
        """
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        # Extract keywords/topics (the built-in stop word list only covers English,
        # so pass an explicit Indonesian stop word list from NLTK)
        keywords = self.kw_model.extract_keywords(
            full_text,
            keyphrase_ngram_range=(1, 3),
            stop_words=stopwords.words('indonesian'),
            top_n=10,
            use_mmr=True,
            diversity=0.5
        )

        # Extract action items and decisions
        action_items = []
        decisions = []

        for segment in transcript_segments:
            # Check for action items
            if self._is_action_item(segment['text']):
                action_items.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s",
                    'entities': self._extract_entities(segment['text'])
                })

            # Check for decisions
            if self._is_decision(segment['text']):
                decisions.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })

        return {
            'keywords': keywords,
            'action_items': action_items,
            'decisions': decisions
        }

    def _create_chunks(self, text, max_length=3000):
        """
        Create overlapping chunks for long documents
        """
        sentences = nltk.sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence)

            if current_length + sentence_length > max_length and current_chunk:
                chunks.append(' '.join(current_chunk))
                # Keep last 2 sentences for overlap
                current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
                current_length = sum(len(s) for s in current_chunk)

            current_chunk.append(sentence)
            current_length += sentence_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _hierarchical_summarization(self, chunks, ratio):
        """
        Two-level summarization for long documents
        """
        # Level 1: Summarize each chunk
        chunk_summaries = []
        for chunk in chunks:
            summary = self.summarizer(
                chunk,
                ratio=0.4,  # Higher ratio for first level
                num_sentences=4
            )
            chunk_summaries.append(summary)

        # Level 2: Summarize the summaries
        combined_summary = ' '.join(chunk_summaries)
        final_summary = self.summarizer(
            combined_summary,
            ratio=ratio,
            num_sentences=6
        )

        return final_summary

    def _is_action_item(self, text):
        """
        Detect if text contains an action item
        """
        text_lower = text.lower()

        # Check patterns
        for pattern in self.action_patterns:
            if re.search(pattern, text_lower):
                return True

        # Check for imperative sentences
        first_word = text.split()[0].lower() if text.split() else ""
        imperative_verbs = [
            'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
            'follow', 'prepare', 'send', 'contact', 'create'
        ]

        return first_word in imperative_verbs

    def _is_decision(self, text):
        """
        Detect if text contains a decision
        """
        text_lower = text.lower()

        for pattern in self.decision_patterns:
            if re.search(pattern, text_lower):
                return True

        return False

    def _extract_entities(self, text):
        """
        Extract named entities (person, date, etc)
        """
        entities = self.ner_pipeline(text)

        return {
            'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
            'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
            'dates': self._extract_dates(text)
        }

    def _extract_dates(self, text):
        """
        Extract date mentions
        """
        date_patterns = [
            r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
            r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
            r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
            r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
        ]

        dates = []
        for pattern in date_patterns:
            matches = re.findall(pattern, text.lower())
            dates.extend(matches)

        return dates
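

# A minimal usage sketch, not part of the original module: the segment dicts below are
# hypothetical examples, but they follow the schema the methods above expect
# ('text', 'speaker', 'start'). Running this will download the underlying models.
if __name__ == "__main__":
    segments = [
        {'text': 'Rapat dibuka dan agenda dibahas.', 'speaker': 'SPEAKER_00', 'start': 0.0},
        {'text': 'Tolong siapkan laporan penjualan sebelum Jumat.', 'speaker': 'SPEAKER_01', 'start': 12.4},
        {'text': 'Diputuskan bahwa peluncuran produk ditunda ke bulan depan.', 'speaker': 'SPEAKER_00', 'start': 25.8},
    ]

    processor = TextProcessor()
    info = processor.extract_key_information(segments)

    print("Keywords:", info['keywords'])
    print("Action items:", [item['text'] for item in info['action_items']])
    print("Decisions:", [d['text'] for d in info['decisions']])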