# from transformers import (
#     AutoTokenizer,
#     AutoModelForSeq2SeqLM,
#     AutoModelForTokenClassification,
#     pipeline
# )
# from keybert import KeyBERT
# from summarizer import Summarizer
# import re
# import nltk
#
# nltk.download('punkt')
#
#
# class TextProcessor:
#     def __init__(self):
#         # Initialize summarization model
#         self.summarizer = Summarizer('bert-base-multilingual-cased')
#
#         # Initialize KeyBERT for keyword extraction
#         self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
#
#         # Initialize NER for action item detection
#         self.ner_pipeline = pipeline(
#             "ner",
#             model="cahya/bert-base-indonesian-NER",
#             aggregation_strategy="simple"
#         )
#
#         # Action item patterns
#         self.action_patterns = [
#             r"akan\s+(\w+)",
#             r"harus\s+(\w+)",
#             r"perlu\s+(\w+)",
#             r"mohon\s+(\w+)",
#             r"tolong\s+(\w+)",
#             r"segera\s+(\w+)",
#             r"follow\s*up",
#             r"action\s*item",
#             r"to\s*do",
#             r"deadline"
#         ]
#
#         # Decision patterns
#         self.decision_patterns = [
#             r"(diputuskan|memutuskan)\s+(.+)",
#             r"(disepakati|menyepakati)\s+(.+)",
#             r"(setuju|persetujuan)\s+(.+)",
#             r"keputusan(?:nya)?\s+(.+)",
#             r"final(?:isasi)?\s+(.+)"
#         ]
#
#     def summarize_transcript(self, transcript_segments, ratio=0.3):
#         """
#         Hierarchical summarization for long transcripts
#         """
#         # Combine text from all segments
#         full_text = ' '.join([seg['text'] for seg in transcript_segments])
#
#         # Chunk long documents
#         chunks = self._create_chunks(full_text)
#
#         if len(chunks) == 1:
#             # Direct summarization for short documents
#             return self.summarizer(
#                 chunks[0],
#                 ratio=ratio,
#                 num_sentences=5
#             )
#         else:
#             # Hierarchical summarization
#             return self._hierarchical_summarization(chunks, ratio)
#
#     def extract_key_information(self, transcript_segments):
#         """
#         Extract action items, decisions, and key topics
#         """
#         full_text = ' '.join([seg['text'] for seg in transcript_segments])
#
#         # Extract keywords/topics
#         keywords = self.kw_model.extract_keywords(
#             full_text,
#             keyphrase_ngram_range=(1, 3),
#             stop_words='indonesian',
#             top_n=10,
#             use_mmr=True,
#             diversity=0.5
#         )
#
#         # Extract action items and decisions
#         action_items = []
#         decisions = []
#
#         for segment in transcript_segments:
#             # Check for action items
#             if self._is_action_item(segment['text']):
#                 action_items.append({
#                     'text': segment['text'],
#                     'speaker': segment['speaker'],
#                     'timestamp': f"{segment['start']:.1f}s",
#                     'entities': self._extract_entities(segment['text'])
#                 })
#
#             # Check for decisions
#             if self._is_decision(segment['text']):
#                 decisions.append({
#                     'text': segment['text'],
#                     'speaker': segment['speaker'],
#                     'timestamp': f"{segment['start']:.1f}s"
#                 })
#
#         return {
#             'keywords': keywords,
#             'action_items': action_items,
#             'decisions': decisions
#         }
#
#     def _create_chunks(self, text, max_length=3000):
#         """
#         Create overlapping chunks for long documents
#         """
#         sentences = nltk.sent_tokenize(text)
#         chunks = []
#         current_chunk = []
#         current_length = 0
#
#         for sentence in sentences:
#             sentence_length = len(sentence)
#             if current_length + sentence_length > max_length and current_chunk:
#                 chunks.append(' '.join(current_chunk))
#                 # Keep last 2 sentences for overlap
#                 current_chunk = current_chunk[-2:] if len(current_chunk) > 2 else []
#                 current_length = sum(len(s) for s in current_chunk)
#             current_chunk.append(sentence)
#             current_length += sentence_length
#
#         if current_chunk:
#             chunks.append(' '.join(current_chunk))
#
#         return chunks
#
#     def _hierarchical_summarization(self, chunks, ratio):
#         """
#         Two-level summarization for long documents
#         """
#         # Level 1: Summarize each chunk
#         chunk_summaries = []
#         for chunk in chunks:
#             summary = self.summarizer(
#                 chunk,
#                 ratio=0.4,  # Higher ratio for first level
#                 num_sentences=4
#             )
#             chunk_summaries.append(summary)
#
#         # Level 2: Summarize the summaries
#         combined_summary = ' '.join(chunk_summaries)
#         final_summary = self.summarizer(
#             combined_summary,
#             ratio=ratio,
#             num_sentences=6
#         )
#
#         return final_summary
#
#     def _is_action_item(self, text):
#         """
#         Detect if text contains an action item
#         """
#         text_lower = text.lower()
#
#         # Check patterns
#         for pattern in self.action_patterns:
#             if re.search(pattern, text_lower):
#                 return True
#
#         # Check for imperative sentences
#         first_word = text.split()[0].lower() if text.split() else ""
#         imperative_verbs = [
#             'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
#             'follow', 'prepare', 'send', 'contact', 'create'
#         ]
#
#         return first_word in imperative_verbs
#
#     def _is_decision(self, text):
#         """
#         Detect if text contains a decision
#         """
#         text_lower = text.lower()
#
#         for pattern in self.decision_patterns:
#             if re.search(pattern, text_lower):
#                 return True
#
#         return False
#
#     def _extract_entities(self, text):
#         """
#         Extract named entities (persons, organizations, dates, etc.)
#         """
#         entities = self.ner_pipeline(text)
#
#         return {
#             'persons': [e['word'] for e in entities if e['entity_group'] == 'PER'],
#             'organizations': [e['word'] for e in entities if e['entity_group'] == 'ORG'],
#             'dates': self._extract_dates(text)
#         }
#
#     def _extract_dates(self, text):
#         """
#         Extract date mentions
#         """
#         date_patterns = [
#             r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}',
#             r'(senin|selasa|rabu|kamis|jumat|sabtu|minggu)',
#             r'(besok|lusa|minggu\s+depan|bulan\s+depan)',
#             r'(januari|februari|maret|april|mei|juni|juli|agustus|september|oktober|november|desember)'
#         ]
#
#         dates = []
#         for pattern in date_patterns:
#             matches = re.findall(pattern, text.lower())
#             dates.extend(matches)
#
#         return dates
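
# NOTE: The commented-out block above is an earlier revision of this class; the
# active implementation below replaces the heavy extractive summarizer and NER
# pipeline with a lighter transformers summarization pipeline plus simple fallbacks.
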
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline
)
from keybert import KeyBERT
import re
import nltk
from typing import List, Dict

# Make sure the sentence tokenizer used by the extractive fallback is available
nltk.download('punkt', quiet=True)

class TextProcessor:
    def __init__(self):
        print("Initializing Text Processor...")

        # Use a transformers summarization pipeline instead of the extractive summarizer above
        try:
            self.summarizer = pipeline(
                "summarization",
                model="sshleifer/distilbart-cnn-12-6",
                device=-1  # CPU
            )
        except Exception:
            # Fall back to simple extractive summarization
            self.summarizer = None
            print("Warning: Summarization model not loaded, using fallback")

        # Initialize KeyBERT for keyword extraction
        try:
            self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
        except Exception:
            self.kw_model = None
            print("Warning: KeyBERT not loaded")

        # Action item patterns (Indonesian and English cues)
        self.action_patterns = [
            r"akan\s+(\w+)", r"harus\s+(\w+)", r"perlu\s+(\w+)",
            r"mohon\s+(\w+)", r"tolong\s+(\w+)", r"segera\s+(\w+)",
            r"follow\s*up", r"action\s*item", r"to\s*do", r"deadline"
        ]

        # Decision patterns
        self.decision_patterns = [
            r"(diputuskan|memutuskan)\s+(.+)",
            r"(disepakati|menyepakati)\s+(.+)",
            r"(setuju|persetujuan)\s+(.+)",
            r"keputusan(?:nya)?\s+(.+)",
            r"final(?:isasi)?\s+(.+)"
        ]

        print("Text Processor ready!")

    def summarize_transcript(self, transcript_segments, ratio=0.3):
        """Summarize a transcript, falling back to extractive summarization if needed."""
        # Combine text from all segments
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        if not full_text.strip():
            return "No content to summarize."

        # Try the summarization pipeline first
        if self.summarizer:
            try:
                # Split into chunks if the text is too long for the model
                max_chunk_length = 1024
                if len(full_text) > max_chunk_length:
                    chunks = self._split_into_chunks(full_text, max_chunk_length)
                    summaries = []
                    for chunk in chunks[:3]:  # Limit to the first 3 chunks
                        summary = self.summarizer(
                            chunk,
                            max_length=130,
                            min_length=30,
                            do_sample=False
                        )[0]['summary_text']
                        summaries.append(summary)
                    return ' '.join(summaries)
                else:
                    return self.summarizer(
                        full_text,
                        max_length=150,
                        min_length=30,
                        do_sample=False
                    )[0]['summary_text']
            except Exception:
                pass  # Fall through to the extractive fallback

        # Fallback: simple extractive summarization
        return self._simple_extractive_summary(full_text, ratio)

    def extract_key_information(self, transcript_segments):
        """Extract action items, decisions, and key topics."""
        full_text = ' '.join([seg['text'] for seg in transcript_segments])

        # Extract keywords/topics with KeyBERT
        keywords = []
        if self.kw_model:
            try:
                keywords = self.kw_model.extract_keywords(
                    full_text,
                    keyphrase_ngram_range=(1, 3),
                    stop_words=None,
                    top_n=10,
                    use_mmr=True,
                    diversity=0.5
                )
            except Exception:
                pass

        # If KeyBERT fails, use simple frequency-based extraction
        if not keywords:
            keywords = self._extract_keywords_simple(full_text)

        # Extract action items and decisions
        action_items = []
        decisions = []

        for segment in transcript_segments:
            # Check for action items
            if self._is_action_item(segment['text']):
                action_items.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })

            # Check for decisions
            if self._is_decision(segment['text']):
                decisions.append({
                    'text': segment['text'],
                    'speaker': segment['speaker'],
                    'timestamp': f"{segment['start']:.1f}s"
                })

        return {
            'keywords': keywords,
            'action_items': action_items,
            'decisions': decisions
        }

    def _split_into_chunks(self, text, max_length):
        """Split text into chunks"""
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for word in words:
            current_chunk.append(word)
            current_length += len(word) + 1
            if current_length >= max_length:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _simple_extractive_summary(self, text, ratio=0.3):
        """Simple extractive summarization fallback"""
        sentences = nltk.sent_tokenize(text)

        if len(sentences) <= 3:
            return text

        # Calculate number of sentences to include
        num_sentences = max(3, int(len(sentences) * ratio))

        # Simple scoring: prefer sentences with more content words
        scored_sentences = []
        for i, sent in enumerate(sentences):
            # Score based on length and position
            score = len(sent.split())
            if i < 3:  # Boost first sentences
                score *= 1.5
            if i >= len(sentences) - 2:  # Boost last sentences
                score *= 1.2
            scored_sentences.append((score, sent))

        # Sort by score and select top sentences
        scored_sentences.sort(reverse=True)
        selected = [sent for _, sent in scored_sentences[:num_sentences]]

        # Return in original order
        return ' '.join([s for s in sentences if s in selected])

    def _extract_keywords_simple(self, text):
        """Simple keyword extraction fallback"""
        # Common Indonesian and English stopwords to ignore
        stopwords = {
            'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'adalah',
            'ini', 'itu', 'dengan', 'tersebut', 'dalam', 'dapat', 'akan',
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
            'for', 'of', 'with', 'as', 'is', 'was', 'are', 'were'
        }

        # Count word frequency
        words = re.findall(r'\b\w+\b', text.lower())
        word_freq = {}
        for word in words:
            if len(word) > 3 and word not in stopwords:
                word_freq[word] = word_freq.get(word, 0) + 1

        # Get top keywords
        keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]

        # Format like KeyBERT output: (keyword, score) pairs
        return [(word, freq / len(words)) for word, freq in keywords]

    def _is_action_item(self, text):
        """Detect if text contains an action item"""
        text_lower = text.lower()

        # Check action-item patterns
        for pattern in self.action_patterns:
            if re.search(pattern, text_lower):
                return True

        # Check for imperative sentences
        first_word = text.split()[0].lower() if text.split() else ""
        imperative_verbs = [
            'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
            'follow', 'prepare', 'send', 'contact', 'create'
        ]

        return first_word in imperative_verbs

    def _is_decision(self, text):
        """Detect if text contains a decision"""
        text_lower = text.lower()

        for pattern in self.decision_patterns:
            if re.search(pattern, text_lower):
                return True

        return False
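
# Usage sketch (illustrative only; not part of the original file). It assumes
# transcript segments shaped like the dicts the methods above read: 'text',
# 'speaker', and 'start'. The sample segments below are hypothetical.
if __name__ == "__main__":
    processor = TextProcessor()

    segments = [
        {'text': 'Kita harus kirim laporan sebelum Jumat.', 'speaker': 'SPEAKER_00', 'start': 12.4},
        {'text': 'Diputuskan bahwa peluncuran ditunda ke bulan depan.', 'speaker': 'SPEAKER_01', 'start': 45.9},
    ]

    # Summarize and extract structured information from the segments
    print(processor.summarize_transcript(segments, ratio=0.3))

    info = processor.extract_key_information(segments)
    print(info['keywords'])
    print(info['action_items'])
    print(info['decisions'])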