Spaces:

reab5555
/

Multimodal-Behavioral-Anomalies-Detection

Runtime error

App Files Files Community

Multimodal-Behavioral-Anomalies-Detection / voice_analysis.py

reab5555

Update voice_analysis.py

4a67bd7 verified over 1 year ago

raw

history blame

2.52 kB

	import os

	import moviepy.editor as mp
	import numpy as np
	import torch
	import torchaudio
	from pyannote.audio import Model
	from pyannote.audio import Pipeline
	from pyannote.core import Segment

	def extract_audio_from_video(video_path):
	    """Write the audio track of a video to a ``.wav`` file next to it.

	    Args:
	        video_path: Path to a video file that moviepy can open.

	    Returns:
	        Path of the written audio file: ``video_path`` with its extension
	        replaced by ``.wav``.

	    Raises:
	        ValueError: If the video has no audio track.
	    """
	    # splitext only strips an extension from the final path component, so a
	    # dot in a directory name (e.g. "clips.v2/take1") is left intact.
	    audio_path = os.path.splitext(video_path)[0] + '.wav'

	    video = mp.VideoFileClip(video_path)
	    try:
	        if video.audio is None:
	            raise ValueError(f"No audio track found in video: {video_path}")
	        video.audio.write_audiofile(audio_path)
	    finally:
	        # Close the clip even when writing fails so its readers are released.
	        video.close()

	    return audio_path

	def diarize_speakers(audio_path):
	    """Run the pyannote speaker-diarization pipeline on an audio file.

	    The Hugging Face access token is read from the ``py_annote_hf_token``
	    environment variable.

	    Args:
	        audio_path: Path of the audio file handed to the pipeline.

	    Returns:
	        Whatever the ``pyannote/speaker-diarization-3.1`` pipeline returns
	        for ``audio_path``.

	    Raises:
	        ValueError: If ``py_annote_hf_token`` is unset or empty.
	        RuntimeError: If the pretrained pipeline could not be loaded.
	    """
	    hf_token = os.environ.get("py_annote_hf_token")
	    if not hf_token:
	        raise ValueError("py_annote_hf_token environment variable is not set. Please check your Hugging Face Space's Variables and secrets section.")

	    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=hf_token)
	    # from_pretrained can hand back None instead of raising when the gated
	    # pipeline cannot be downloaded; fail here with a clear message rather
	    # than with "'NoneType' object is not callable" on the next line.
	    if pipeline is None:
	        raise RuntimeError(
	            "Could not load 'pyannote/speaker-diarization-3.1'. Check that the "
	            "token is valid and that the model's user conditions were accepted."
	        )

	    return pipeline(audio_path)

	def get_speaker_embeddings(audio_path, diarization, model_name="pyannote/embedding"):
	    """Compute one voice embedding per diarized speaker turn.

	    Args:
	        audio_path: Path of the audio file, loaded with ``torchaudio.load``.
	        diarization: Object whose ``itertracks(yield_label=True)`` yields
	            ``(turn, track, speaker)`` tuples, where ``turn`` exposes
	            ``start``, ``end`` and ``duration`` in seconds (presumably the
	            result of ``diarize_speakers`` -- confirm against the caller).
	        model_name: Identifier passed to ``Model.from_pretrained``.

	    Returns:
	        A ``(embeddings, duration)`` tuple. ``embeddings`` is a list of dicts
	        with keys ``"time"``, ``"duration"``, ``"embedding"`` (NumPy array)
	        and ``"speaker"``; ``duration`` is the audio length in seconds.

	    Raises:
	        RuntimeError: If the embedding model could not be loaded.
	    """
	    model = Model.from_pretrained(model_name, use_auth_token=os.environ.get("py_annote_hf_token"))
	    # from_pretrained can return None instead of raising when the download
	    # fails (e.g. gated model, bad token); surface that explicitly.
	    if model is None:
	        raise RuntimeError(
	            f"Could not load embedding model '{model_name}'. Check the "
	            "py_annote_hf_token environment variable and model access."
	        )
	    # Inference only: switch dropout / batch-norm layers to evaluation mode.
	    model.eval()

	    waveform, sample_rate = torchaudio.load(audio_path)
	    duration = waveform.shape[1] / sample_rate

	    embeddings = []
	    with torch.no_grad():
	        for turn, _, speaker in diarization.itertracks(yield_label=True):
	            start_frame = int(turn.start * sample_rate)
	            end_frame = int(turn.end * sample_rate)
	            segment = waveform[:, start_frame:end_frame]

	            # Skip turns that fall entirely outside the loaded samples.
	            if segment.shape[1] == 0:
	                continue

	            # NOTE(review): the segment is passed as (channels, samples) at the
	            # file's native sample rate -- confirm the model accepts that
	            # shape and rate, or resample / add a batch dimension.
	            embedding = model(segment.to(model.device))
	            embeddings.append({"time": turn.start, "duration": turn.duration, "embedding": embedding.cpu().numpy(), "speaker": speaker})

	    # Ensure embeddings cover the entire duration
	    # NOTE(review): the marker is placed at time == duration, not at the end of
	    # the last turn -- confirm this is the intended position.
	    if embeddings and embeddings[-1]['time'] + embeddings[-1]['duration'] < duration:
	        embeddings.append({"time": duration, "duration": 0, "embedding": np.zeros_like(embeddings[0]['embedding']), "speaker": "silence"})

	    return embeddings, duration

	def align_voice_embeddings(voice_embeddings, frame_count, fps):
	    """Pick, for every video frame, the voice embedding active at that time.

	    Frame ``i`` is timestamped ``i / fps`` and receives the embedding of the
	    last entry whose ``"time"`` is <= that timestamp. Frames earlier than the
	    first entry receive the first entry's embedding.

	    Args:
	        voice_embeddings: Sequence of dicts with ``"time"`` (seconds) and
	            ``"embedding"`` keys. The single forward scan assumes the entries
	            are ordered by ascending ``"time"`` -- verify against the producer.
	        frame_count: Number of video frames to align.
	        fps: Video frame rate in frames per second; must be positive.

	    Returns:
	        ``np.ndarray`` holding one embedding per frame (empty when
	        ``frame_count`` is not positive).

	    Raises:
	        ValueError: If frames are requested but ``fps`` is not positive or
	            ``voice_embeddings`` is empty.
	    """
	    if frame_count <= 0:
	        return np.array([])
	    if fps <= 0:
	        raise ValueError(f"fps must be positive, got {fps}")
	    if not voice_embeddings:
	        raise ValueError("voice_embeddings is empty; nothing to align frames to")

	    last_index = len(voice_embeddings) - 1
	    current_index = 0
	    aligned_embeddings = []

	    for frame in range(frame_count):
	        frame_time = frame / fps

	        # Frame times only increase, so the cursor never has to move back.
	        while (current_index < last_index
	               and voice_embeddings[current_index + 1]["time"] <= frame_time):
	            current_index += 1

	        aligned_embeddings.append(voice_embeddings[current_index]["embedding"])

	    return np.array(aligned_embeddings)