Yermia committed
Commit 45b571d · 1 Parent(s): 5da9a16
Files changed (2):
  1. requirements.txt +3 -5
  2. utils/speech_processor.py +59 -27
requirements.txt CHANGED
@@ -4,9 +4,8 @@ transformers==4.37.2
 torch==2.1.2
 torchaudio==2.1.2
 
-# Audio processing
-pyannote.audio==3.1.1
-speechbrain==0.5.16
+# Audio processing - skip pyannote if causing issues
+# pyannote.audio==3.1.1
 librosa==0.10.1
 pydub==0.25.1
 
@@ -18,5 +17,4 @@ sentencepiece==0.1.99
 
 # Utils
 pandas==2.1.4
-markdown==3.5.2
-python-dotenv==1.0.0
+markdown==3.5.2

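With pyannote.audio now commented out of requirements.txt, an unconditional top-level import in speech_processor.py would raise ImportError before the try/except added in __init__ ever runs. A minimal guard sketch, assuming the module does a top-level `from pyannote.audio import Pipeline` (the `Pipeline = None` fallback is illustrative, not part of this commit):

try:
    from pyannote.audio import Pipeline  # optional dependency, commented out in requirements.txt
except ImportError:
    Pipeline = None  # __init__'s try/except below then disables diarization cleanly
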
utils/speech_processor.py CHANGED
@@ -10,6 +10,7 @@ import librosa
 import numpy as np
 from pydub import AudioSegment
 import tempfile
+import os  # fix: os is used below but was never imported
 
 class SpeechProcessor:
     def __init__(self):
@@ -22,11 +23,15 @@ class SpeechProcessor:
         )
 
         # Load speaker diarization
-        self.diarization_pipeline = Pipeline.from_pretrained(
-            "pyannote/speaker-diarization-3.1",
-            use_auth_token=os.environ.get("HF_TOKEN")
-        )
-
+        try:
+            self.diarization_pipeline = Pipeline.from_pretrained(
+                "pyannote/speaker-diarization-3.1",
+                use_auth_token=os.environ.get("HF_TOKEN")  # os is now imported
+            )
+        except Exception as e:
+            print(f"Warning: Could not load diarization model: {e}")
+            self.diarization_pipeline = None
+
     def process_audio(self, audio_path, language="id"):
         """
         Process audio file for ASR and speaker diarization
@@ -38,32 +43,59 @@ class SpeechProcessor:
         waveform, sample_rate = torchaudio.load(audio_path)
 
         # Speaker diarization
-        diarization = self.diarization_pipeline(audio_path)
-
-        # Process each speaker segment
-        transcript_segments = []
-
-        for turn, _, speaker in diarization.itertracks(yield_label=True):
-            # Extract segment audio
-            start_sample = int(turn.start * sample_rate)
-            end_sample = int(turn.end * sample_rate)
-            segment_waveform = waveform[:, start_sample:end_sample]
-
-            # ASR on segment
-            text = self._transcribe_segment(
-                segment_waveform,
-                sample_rate,
-                language
-            )
-
-            transcript_segments.append({
-                "start": round(turn.start, 2),
-                "end": round(turn.end, 2),
-                "speaker": speaker,
-                "text": text
-            })
-
-        return self._merge_consecutive_segments(transcript_segments)
+        if self.diarization_pipeline:
+            try:
+                diarization = self.diarization_pipeline(audio_path)
+
+                # Process each speaker segment
+                transcript_segments = []
+
+                for turn, _, speaker in diarization.itertracks(yield_label=True):
+                    # Extract segment audio
+                    start_sample = int(turn.start * sample_rate)
+                    end_sample = int(turn.end * sample_rate)
+                    segment_waveform = waveform[:, start_sample:end_sample]
+
+                    # ASR on segment
+                    text = self._transcribe_segment(
+                        segment_waveform,
+                        sample_rate,
+                        language
+                    )
+
+                    transcript_segments.append({
+                        "start": round(turn.start, 2),
+                        "end": round(turn.end, 2),
+                        "speaker": speaker,
+                        "text": text
+                    })
+
+                return self._merge_consecutive_segments(transcript_segments)
+            except Exception as e:
+                print(f"Diarization failed, falling back to simple transcription: {e}")
+
+        # Fallback: simple transcription without diarization
+        return self._simple_transcription(waveform, sample_rate, language)
+
+    def _simple_transcription(self, waveform, sample_rate, language):
+        """Fallback transcription without speaker diarization"""
+        # Process in 30-second chunks
+        chunk_length = 30 * sample_rate
+        segments = []
+
+        for i in range(0, waveform.shape[1], chunk_length):
+            chunk = waveform[:, i:i + chunk_length]
+            text = self._transcribe_segment(chunk, sample_rate, language)
+
+            if text.strip():
+                segments.append({
+                    "start": i / sample_rate,
+                    "end": min((i + chunk_length) / sample_rate, waveform.shape[1] / sample_rate),
+                    "speaker": "SPEAKER_01",
+                    "text": text
+                })
+
+        return segments
 
     def _transcribe_segment(self, waveform, sample_rate, language):
         """