Anna Sun
		
	committed on
		
		
					Commit 
							
							·
						
						fd69a21
	
1
								Parent(s):
							
							c1e0588
								
more fixes
Browse files
- app.py +27 -10
- simuleval_transcoder.py +1 -0
    	
        app.py
    CHANGED
    
    | @@ -35,6 +35,7 @@ def build_agent(model_path, config_name=None): | |
| 35 |  | 
| 36 | 
             
            agent = build_agent("models", "vad_s2st_sc_24khz_main.yaml")
         | 
| 37 | 
             
            transcoder = SimulevalTranscoder(
         | 
|  | |
| 38 | 
             
                sample_rate=48_000,
         | 
| 39 | 
             
                debug=False,
         | 
| 40 | 
             
                buffer_limit=1,
         | 
| @@ -43,8 +44,8 @@ transcoder = SimulevalTranscoder( | |
| 43 | 
             
            def start_recording():
         | 
| 44 | 
             
                logger.debug(f"start_recording: starting transcoder")
         | 
| 45 | 
             
                transcoder.reset_states()
         | 
| 46 | 
            -
                transcoder.start()
         | 
| 47 | 
             
                transcoder.close = False
         | 
|  | |
| 48 |  | 
| 49 | 
             
            def stop_recording():
         | 
| 50 | 
             
                transcoder.close = True
         | 
| @@ -87,11 +88,13 @@ def get_buffered_output(): | |
| 87 |  | 
| 88 | 
             
                return speech, text, speech_and_text_output.final
         | 
| 89 |  | 
|  | |
| 90 | 
             
            def streaming_input_callback():
         | 
| 91 | 
             
                final = False
         | 
| 92 | 
             
                max_wait_s = 15
         | 
| 93 | 
             
                wait_s = 0
         | 
| 94 | 
             
                translated_text_state = ""
         | 
|  | |
| 95 | 
             
                while not transcoder.close:
         | 
| 96 | 
             
                    translated_wav_segment, translated_text, final = get_buffered_output()
         | 
| 97 |  | 
| @@ -107,7 +110,7 @@ def streaming_input_callback(): | |
| 107 | 
             
                        print("output sample rate", sample_rate)
         | 
| 108 | 
             
                        translated_wav_segment = sample_rate, np.array(audio_bytes)
         | 
| 109 | 
             
                    else:
         | 
| 110 | 
            -
                        translated_wav_segment =  | 
| 111 |  | 
| 112 | 
             
                    if translated_text is not None:
         | 
| 113 | 
             
                        translated_text_state += " | " + str(translated_text)
         | 
| @@ -123,16 +126,23 @@ def streaming_input_callback(): | |
| 123 |  | 
| 124 |  | 
| 125 | 
             
            def streaming_callback_dummy():
         | 
|  | |
|  | |
| 126 | 
             
                while not transcoder.close:
         | 
| 127 | 
             
                    if s.queue.empty():
         | 
| 128 | 
            -
                         | 
| 129 | 
            -
             | 
|  | |
| 130 | 
             
                        time.sleep(0.3)
         | 
| 131 | 
             
                    else:
         | 
| 132 | 
            -
                         | 
|  | |
|  | |
| 133 | 
             
                        audio = s.queue.get_nowait()
         | 
|  | |
|  | |
| 134 | 
             
                        s.queue.task_done()
         | 
| 135 | 
            -
                        yield audio
         | 
| 136 |  | 
| 137 | 
             
            def clear():
         | 
| 138 | 
             
                logger.debug(f"Clearing State")
         | 
| @@ -175,21 +185,28 @@ def blocks(): | |
| 175 | 
             
                    ).then(
         | 
| 176 | 
             
                        start_recording
         | 
| 177 | 
             
                    ).then(
         | 
| 178 | 
            -
                        #  | 
| 179 | 
            -
                        #  | 
| 180 | 
            -
                        #  | 
|  | |
|  | |
|  | |
| 181 | 
             
                        streaming_input_callback,
         | 
| 182 | 
             
                        None,
         | 
| 183 | 
             
                        [
         | 
| 184 | 
             
                            output_translation_segment,
         | 
| 185 | 
             
                            stream_output_text,
         | 
| 186 | 
             
                            translated_text_state,
         | 
| 187 | 
            -
                        ] | 
| 188 | 
             
                    )
         | 
| 189 | 
             
                    input_audio.stop_recording(
         | 
| 190 | 
             
                        stop_recording
         | 
| 191 | 
             
                    )
         | 
| 192 | 
             
                    input_audio.stream(
         | 
|  | |
|  | |
|  | |
|  | |
| 193 | 
             
                        process_incoming_bytes, [input_audio], None
         | 
| 194 | 
             
                    )
         | 
| 195 |  | 
|  | |
| 35 |  | 
| 36 | 
             
            agent = build_agent("models", "vad_s2st_sc_24khz_main.yaml")
         | 
| 37 | 
             
            transcoder = SimulevalTranscoder(
         | 
| 38 | 
            +
                agent,
         | 
| 39 | 
             
                sample_rate=48_000,
         | 
| 40 | 
             
                debug=False,
         | 
| 41 | 
             
                buffer_limit=1,
         | 
|  | |
| 44 | 
             
            def start_recording():
         | 
| 45 | 
             
                logger.debug(f"start_recording: starting transcoder")
         | 
| 46 | 
             
                transcoder.reset_states()
         | 
|  | |
| 47 | 
             
                transcoder.close = False
         | 
| 48 | 
            +
                transcoder.start()
         | 
| 49 |  | 
| 50 | 
             
            def stop_recording():
         | 
| 51 | 
             
                transcoder.close = True
         | 
|  | |
| 88 |  | 
| 89 | 
             
                return speech, text, speech_and_text_output.final
         | 
| 90 |  | 
| 91 | 
            +
            from scipy.io.wavfile import write as scipy_write
         | 
| 92 | 
             
            def streaming_input_callback():
         | 
| 93 | 
             
                final = False
         | 
| 94 | 
             
                max_wait_s = 15
         | 
| 95 | 
             
                wait_s = 0
         | 
| 96 | 
             
                translated_text_state = ""
         | 
| 97 | 
            +
                sample_rate = 24000
         | 
| 98 | 
             
                while not transcoder.close:
         | 
| 99 | 
             
                    translated_wav_segment, translated_text, final = get_buffered_output()
         | 
| 100 |  | 
|  | |
| 110 | 
             
                        print("output sample rate", sample_rate)
         | 
| 111 | 
             
                        translated_wav_segment = sample_rate, np.array(audio_bytes)
         | 
| 112 | 
             
                    else:
         | 
| 113 | 
            +
                        translated_wav_segment = sample_rate, np.empty(0, dtype=np.int16)
         | 
| 114 |  | 
| 115 | 
             
                    if translated_text is not None:
         | 
| 116 | 
             
                        translated_text_state += " | " + str(translated_text)
         | 
|  | |
| 126 |  | 
| 127 |  | 
| 128 | 
             
            def streaming_callback_dummy():
         | 
| 129 | 
            +
                i = 0
         | 
| 130 | 
            +
                out_text = ""
         | 
| 131 | 
             
                while not transcoder.close:
         | 
| 132 | 
             
                    if s.queue.empty():
         | 
| 133 | 
            +
                        yield (
         | 
| 134 | 
            +
                            (48000, np.empty(0, dtype=np.int16)), out_text, out_text
         | 
| 135 | 
            +
                        )
         | 
| 136 | 
             
                        time.sleep(0.3)
         | 
| 137 | 
             
                    else:
         | 
| 138 | 
            +
                        i += 1
         | 
| 139 | 
            +
                        out_text += " | " + str(i)
         | 
| 140 | 
            +
                        print(out_text)
         | 
| 141 | 
             
                        audio = s.queue.get_nowait()
         | 
| 142 | 
            +
                        if i == 0:
         | 
| 143 | 
            +
                            print(audio[0], type(audio[1]))
         | 
| 144 | 
             
                        s.queue.task_done()
         | 
| 145 | 
            +
                        yield audio, out_text, out_text
         | 
| 146 |  | 
| 147 | 
             
            def clear():
         | 
| 148 | 
             
                logger.debug(f"Clearing State")
         | 
|  | |
| 185 | 
             
                    ).then(
         | 
| 186 | 
             
                        start_recording
         | 
| 187 | 
             
                    ).then(
         | 
| 188 | 
            +
                        # TODO: streaming speech autoplay works fine with streaming_callback_dummy,
         | 
| 189 | 
            +
                        # but speech output from streaming_input_callback has a huge delay
         | 
| 190 | 
            +
                        # when comparing print/debugging logs vs. output speech
         | 
| 191 | 
            +
                        # TODO: text output works fine with one output, but is not
         | 
| 192 | 
            +
                        # updating when output is both text + speech
         | 
| 193 | 
            +
                        # streaming_callback_dummy,
         | 
| 194 | 
             
                        streaming_input_callback,
         | 
| 195 | 
             
                        None,
         | 
| 196 | 
             
                        [
         | 
| 197 | 
             
                            output_translation_segment,
         | 
| 198 | 
             
                            stream_output_text,
         | 
| 199 | 
             
                            translated_text_state,
         | 
| 200 | 
            +
                        ]
         | 
| 201 | 
             
                    )
         | 
| 202 | 
             
                    input_audio.stop_recording(
         | 
| 203 | 
             
                        stop_recording
         | 
| 204 | 
             
                    )
         | 
| 205 | 
             
                    input_audio.stream(
         | 
| 206 | 
            +
                        # TODO: *only when streaming speech output* about half the time 
         | 
| 207 | 
            +
                        # there is some race condition in gradio where process_incoming_bytes
         | 
| 208 | 
            +
                        # stops getting called once the first speech chunk is yield-ed 
         | 
| 209 | 
            +
                        # in streaming_input_callback (or streaming_callback_dummy)
         | 
| 210 | 
             
                        process_incoming_bytes, [input_audio], None
         | 
| 211 | 
             
                    )
         | 
| 212 |  | 
    	
        simuleval_transcoder.py
    CHANGED
    
    | @@ -325,6 +325,7 @@ class SimulevalTranscoder: | |
| 325 |  | 
| 326 | 
             
                def process_pipeline_loop(self):
         | 
| 327 | 
             
                    if self.close:
         | 
|  | |
| 328 | 
             
                        return  # closes the thread
         | 
| 329 |  | 
| 330 | 
             
                    print("processing_pipeline")
         | 
|  | |
| 325 |  | 
| 326 | 
             
                def process_pipeline_loop(self):
         | 
| 327 | 
             
                    if self.close:
         | 
| 328 | 
            +
                        print("transcoder closed")
         | 
| 329 | 
             
                        return  # closes the thread
         | 
| 330 |  | 
| 331 | 
             
                    print("processing_pipeline")
         | 
