Spaces:
Runtime error
Runtime error
| # Copyright 2023 The HuggingFace Team. All rights reserved. | |
| import datetime | |
| import platform | |
| import subprocess | |
| from typing import Optional, Tuple, Union | |
| import numpy as np | |
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """
    Helper function to read an audio file through ffmpeg.

    Arguments:
        bpayload (`bytes`):
            The raw content of the audio file. It is piped to ffmpeg's standard input.
        sampling_rate (`int`):
            Forwarded to ffmpeg as the `-ar` option.

    Return:
        A one-dimensional `np.ndarray` of dtype `float32`, built from the `f32le` bytes that ffmpeg writes to its
        standard output (`-ac 1` is passed, so a single channel is requested).

    Raises:
        ValueError: if the `ffmpeg` executable cannot be found, or if ffmpeg returned no audio data at all.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    format_for_conversion = "f32le"
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-ac",
        ac,
        "-ar",
        ar,
        "-f",
        format_for_conversion,
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    try:
        with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
            # `communicate` returns (stdout, stderr); stderr is not piped, so only stdout is of interest.
            out_bytes, _ = ffmpeg_process.communicate(bpayload)
    except FileNotFoundError as error:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error

    audio = np.frombuffer(out_bytes, np.float32)
    # ffmpeg runs with `-loglevel quiet`, so an empty output is the only visible sign that decoding failed.
    if audio.shape[0] == 0:
        raise ValueError(
            "Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
            "a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
            "URL, ensure that the URL is the full address to **download** the audio file."
        )
    return audio
def ffmpeg_microphone(
    sampling_rate: int,
    chunk_length_s: float,
    format_for_conversion: str = "f32le",
):
    """
    Helper function to read raw microphone data through ffmpeg.

    Arguments:
        sampling_rate (`int`):
            Forwarded to ffmpeg as the `-ar` option.
        chunk_length_s (`float`):
            Duration, in seconds, of each chunk of raw bytes requested from ffmpeg.
        format_for_conversion (`str`, defaults to `f32le`):
            Sample format requested from ffmpeg. Only `s16le` and `f32le` are handled.

    Return:
        A generator yielding the `bytes` chunks produced by `_ffmpeg_stream`, each read with a buffer length of
        `round(sampling_rate * chunk_length_s) * size_of_sample` bytes.

    Raises:
        ValueError: if `format_for_conversion` is not handled, or if `platform.system()` reports a platform for
            which no capture backend is configured. Because this is a generator, the error is raised when the
            first item is requested.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    if format_for_conversion == "s16le":
        size_of_sample = 2
    elif format_for_conversion == "f32le":
        size_of_sample = 4
    else:
        raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")

    system = platform.system()
    if system == "Linux":
        format_ = "alsa"
        input_ = "default"
    elif system == "Darwin":
        format_ = "avfoundation"
        input_ = ":0"
    elif system == "Windows":
        format_ = "dshow"
        input_ = "default"
    else:
        # Without this branch `format_`/`input_` stay unbound and the command construction below
        # fails with an opaque UnboundLocalError.
        raise ValueError(
            f"Unsupported platform `{system}`. Microphone capture is only configured for Linux, Darwin and Windows"
        )

    ffmpeg_command = [
        "ffmpeg",
        "-f",
        format_,
        "-i",
        input_,
        "-ac",
        ac,
        "-ar",
        ar,
        "-f",
        format_for_conversion,
        "-fflags",
        "nobuffer",
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]
    # Chunk size in bytes: number of samples in one chunk times the width of one sample.
    chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
    yield from _ffmpeg_stream(ffmpeg_command, chunk_len)
def ffmpeg_microphone_live(
    sampling_rate: int,
    chunk_length_s: float,
    stream_chunk_s: Optional[int] = None,
    stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
    format_for_conversion: str = "f32le",
):
    """
    Helper function to read audio from the microphone through ffmpeg, as overlapping chunks. Temporary `partial`
    chunks are emitted every `stream_chunk_s` seconds (when it is defined) until `chunk_length_s` worth of audio
    is available. Striding is used to give context on the "sides" of each chunk.

    Arguments:
        sampling_rate (`int`):
            The sampling rate to use when reading the data from the microphone. Try using the model's
            sampling rate to avoid resampling later.
        chunk_length_s (`float` or `int`):
            The length of the largest chunk of audio to be returned. This includes the eventual striding.
        stream_chunk_s (`float` or `int`, *optional*, defaults to `None`):
            The length of the smallest temporary piece of audio to be returned. When `None`, `chunk_length_s`
            is used instead.
        stride_length_s (`float` or `int` or `(float, float)`, *optional*, defaults to `None`):
            The length of the striding to be used, as a single value for both sides or as a `(left, right)`
            pair. When `None`, `chunk_length_s / 6` is used on both sides. Setting this does not change the
            length of the chunk.
        format_for_conversion (`str`, defaults to `f32le`):
            The name of the format of the audio samples to be returned by ffmpeg. `f32le` and `s16le` are
            handled.

    Return:
        A generator yielding dictionaries of the form
        `{"sampling_rate": int, "raw": np.ndarray, "stride": (int, int), "partial": bool}`.
        `raw` and `stride` are expressed in samples. `partial` tells whether the item is a temporary result
        that a later, larger chunk will replace (`True`) or a whole chunk (`False`). Items are dropped when the
        consumer has fallen more than ten `chunk_s` periods behind the expected audio clock.

    Raises:
        ValueError: if `format_for_conversion` is neither `s16le` nor `f32le`.
    """
    chunk_s = chunk_length_s if stream_chunk_s is None else stream_chunk_s

    microphone = ffmpeg_microphone(sampling_rate, chunk_s, format_for_conversion=format_for_conversion)

    if format_for_conversion == "s16le":
        dtype, size_of_sample = np.int16, 2
    elif format_for_conversion == "f32le":
        dtype, size_of_sample = np.float32, 4
    else:
        raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")

    if stride_length_s is None:
        stride_length_s = chunk_length_s / 6
    chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
    if isinstance(stride_length_s, (int, float)):
        # A single number means the same stride on both sides.
        stride_length_s = [stride_length_s, stride_length_s]

    # Everything handed to `chunk_bytes_iter` is measured in bytes, not samples.
    stride_left = int(round(sampling_rate * stride_length_s[0])) * size_of_sample
    stride_right = int(round(sampling_rate * stride_length_s[1])) * size_of_sample

    audio_time = datetime.datetime.now()
    delta = datetime.timedelta(seconds=chunk_s)
    byte_chunks = chunk_bytes_iter(microphone, chunk_len, stride=(stride_left, stride_right), stream=True)
    for item in byte_chunks:
        # Convert the byte-based payload and strides back to sample units.
        item["raw"] = np.frombuffer(item["raw"], dtype=dtype)
        item["stride"] = (
            item["stride"][0] // size_of_sample,
            item["stride"][1] // size_of_sample,
        )
        item["sampling_rate"] = sampling_rate

        # Advance the expected audio clock; only hand the item out if we are not hopelessly late.
        audio_time += delta
        if not datetime.datetime.now() > audio_time + 10 * delta:
            yield item
def chunk_bytes_iter(iterator, chunk_len: int, stride: Tuple[int, int], stream: bool = False):
    """
    Re-chunk the raw bytes coming out of `iterator` into pieces of `chunk_len` bytes.

    Consecutive chunks overlap by `stride[0] + stride[1]` bytes, and every yielded item carries the
    `(left, right)` stride that applies to it: the very first chunk has no left stride and the trailing
    remainder has no right stride. With `stream=True`, the bytes accumulated so far are also yielded as
    `"partial": True` items while a full `chunk_len` is not yet available, and complete items are tagged
    `"partial": False`.

    Raises:
        ValueError: if `stride[0] + stride[1]` is not strictly smaller than `chunk_len`. The check runs when
            the first item is requested.
    """
    buffer = b""
    stride_left, stride_right = stride
    if stride_left + stride_right >= chunk_len:
        raise ValueError(
            f"Stride needs to be strictly smaller than chunk_len: ({stride_left}, {stride_right}) vs {chunk_len}"
        )
    # How far the window moves forward after each full chunk.
    advance = chunk_len - stride_left - stride_right
    # The first emitted chunk has nothing on its left, hence no left stride yet.
    current_left = 0
    for piece in iterator:
        buffer += piece
        if stream and len(buffer) < chunk_len:
            # Not enough data for a full chunk: expose what has been gathered so far.
            yield {"raw": buffer[:chunk_len], "stride": (current_left, 0), "partial": True}
            continue
        while len(buffer) >= chunk_len:
            chunk = {"raw": buffer[:chunk_len], "stride": (current_left, stride_right)}
            if stream:
                chunk["partial"] = False
            yield chunk
            current_left = stride_left
            buffer = buffer[advance:]
    # Trailing remainder: only worth emitting if it holds more than the left overlap.
    if len(buffer) > stride_left:
        tail = {"raw": buffer, "stride": (current_left, 0)}
        if stream:
            tail["partial"] = False
        yield tail
def _ffmpeg_stream(ffmpeg_command, buflen: int):
    """
    Internal generator that launches `ffmpeg_command` and yields what it writes to its standard output.

    The output is read `buflen` bytes at a time and every non-empty read is yielded as `bytes`; iteration ends
    on the first empty read.

    Raises:
        ValueError: if the executable named in `ffmpeg_command` cannot be found.
    """
    pipe_buffer_size = 2**24  # 16Mo
    try:
        with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=pipe_buffer_size) as process:
            # Two-argument `iter` keeps calling the reader until it returns the empty-bytes sentinel.
            for raw in iter(lambda: process.stdout.read(buflen), b""):
                yield raw
    except FileNotFoundError as error:
        raise ValueError("ffmpeg was not found but is required to stream audio files from filename") from error