Spaces:

smajumdar
/

nemo_multilingual_language_id

Runtime error

App Files Files Community

smajumdar committed on Dec 6, 2022

Commit

d40d29c

1 Parent(s): 39cf8cc

Add support for YT transcription

Browse files

Files changed (4) hide show

app.py +324 -16
requirements.txt +2 -1
speech_to_text_buffered_infer_ctc.py +193 -0
speech_to_text_buffered_infer_rnnt.py +247 -0

app.py CHANGED Viewed

@@ -1,7 +1,21 @@
 import gradio as gr
-import torch
 import nemo.collections.asr as nemo_asr
 SAMPLE_RATE = 16000
 TITLE = "NeMo ASR Inference on Hugging Face"
@@ -32,7 +46,7 @@ ARTICLE = """
 SUPPORTED_LANGUAGES = set([])
 SUPPORTED_MODEL_NAMES = set([])
-# HF models
 hf_filter = nemo_asr.models.ASRModel.get_hf_model_filter()
 hf_filter.task = "automatic-speech-recognition"
@@ -44,6 +58,8 @@ for info in hf_infos:
 SUPPORTED_MODEL_NAMES = sorted(list(SUPPORTED_MODEL_NAMES))
 model_dict = {model_name: gr.Interface.load(f'models/{model_name}') for model_name in SUPPORTED_MODEL_NAMES}
 SUPPORTED_LANG_MODEL_DICT = {}
@@ -63,8 +79,253 @@ for lang in SUPPORTED_LANG_MODEL_DICT.keys():
     SUPPORTED_LANG_MODEL_DICT[lang] = model_ids
 def transcribe(microphone, audio_file, model_name):
-    model = model_dict[model_name]
     warn_output = ""
     if (microphone is not None) and (audio_file is not None):
@@ -84,7 +345,7 @@ def transcribe(microphone, audio_file, model_name):
     try:
         # Use HF API for transcription
-        transcriptions = model(audio_data)
     except Exception as e:
         transcriptions = ""
@@ -98,21 +359,38 @@ def transcribe(microphone, audio_file, model_name):
     return warn_output + transcriptions
-demo = gr.Blocks(title=TITLE, css=CSS)
-with demo:
-    header = gr.Markdown(MARKDOWN)
-    with gr.Row() as row:
-        file_upload = gr.components.Audio(source="upload", type='filepath', label='Upload File')
-        microphone = gr.components.Audio(source="microphone", type='filepath', label='Microphone')
     lang_selector = gr.components.Dropdown(
         choices=sorted(list(SUPPORTED_LANGUAGES)), value="en", type="value", label="Languages", interactive=True,
     )
     models_in_lang = gr.components.Dropdown(
         choices=sorted(list(SUPPORTED_LANG_MODEL_DICT["en"])),
-        value=DEFAULT_EN_MODEL,
         label="Models",
         interactive=True,
     )
@@ -122,17 +400,47 @@ with demo:
         default = models_names[0]
         if lang == 'en':
-            default = DEFAULT_EN_MODEL
         return models_in_lang.update(choices=models_names, value=default)
     lang_selector.change(update_models_with_lang, inputs=[lang_selector], outputs=[models_in_lang])
-    transcript = gr.components.Label(label='Transcript')
-    run = gr.components.Button('Transcribe')
-    run.click(transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript])
     gr.components.HTML(ARTICLE)
 demo.queue(concurrency_count=1)
-demo.launch()

+import os
+import json
+import uuid
+import tempfile
+import subprocess
+import re
 import gradio as gr
+import pytube as pt
 import nemo.collections.asr as nemo_asr
+import speech_to_text_buffered_infer_ctc as buffered_ctc
+import speech_to_text_buffered_infer_rnnt as buffered_rnnt
+# Set NeMo cache dir as /tmp
+from nemo import constants
+os.environ[constants.NEMO_ENV_CACHE_DIR] = "/tmp/nemo"
 SAMPLE_RATE = 16000
 TITLE = "NeMo ASR Inference on Hugging Face"
 SUPPORTED_LANGUAGES = set([])
 SUPPORTED_MODEL_NAMES = set([])
+# HF models, grouped by language identifier
 hf_filter = nemo_asr.models.ASRModel.get_hf_model_filter()
 hf_filter.task = "automatic-speech-recognition"
 SUPPORTED_MODEL_NAMES = sorted(list(SUPPORTED_MODEL_NAMES))
+SUPPORTED_MODEL_NAMES = list(filter(lambda x: 'en' in x and 'conformer_transducer_large' in x, SUPPORTED_MODEL_NAMES))
 model_dict = {model_name: gr.Interface.load(f'models/{model_name}') for model_name in SUPPORTED_MODEL_NAMES}
 SUPPORTED_LANG_MODEL_DICT = {}
     SUPPORTED_LANG_MODEL_DICT[lang] = model_ids
def parse_duration(audio_file):
    """
    Compute the duration of an audio file, in seconds, by parsing the output of `ffmpeg -i`.

    FFMPEG is used to calculate durations. Libraries can do it too, but filetypes cause different
    libraries to behave differently.

    Args:
        audio_file: Path to the audio file to probe.

    Returns:
        Duration of the audio in seconds, as a float.

    Raises:
        ValueError: If the ffmpeg output contains no parsable `Duration:` entry (for example when
            the file does not exist or is not a media file).
        OSError: If the `ffmpeg` executable cannot be launched.
    """
    # stderr is merged into stdout so that a single buffer holds everything ffmpeg printed.
    # stdin is detached so ffmpeg can never block waiting for interactive input.
    process = subprocess.Popen(
        ['ffmpeg', '-i', audio_file],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    stdout, _ = process.communicate()

    # Media metadata is not guaranteed to be valid UTF-8; never fail on decoding alone.
    output = stdout.decode(errors='replace')

    match = re.search(
        r"Duration:\s{1}(?P<hours>\d+?):(?P<minutes>\d+?):(?P<seconds>\d+\.\d+?),", output, re.DOTALL
    )
    if match is None:
        raise ValueError(f"Could not parse the duration of audio file : {audio_file}")

    fields = match.groupdict()
    duration = 0.0
    duration += float(fields['hours']) * 60.0 * 60.0
    duration += float(fields['minutes']) * 60.0
    duration += float(fields['seconds']) * 1.0
    return duration
def resolve_model_type(model_name: str) -> "str | None":
    """
    Map model name to a class type, without loading the model. Has some hardcoded assumptions in
    semantics of model naming.

    Args:
        model_name: Str name of the model (potentially with / to denote HF models).

    Returns:
        One of 'hybrid', 'transducer' or 'ctc', or None when the type cannot be inferred from the name.
    """
    # Loss specific maps. 'hybrid' is tested first because hybrid model names may also contain
    # 'ctc' or 'transducer' (e.g. 'hybrid_ctc', 'hybrid_transducer').
    if 'hybrid' in model_name:
        return 'hybrid'
    if 'transducer' in model_name or 'rnnt' in model_name:
        return 'transducer'
    if 'ctc' in model_name:
        return 'ctc'

    # Model specific maps: these architecture families are treated as CTC models.
    ctc_families = ('jasper', 'quartznet', 'citrinet', 'contextnet')
    if any(family in model_name for family in ctc_families):
        return 'ctc'

    # Unknown model type
    return None
def resolve_model_stride(model_name) -> int:
    """
    Look up the pre-computed stride of a model from its name.
    The model is not loaded to obtain this information.

    Args:
        model_name: Str name of the model.

    Returns:
        The stride of the first matching architecture family, or -1 when no family matches.
    """
    # Checked in order; the first family whose name occurs in `model_name` wins.
    family_strides = (
        ('jasper', 2),
        ('quartznet', 2),
        ('conformer', 4),
        ('squeezeformer', 4),
        ('citrinet', 8),
        ('contextnet', 8),
    )
    for family, stride in family_strides:
        if family in model_name:
            return stride
    return -1
def convert_audio(audio_filepath):
    """
    Transcode an audio file (e.g. mp3) to a monochannel wav file sampled at SAMPLE_RATE Hz.

    Args:
        audio_filepath: Path to the input audio file.

    Returns:
        `audio_filepath` unchanged if it already has a wav extension, the path of the newly written
        wav file (same location and stem as the input) if transcoding succeeded, otherwise None.
    """
    stem, ext = os.path.splitext(audio_filepath)

    # os.path.splitext keeps the leading dot of the extension.
    if ext.lower() == '.wav':
        return audio_filepath

    # `stem` already carries the directory part of the input path.
    out_filename = stem + '.wav'

    # '-y' overwrites a stale output from an earlier run instead of prompting; stdin is detached
    # so ffmpeg can never block waiting for interactive input.
    process = subprocess.Popen(
        ['ffmpeg', '-y', '-i', audio_filepath, '-ac', '1', '-ar', str(SAMPLE_RATE), out_filename],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    process.communicate()

    # Only report success when ffmpeg itself succeeded and the output file is actually there.
    if process.returncode == 0 and os.path.exists(out_filename):
        return out_filename
    return None
def extract_result_from_manifest(filepath, model_name) -> "tuple[bool, str]":
    """
    Parse the written manifest which is result of the buffered inference process.

    Args:
        filepath: Path to the manifest, with one JSON object per line.
        model_name: Str name of the model, only used to build the failure message.

    Returns:
        (True, text) where text is the `pred_text` of the first parsable manifest entry, or
        (False, message) when no entry with a `pred_text` field could be read.
    """
    predictions = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                predictions.append(json.loads(line)['pred_text'])
            except (ValueError, KeyError, TypeError):
                # Best effort: skip blank or malformed lines and entries without a prediction.
                continue

    if predictions:
        return True, predictions[0]
    return False, f"Could not perform inference on model with name : {model_name}"
def _run_buffered_inference(buffered_module, model_name, audio_dir, model_stride):
    """Run one buffered inference script over `audio_dir` and parse the manifest it writes."""
    config = buffered_module.TranscriptionConfig(
        pretrained_name=model_name,
        audio_dir=audio_dir,
        output_filename="output.json",
        audio_type="wav",
        overwrite_transcripts=True,
        model_stride=model_stride,
        chunk_len_in_secs=20.0,
        total_buffer_in_secs=30.0,
    )
    buffered_module.main(config)
    return extract_result_from_manifest('output.json', model_name)


def infer_audio(model_name: str, audio_file: str) -> str:
    """
    Main method that switches from HF inference for small audio files to Buffered CTC/RNNT mode for long audio files.

    Args:
        model_name: Str name of the model (potentially with / to denote HF models)
        audio_file: Path to an audio file (mp3 or wav)

    Returns:
        str which is the transcription if successful, otherwise a message describing the failure.
    """
    # Parse the duration of the audio file
    duration = parse_duration(audio_file)

    if duration <= 60.0:
        # Short audio (at most one minute): use HF API for transcription
        model = model_dict.get(model_name)
        if model is None:
            return (
                f"Could not find model {model_name} in list of available models : "
                f"{list(model_dict)}"
            )
        return model(audio_file)

    # Longer than one minute; use buffered mode.
    # Process audio to be of wav type (possible youtube audio)
    audio_file = convert_audio(audio_file)

    # If audio file transcoding failed, let user know
    if audio_file is None:
        return "Failed to convert audio file to wav."

    # Extract audio dir from resolved audio filepath. The buffered scripts transcribe every wav
    # file found under this directory and only the first manifest entry is reported back.
    audio_dir = os.path.split(audio_file)[0]

    # Next calculate the stride of each model
    model_stride = resolve_model_stride(model_name)
    if model_stride < 0:
        return f"Failed to compute the model stride for model with name : {model_name}"

    # Process model type (CTC/RNNT/Hybrid)
    model_type = resolve_model_type(model_name)

    if model_type == 'ctc':
        # CTC Buffered Inference
        return _run_buffered_inference(buffered_ctc, model_name, audio_dir, model_stride)[-1]

    if model_type == 'transducer':
        # RNNT Buffered Inference
        return _run_buffered_inference(buffered_rnnt, model_name, audio_dir, model_stride)[-1]

    if model_type is None:
        # Model type could not be inferred. Try all feasible options and keep the first that works.
        for buffered_module in (buffered_ctc, buffered_rnnt):
            try:
                success, text = _run_buffered_inference(buffered_module, model_name, audio_dir, model_stride)
            except Exception:
                # Deliberate best effort: a script failing here most likely means the checkpoint
                # is not of the type this script handles, so move on to the next candidate.
                continue
            if success:
                return text

    # Reached for unsupported types (e.g. 'hybrid') and when every fallback attempt failed.
    return f"Could not parse model type; failed to perform inference with model {model_name}!"
 def transcribe(microphone, audio_file, model_name):
     warn_output = ""
     if (microphone is not None) and (audio_file is not None):
     try:
         # Use HF API for transcription
+        transcriptions = infer_audio(model_name, audio_data)
     except Exception as e:
         transcriptions = ""
     return warn_output + transcriptions
def _return_yt_html_embed(yt_url):
    """
    Build the HTML snippet that embeds the YouTube player for a video URL.

    Args:
        yt_url: URL of the video, e.g. `https://www.youtube.com/watch?v=<id>` (extra query
            parameters are allowed) or `https://youtu.be/<id>`.

    Returns:
        str with a centered iframe pointing at `https://www.youtube.com/embed/<id>`.
    """
    from urllib.parse import parse_qs, quote, urlparse

    parsed = urlparse(yt_url)
    ids_in_query = parse_qs(parsed.query).get("v")
    if ids_in_query:
        # Standard watch URL; other parameters such as `&t=30s` are ignored.
        video_id = ids_in_query[0]
    else:
        # Short / embed style URLs carry the id as the last path segment.
        video_id = parsed.path.rstrip("/").rsplit("/", 1)[-1]

    # The URL is user supplied: percent-encode the id so it cannot break out of the src attribute.
    video_id = quote(video_id, safe="")

    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str
def yt_transcribe(yt_url, model_name):
    """
    Download the audio track of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video.
        model_name: Str name of the model used for transcription.

    Returns:
        Tuple of (HTML snippet embedding the video, transcription or failure message).
    """
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)

    # The audio only lives for the duration of the transcription; the directory is removed on exit.
    with tempfile.TemporaryDirectory() as tempdir:
        audio_path = os.path.join(tempdir, f"{uuid.uuid4().hex}.mp3")

        # `first()` returns None instead of raising when the video exposes no audio-only stream.
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            return html_embed_str, f"Could not find an audio stream for URL : {yt_url}"

        stream.download(filename=audio_path)
        text = infer_audio(model_name, audio_path)

    return html_embed_str, text
+def create_lang_selector_component(default_en_model=DEFAULT_EN_MODEL):
     lang_selector = gr.components.Dropdown(
         choices=sorted(list(SUPPORTED_LANGUAGES)), value="en", type="value", label="Languages", interactive=True,
     )
     models_in_lang = gr.components.Dropdown(
         choices=sorted(list(SUPPORTED_LANG_MODEL_DICT["en"])),
+        value=default_en_model,
         label="Models",
         interactive=True,
     )
         default = models_names[0]
         if lang == 'en':
+            default = default_en_model
         return models_in_lang.update(choices=models_names, value=default)
     lang_selector.change(update_models_with_lang, inputs=[lang_selector], outputs=[models_in_lang])
+    return lang_selector, models_in_lang
+demo = gr.Blocks(title=TITLE, css=CSS)
+with demo:
+    header = gr.Markdown(MARKDOWN)
+    with gr.Tab("Transcribe Audio"):
+        with gr.Row() as row:
+            file_upload = gr.components.Audio(source="upload", type='filepath', label='Upload File')
+            microphone = gr.components.Audio(source="microphone", type='filepath', label='Microphone')
+        lang_selector, models_in_lang = create_lang_selector_component()
+        transcript = gr.components.Label(label='Transcript')
+        run = gr.components.Button('Transcribe')
+        run.click(transcribe, inputs=[microphone, file_upload, models_in_lang], outputs=[transcript])
+    with gr.Tab("Transcribe Youtube"):
+        yt_url = gr.components.Textbox(
+            lines=1, label="Youtube URL", placeholder="Paste the URL to a YouTube video here"
+        )
+        lang_selector_yt, models_in_lang_yt = create_lang_selector_component(
+            default_en_model='nvidia/stt_en_conformer_transducer_large'
+        )
+        embedded_video = gr.components.HTML()
+        transcript = gr.components.Label(label='Transcript')
+        run = gr.components.Button('Transcribe YouTube')
+        run.click(yt_transcribe, inputs=[yt_url, models_in_lang_yt], outputs=[embedded_video, transcript])
     gr.components.HTML(ARTICLE)
 demo.queue(concurrency_count=1)
+demo.launch(enable_queue=True)

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- nemo_toolkit[asr]


1	+ git+https://github.com/NVIDIA/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]
2	+ pytube

speech_to_text_buffered_infer_ctc.py ADDED Viewed

	@@ -0,0 +1,193 @@

+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script serves three goals:
+    (1) Demonstrate how to use NeMo Models outside of PytorchLightning
+    (2) Shows example of batch ASR inference
+    (3) Serves as CI test for pre-trained checkpoint
+python speech_to_text_buffered_infer_ctc.py \
+    model_path=null \
+    pretrained_name=null \
+    audio_dir="<remove or path to folder of audio files>" \
+    dataset_manifest="<remove or path to manifest>" \
+    output_filename="<remove or specify output filename>" \
+    total_buffer_in_secs=4.0 \
+    chunk_len_in_secs=1.6 \
+    model_stride=4 \
+    batch_size=32
+# NOTE:
+    You can use `DEBUG=1 python speech_to_text_buffered_infer_ctc.py ...` to print out the
+    predictions of the model, and the ground-truth text if present in the manifest.
+"""
+import contextlib
+import copy
+import glob
+import math
+import os
+from dataclasses import dataclass, is_dataclass
+from typing import Optional
+import torch
+from omegaconf import OmegaConf
+from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchASR
+from nemo.collections.asr.parts.utils.transcribe_utils import (
+    compute_output_filename,
+    get_buffered_pred_feat,
+    setup_model,
+    write_transcription,
+)
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+can_gpu = torch.cuda.is_available()
+@dataclass
+class TranscriptionConfig:  # Options read by `main` for buffered CTC transcription.
+    # Required configs: one model source and one audio source must be set (validated in `main`).
+    model_path: Optional[str] = None  # Path to a .nemo file
+    pretrained_name: Optional[str] = None  # Name of a pretrained model
+    audio_dir: Optional[str] = None  # Path to a directory which contains audio files
+    dataset_manifest: Optional[str] = None  # Path to dataset's JSON manifest; ignored by `main` when `audio_dir` is set
+    # General configs
+    output_filename: Optional[str] = None  # Output file for the transcriptions; `main` passes the config through `compute_output_filename`
+    batch_size: int = 32  # Batch size handed to `FrameBatchASR` in `main`
+    num_workers: int = 0  # NOTE(review): not read by `main` directly; presumably consumed by the NeMo helpers - confirm
+    append_pred: bool = False  # Sets mode of work, if True it will add new field transcriptions.
+    pred_name_postfix: Optional[str] = None  # If you need to use another model name, rather than standard one.
+    # Chunked configs
+    chunk_len_in_secs: float = 1.6  # Chunk length in seconds
+    total_buffer_in_secs: float = 4.0  # Length of buffer (chunk + left and right padding) in seconds
+    model_stride: int = 8  # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models
+    # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA
+    # device anyway, and do inference on CPU only if CUDA device is not found.
+    # If `cuda` is a negative number, inference will be on CPU only.
+    cuda: Optional[int] = None
+    amp: bool = False  # When True and CUDA AMP is available, `main` logs "AMP enabled!" and selects `torch.cuda.amp.autocast`
+    audio_type: str = "wav"  # File extension that `main` globs for (recursively) inside `audio_dir`
+    # Recompute model transcription, even if the output folder exists with scores.
+    overwrite_transcripts: bool = True
@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
    """
    Run buffered (chunked) CTC inference over a directory of audio files or a manifest, and write
    the transcriptions to the configured output file.

    Args:
        cfg: Transcription options. At least one of `model_path` / `pretrained_name` and at least
            one of `audio_dir` / `dataset_manifest` must be set.

    Returns:
        The config as returned by `compute_output_filename`.

    Raises:
        ValueError: If no model source or no audio source is configured.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    torch.set_grad_enabled(False)

    if is_dataclass(cfg):
        cfg = OmegaConf.structured(cfg)

    if cfg.model_path is None and cfg.pretrained_name is None:
        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!")
    if cfg.audio_dir is None and cfg.dataset_manifest is None:
        raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!")

    filepaths = None
    manifest = cfg.dataset_manifest
    if cfg.audio_dir is not None:
        filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
        manifest = None  # ignore dataset_manifest if both audio_dir and dataset_manifest are present

    # setup GPU
    if cfg.cuda is None:
        # No explicit choice: use the 0th CUDA device when one is available, otherwise the CPU.
        cuda_index = 0 if torch.cuda.is_available() else None
    elif cfg.cuda < 0:
        # A negative index explicitly requests CPU-only inference.
        cuda_index = None
    else:
        cuda_index = cfg.cuda
    map_location = torch.device('cpu' if cuda_index is None else 'cuda:{}'.format(cuda_index))
    logging.info(f"Inference will be done on device : {map_location}")

    asr_model, model_name = setup_model(cfg, map_location)

    # Work on a copy so that the streaming tweaks below do not touch the model's own config.
    model_cfg = copy.deepcopy(asr_model._cfg)
    OmegaConf.set_struct(model_cfg.preprocessor, False)
    # some changes for streaming scenario
    model_cfg.preprocessor.dither = 0.0
    model_cfg.preprocessor.pad_to = 0
    if model_cfg.preprocessor.normalize != "per_feature":
        logging.error("Only EncDecCTCModelBPE models trained with per_feature normalization are supported currently")
    # Disable config overwriting
    OmegaConf.set_struct(model_cfg.preprocessor, True)

    # setup AMP (optional)
    # NOTE: `autocast` is only selected here; nothing below in this function enters it.
    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
        logging.info("AMP enabled!\n")
        autocast = torch.cuda.amp.autocast
    else:

        @contextlib.contextmanager
        def autocast():
            yield

    # Compute output filename
    cfg = compute_output_filename(cfg, model_name)

    # if transcripts should not be overwritten, and already exists, skip re-transcription step and return
    if not cfg.overwrite_transcripts and os.path.exists(cfg.output_filename):
        logging.info(
            f"Previous transcripts found at {cfg.output_filename}, and flag `overwrite_transcripts`"
            f"is {cfg.overwrite_transcripts}. Returning without re-transcribing text."
        )
        return cfg

    asr_model.eval()
    asr_model = asr_model.to(asr_model.device)

    # Convert the chunk and buffer lengths from seconds into numbers of model output tokens.
    feature_stride = model_cfg.preprocessor['window_stride']
    model_stride_in_secs = feature_stride * cfg.model_stride
    total_buffer = cfg.total_buffer_in_secs
    chunk_len = float(cfg.chunk_len_in_secs)

    tokens_per_chunk = math.ceil(chunk_len / model_stride_in_secs)
    mid_delay = math.ceil((chunk_len + (total_buffer - chunk_len) / 2) / model_stride_in_secs)
    logging.info(f"tokens_per_chunk is {tokens_per_chunk}, mid_delay is {mid_delay}")

    frame_asr = FrameBatchASR(
        asr_model=asr_model, frame_len=chunk_len, total_buffer=cfg.total_buffer_in_secs, batch_size=cfg.batch_size,
    )

    hyps = get_buffered_pred_feat(
        frame_asr,
        chunk_len,
        tokens_per_chunk,
        mid_delay,
        model_cfg.preprocessor,
        model_stride_in_secs,
        asr_model.device,
        manifest,
        filepaths,
    )
    output_filename = write_transcription(hyps, cfg, model_name, filepaths=filepaths, compute_langs=False)
    logging.info(f"Finished writing predictions to {output_filename}!")

    return cfg
+if __name__ == '__main__':
+    main()  # noqa pylint: disable=no-value-for-parameter

speech_to_text_buffered_infer_rnnt.py ADDED Viewed

	@@ -0,0 +1,247 @@

+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Script to perform buffered inference using RNNT models.
+Buffered inference is the primary form of audio transcription when the audio segment is longer than 20-30 seconds.
+This is especially useful for models such as Conformers, which have quadratic time and memory scaling with
+audio duration.
+The difference between streaming and buffered inference is the chunk size (or the latency of inference).
+Buffered inference will use large chunk sizes (5-10 seconds) + some additional buffer for context.
+Streaming inference will use small chunk sizes (0.1 to 0.25 seconds) + some additional buffer for context.
+# Middle Token merge algorithm
+python speech_to_text_buffered_infer_rnnt.py \
+    model_path=null \
+    pretrained_name=null \
+    audio_dir="<remove or path to folder of audio files>" \
+    dataset_manifest="<remove or path to manifest>" \
+    output_filename="<remove or specify output filename>" \
+    total_buffer_in_secs=4.0 \
+    chunk_len_in_secs=1.6 \
+    model_stride=4 \
+    batch_size=32
+# Longest Common Subsequence (LCS) Merge algorithm
+python speech_to_text_buffered_infer_rnnt.py \
+    model_path=null \
+    pretrained_name=null \
+    audio_dir="<remove or path to folder of audio files>" \
+    dataset_manifest="<remove or path to manifest>" \
+    output_filename="<remove or specify output filename>" \
+    total_buffer_in_secs=4.0 \
+    chunk_len_in_secs=1.6 \
+    model_stride=4 \
+    batch_size=32 \
+    merge_algo="lcs" \
+    lcs_alignment_dir=<OPTIONAL: Some path to store the LCS alignments>
+# NOTE:
+    You can use `DEBUG=1 python speech_to_text_buffered_infer_rnnt.py ...` to print out the
+    predictions of the model, and the ground-truth text if present in the manifest.
+"""
+import copy
+import glob
+import math
+import os
+from dataclasses import dataclass, is_dataclass
+from typing import Optional
+import torch
+from omegaconf import OmegaConf, open_dict
+from nemo.collections.asr.parts.utils.streaming_utils import (
+    BatchedFrameASRRNNT,
+    LongestCommonSubsequenceBatchedFrameASRRNNT,
+)
+from nemo.collections.asr.parts.utils.transcribe_utils import (
+    compute_output_filename,
+    get_buffered_pred_feat_rnnt,
+    setup_model,
+    write_transcription,
+)
+from nemo.core.config import hydra_runner
+from nemo.utils import logging
+can_gpu = torch.cuda.is_available()
+@dataclass
+class TranscriptionConfig:  # Options read by `main` for buffered RNNT (transducer) transcription.
+    # Required configs: one model source and one audio source must be set (validated in `main`).
+    model_path: Optional[str] = None  # Path to a .nemo file
+    pretrained_name: Optional[str] = None  # Name of a pretrained model
+    audio_dir: Optional[str] = None  # Path to a directory which contains audio files
+    dataset_manifest: Optional[str] = None  # Path to dataset's JSON manifest; ignored by `main` when `audio_dir` is set
+    # General configs
+    output_filename: Optional[str] = None  # Output file for the transcriptions; `main` passes the config through `compute_output_filename`
+    batch_size: int = 32  # Batch size handed to the frame ASR object and to `get_buffered_pred_feat_rnnt` in `main`
+    num_workers: int = 0  # NOTE(review): not read by `main` directly; presumably consumed by the NeMo helpers - confirm
+    append_pred: bool = False  # Sets mode of work, if True it will add new field transcriptions.
+    pred_name_postfix: Optional[str] = None  # If you need to use another model name, rather than standard one.
+    # Chunked configs
+    chunk_len_in_secs: float = 1.6  # Chunk length in seconds
+    total_buffer_in_secs: float = 4.0  # Length of buffer (chunk + left and right padding) in seconds
+    model_stride: int = 8  # Model downsampling factor, 8 for Citrinet models and 4 for Conformer models
+    # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA
+    # device anyway, and do inference on CPU only if CUDA device is not found.
+    # If `cuda` is a negative number, inference will be on CPU only.
+    cuda: Optional[int] = None
+    audio_type: str = "wav"  # File extension that `main` globs for (recursively) inside `audio_dir`
+    # Recompute model transcription, even if the output folder exists with scores.
+    overwrite_transcripts: bool = True
+    # Decoding configs
+    max_steps_per_timestep: int = 5  # Maximum number of tokens decoded per acoustic timestep
+    stateful_decoding: bool = False  # Whether to perform stateful decoding; `main` uses the "greedy" strategy when True, "greedy_batch" otherwise
+    # Merge algorithm for transducers
+    merge_algo: Optional[str] = 'middle'  # choices=['middle', 'lcs'], choice of algorithm to apply during inference; any other value makes `main` raise ValueError
+    lcs_alignment_dir: Optional[str] = None  # Path to a directory to store LCS algo alignments
@hydra_runner(config_name="TranscriptionConfig", schema=TranscriptionConfig)
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
    """
    Run buffered (chunked) RNNT inference over a directory of audio files or a manifest, and write
    the transcriptions to the configured output file.

    Args:
        cfg: Transcription options. At least one of `model_path` / `pretrained_name` and at least
            one of `audio_dir` / `dataset_manifest` must be set; `merge_algo` must be 'middle' or 'lcs'.

    Returns:
        The config as returned by `compute_output_filename`.

    Raises:
        ValueError: If no model source or no audio source is configured, or if `merge_algo` is
            not one of the supported choices.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    torch.set_grad_enabled(False)

    if is_dataclass(cfg):
        cfg = OmegaConf.structured(cfg)

    if cfg.model_path is None and cfg.pretrained_name is None:
        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!")
    if cfg.audio_dir is None and cfg.dataset_manifest is None:
        raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!")

    filepaths = None
    manifest = cfg.dataset_manifest
    if cfg.audio_dir is not None:
        filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
        manifest = None  # ignore dataset_manifest if both audio_dir and dataset_manifest are present

    # setup GPU
    if cfg.cuda is None:
        # No explicit choice: use the 0th CUDA device when one is available, otherwise the CPU.
        cuda_index = 0 if torch.cuda.is_available() else None
    elif cfg.cuda < 0:
        # A negative index explicitly requests CPU-only inference.
        cuda_index = None
    else:
        cuda_index = cfg.cuda
    map_location = torch.device('cpu' if cuda_index is None else 'cuda:{}'.format(cuda_index))
    logging.info(f"Inference will be done on device : {map_location}")

    asr_model, model_name = setup_model(cfg, map_location)

    # Work on a copy so that the streaming tweaks below do not touch the model's own config.
    model_cfg = copy.deepcopy(asr_model._cfg)
    OmegaConf.set_struct(model_cfg.preprocessor, False)
    # some changes for streaming scenario
    model_cfg.preprocessor.dither = 0.0
    model_cfg.preprocessor.pad_to = 0
    if model_cfg.preprocessor.normalize != "per_feature":
        logging.error("Only EncDecRNNTBPEModel models trained with per_feature normalization are supported currently")
    # Disable config overwriting
    OmegaConf.set_struct(model_cfg.preprocessor, True)

    # Compute output filename
    cfg = compute_output_filename(cfg, model_name)

    # if transcripts should not be overwritten, and already exists, skip re-transcription step and return
    if not cfg.overwrite_transcripts and os.path.exists(cfg.output_filename):
        logging.info(
            f"Previous transcripts found at {cfg.output_filename}, and flag `overwrite_transcripts`"
            f"is {cfg.overwrite_transcripts}. Returning without re-transcribing text."
        )
        return cfg

    asr_model.freeze()
    asr_model = asr_model.to(asr_model.device)

    # Change Decoding Config
    decoding_cfg = asr_model.cfg.decoding
    with open_dict(decoding_cfg):
        if cfg.stateful_decoding:
            decoding_cfg.strategy = "greedy"
        else:
            decoding_cfg.strategy = "greedy_batch"
        decoding_cfg.preserve_alignments = True  # required to compute the middle token for transducers.
        decoding_cfg.fused_batch_size = -1  # temporarily stop fused batch during inference.
    asr_model.change_decoding_strategy(decoding_cfg)

    # Convert the chunk and buffer lengths from seconds into numbers of model output tokens.
    feature_stride = model_cfg.preprocessor['window_stride']
    model_stride_in_secs = feature_stride * cfg.model_stride
    total_buffer = cfg.total_buffer_in_secs
    chunk_len = float(cfg.chunk_len_in_secs)

    tokens_per_chunk = math.ceil(chunk_len / model_stride_in_secs)
    mid_delay = math.ceil((chunk_len + (total_buffer - chunk_len) / 2) / model_stride_in_secs)
    logging.info(f"tokens_per_chunk is {tokens_per_chunk}, mid_delay is {mid_delay}")

    if cfg.merge_algo == 'middle':
        frame_asr = BatchedFrameASRRNNT(
            asr_model=asr_model,
            frame_len=chunk_len,
            total_buffer=cfg.total_buffer_in_secs,
            batch_size=cfg.batch_size,
            max_steps_per_timestep=cfg.max_steps_per_timestep,
            stateful_decoding=cfg.stateful_decoding,
        )
    elif cfg.merge_algo == 'lcs':
        frame_asr = LongestCommonSubsequenceBatchedFrameASRRNNT(
            asr_model=asr_model,
            frame_len=chunk_len,
            total_buffer=cfg.total_buffer_in_secs,
            batch_size=cfg.batch_size,
            max_steps_per_timestep=cfg.max_steps_per_timestep,
            stateful_decoding=cfg.stateful_decoding,
            alignment_basepath=cfg.lcs_alignment_dir,
        )
        # Set the LCS algorithm delay.
        frame_asr.lcs_delay = math.floor(((total_buffer - chunk_len)) / model_stride_in_secs)
    else:
        raise ValueError("Invalid choice of merge algorithm for transducer buffered inference.")

    hyps = get_buffered_pred_feat_rnnt(
        asr=frame_asr,
        tokens_per_chunk=tokens_per_chunk,
        delay=mid_delay,
        model_stride_in_secs=model_stride_in_secs,
        batch_size=cfg.batch_size,
        manifest=manifest,
        filepaths=filepaths,
    )

    output_filename = write_transcription(hyps, cfg, model_name, filepaths=filepaths, compute_langs=False)
    logging.info(f"Finished writing predictions to {output_filename}!")

    return cfg
+if __name__ == '__main__':
+    main()  # noqa pylint: disable=no-value-for-parameter