---
base_model: unsloth/csm-1b
tags:
- transformers
- csm
license: apache-2.0
language:
- en
datasets:
- beyoru/kafka-voice
---

# Usage

```python
import torch
from transformers import CsmForConditionalGeneration, AutoProcessor

model_id = "beyoru/kafka-sesame"
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
model.eval()

# generation settings; the static caches keep shapes fixed across calls
model.generation_config.max_length = 250  # big enough to avoid recompilation
model.generation_config.max_new_tokens = None  # unset, as it would take precedence over max_length
model.generation_config.cache_implementation = "static"
model.depth_decoder.generation_config.cache_implementation = "static"

# prepare the inputs
text = "[0]Hello from Sesame."  # `[0]` selects speaker id 0
inputs = processor(text, add_special_tokens=True).to(device)

# an equivalent way to prepare the same inputs (overwrites `inputs` above)
conversation = [
    {"role": "0", "content": [{"type": "text", "text": "Hello from Sesame."}]},
]
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# infer the model (torch.inference_mode disables autograd bookkeeping)
with torch.inference_mode():
    audio = model.generate(**inputs, output_audio=True)

processor.save_audio(audio, "example_without_context.wav")
```
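
The static-cache settings above only pay off when paired with `torch.compile`; on their own they don't speed anything up. A minimal sketch of that pairing (an assumption based on the generation config above, not something this card documents; the first `generate` call triggers compilation, so expect a slow warm-up):

```python
# optional: compile both decoders' forward passes for faster decoding
# (sketch; "reduce-overhead" uses CUDA graphs and assumes a CUDA device)
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
model.depth_decoder.forward = torch.compile(model.depth_decoder.forward, mode="reduce-overhead", fullgraph=True)

with torch.inference_mode():
    audio = model.generate(**inputs, output_audio=True)  # first call compiles; later calls reuse the graphs
```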
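
The output filename hints at a with-context variant: CSM can also be conditioned on earlier audio by interleaving audio segments into the conversation. A sketch, assuming a local 24 kHz recording `prompt.wav` of speaker 0 (the filename, the prompt text, and the `path` audio key are illustrative, following the Transformers audio chat-template convention; this repo does not ship such a file):

```python
# condition generation on a previous utterance (sketch; `prompt.wav` is hypothetical)
conversation = [
    {
        "role": "0",
        "content": [
            {"type": "text", "text": "Hello from Sesame."},
            {"type": "audio", "path": "prompt.wav"},  # 24 kHz audio of speaker 0
        ],
    },
    # the final, audio-less turn is the text to synthesize
    {"role": "0", "content": [{"type": "text", "text": "Nice to meet you."}]},
]
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

with torch.inference_mode():
    audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, "example_with_context.wav")
```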