|
--- |
|
base_model: unsloth/csm-1b |
|
tags: |
|
- transformers |
|
- csm |
|
license: apache-2.0 |
|
language: |
|
- en |
|
datasets: |
|
- beyoru/kafka-voice |
|
--- |
|
|
|
# Usage |
|
```python
|
# Minimal CSM text-to-speech example: load the fine-tuned model, prepare a
# single-speaker prompt two equivalent ways, and generate audio to a WAV file.
import torch

from transformers import CsmForConditionalGeneration, AutoProcessor

model_id = "beyoru/kafka-sesame"
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
model.eval()

# generation settings: a fixed max_length plus static KV caches keeps shapes
# stable so torch.compile does not recompile between calls
model.generation_config.max_length = 250  # big enough to avoid recompilation
model.generation_config.max_new_tokens = None  # would take precedence over max_length
model.generation_config.cache_implementation = "static"
model.depth_decoder.generation_config.cache_implementation = "static"

# prepare the inputs
text = "[0]Hello from Sesame."  # `[0]` for speaker id 0
inputs = processor(text, add_special_tokens=True).to(device)

# another equivalent way to prepare the inputs (chat-template form;
# overwrites `inputs` — the two forms produce the same tensors)
conversation = [
    {"role": "0", "content": [{"type": "text", "text": "Hello from Sesame."}]},
]
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# infer the model
# FIX: the original used `@torch.interface_mode()`, which does not exist in
# PyTorch (the API is `torch.inference_mode`), and a decorator cannot be
# applied to an assignment statement — use it as a context manager instead.
with torch.inference_mode():
    audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, "example_without_context.wav")
|
|
|
``` |