VoxPolska
Collection
VoxPolska is an advanced Polish Text-to-Speech (TTS) model designed to convert written Polish text into
high-quality, natural-sounding speech.
•
3 items
•
Updated
!pip install transformers ipython
from transformers import pipeline
from IPython.display import Audio
pipe = pipeline("text-to-speech", model="salihfurkaan/VoxPolska-Auralis")
output = pipe("CzeΕΔ, jestem modelem sztucznej inteligencji mΓ³wiΔ
cym po polsku")
Audio(output["audio"], rate=output["sampling_rate"])
!pip install --no-deps unsloth==2025.4.1 bitsandbytes unsloth_zoo trl==0.15.2
!pip install xcodec2==0.1.5 --no-deps
!pip install vector_quantize_pytorch
from unsloth import FastLanguageModel
import torch
from xcodec2.modeling_xcodec2 import XCodec2Model
import torchaudio
import soundfile as sf
from IPython.display import display, Audio
from transformers import AutoTokenizer, AutoModelForCausalLM
# Polish sentence to synthesize
# ("Hi, I am an artificial-intelligence model that speaks Polish.").
input_text = "Cześć, jestem modelem sztucznej inteligencji mówiącym po polsku."

XCODEC2_MODEL_NAME = "HKUST-Audio/xcodec2"
SAMPLE_RATE = 16000  # Sample rate (Hz) used when writing and playing the waveform.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the codec that turns speech-token ids back into a waveform. It is kept
# on the CPU in eval mode: the decoding step below feeds it a CPU tensor, so
# moving it to the GPU first and then back again would only waste time.
codec_model = XCodec2Model.from_pretrained(XCODEC2_MODEL_NAME)
codec_model = codec_model.eval().to("cpu")

# Load the tokenizer and the language model that generates the speech tokens,
# then switch the model to Unsloth's inference mode.
tokenizer = AutoTokenizer.from_pretrained("salihfurkaan/VoxPolska-Auralis")
model = AutoModelForCausalLM.from_pretrained("salihfurkaan/VoxPolska-Auralis")
FastLanguageModel.for_inference(model)
def ids_to_speech_tokens(speech_ids):
    """Render speech-token ids as their ``<|s_N|>`` token strings.

    Args:
        speech_ids: Iterable of ids; each one is interpolated into the token
            text as-is.

    Returns:
        A list with one ``"<|s_{id}|>"`` string per input id, in input order.
    """
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]
def extract_speech_ids(speech_tokens_str):
    """Parse ``<|s_N|>`` token strings back into integer speech ids.

    Tokens that are not wrapped in ``<|s_`` ... ``|>``, or whose payload is
    not an integer, are reported on stdout as ``Unexpected token: ...`` and
    skipped, so one stray token does not abort the whole decode.

    Args:
        speech_tokens_str: Iterable of token strings.

    Returns:
        A list of the integer ids of all well-formed tokens, in input order.
    """
    prefix, suffix = "<|s_", "|>"
    speech_ids = []
    for token_str in speech_tokens_str:
        if not (token_str.startswith(prefix) and token_str.endswith(suffix)):
            print(f"Unexpected token: {token_str}")
            continue
        try:
            speech_ids.append(int(token_str[len(prefix):-len(suffix)]))
        except ValueError:
            # Correct wrapper but a non-numeric payload: treat it like any
            # other unexpected token instead of crashing.
            print(f"Unexpected token: {token_str}")
    return speech_ids
# Text-to-speech inference: prompt the language model for speech tokens,
# decode them into a waveform with the codec, then save and play the audio.
with torch.inference_mode():
    with torch.amp.autocast(device, dtype=model.dtype):
        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

        # The assistant turn is left open on the speech-start marker so that
        # generation continues it with speech tokens.
        chat = [
            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
            {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"},
        ]
        input_ids = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            return_tensors="pt",
            continue_final_message=True,
        )
        # Keep the prompt on the same device as the model weights.
        input_ids = input_ids.to(model.device)

        speech_end_id = tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")

        # Generate the speech autoregressively
        outputs = model.generate(
            input_ids,
            max_length=2048,
            eos_token_id=speech_end_id,
            do_sample=True,
            # top_p is a cumulative probability and must lie in (0, 1];
            # 1.0 keeps the full distribution (no nucleus filtering).
            top_p=1.0,
            temperature=1.2,  # Controls randomness in output
        )

        # Keep only the newly generated ids. Drop the trailing end marker only
        # when it is actually there: if generation stopped at max_length, the
        # last id is a real speech token.
        generated_ids = outputs[0][input_ids.shape[1]:]
        if generated_ids.numel() > 0 and generated_ids[-1].item() == speech_end_id:
            generated_ids = generated_ids[:-1]

        # Decode every id on its own so each list entry is one token string.
        speech_token_strs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        speech_ids = extract_speech_ids(speech_token_strs)

        # Add two leading singleton dimensions before handing the ids to the
        # codec; the tensor stays on the CPU.
        speech_tokens = torch.tensor(speech_ids, dtype=torch.long).cpu().unsqueeze(0).unsqueeze(0)
        gen_wav = codec_model.decode_code(speech_tokens)

        waveform = gen_wav[0, 0, :].cpu().numpy()
        sf.write("output.wav", waveform, SAMPLE_RATE)
        display(Audio(waveform, rate=SAMPLE_RATE))
You can get your Hugging Face access token from your account settings: https://huggingface.co/settings/tokens
For questions, suggestions, and feedback, please open an issue on HuggingFace. You can also reach the author via: LinkedIn
Do not use this model for impersonation without consent, misinformation or deception (including fake news or fraudulent calls), or any illegal or harmful activity. By using this model, you agree to follow all applicable laws and ethical guidelines.
@misc{erik2025voxpolska,
title={salihfurkaan/VoxPolska-Auralis},
author={Salih Furkan Erik},
year={2025},
url={https://huggingface.co/salihfurkaan/VoxPolska-Auralis/}
}
Base model
meta-llama/Llama-3.2-1B-Instruct