mustafoyev202 committed on
Commit
7b7a648
·
verified ·
1 Parent(s): 19cf637

Upload 4 files

Browse files
Files changed (4) hide show
  1. .gitignore +176 -0
  2. README.md +120 -14
  3. model.py +209 -0
  4. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
175
+
176
+ .env
README.md CHANGED
@@ -1,14 +1,120 @@
1
- ---
2
- title: Uzbek Stt
3
- emoji: 🏢
4
- colorFrom: pink
5
- colorTo: gray
6
- sdk: streamlit
7
- sdk_version: 1.42.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: Uzbek Speech-to-Text
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Uzbek Speech-to-Text with Grammar Correction
2
+
3
+ A powerful Speech-to-Text (STT) pipeline for the Uzbek language that combines state-of-the-art speech recognition with advanced grammar correction capabilities. Built with Wav2Vec2 and enhanced with Groq's LLM-powered grammar correction.
4
+
5
+ ## Features
6
+
7
+ - High-accuracy Uzbek speech recognition using fine-tuned Wav2Vec2 model
8
+ - Intelligent grammar correction using Groq's LLaMA 3.3 70B model
9
+ - User-friendly Streamlit web interface
10
+ - Support for multiple audio formats (WAV, MP3, M4A, OGG)
11
+ - Robust error handling and logging
12
+ - Easy-to-use API for integration into other projects
13
+
14
+ ## Installation
15
+
16
+ 1. Clone the repository:
17
+
18
+ ```bash
19
+ git clone [your-repository-url]
20
+ cd uzbek-stt
21
+ ```
22
+
23
+ 2. Install the required dependencies:
24
+
25
+ ```bash
26
+ pip install -r requirements.txt
27
+ ```
28
+
29
+ 3. Set up your environment variables:
30
+
31
+ ```bash
32
+ export GROQ_API_KEY="your-groq-api-key"
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ ### Using the Streamlit Web Interface
38
+
39
+ 1. Start the Streamlit application:
40
+
41
+ ```bash
42
+ streamlit run app.py
43
+ ```
44
+
45
+ 2. Open your web browser and navigate to the provided URL
46
+ 3. Upload an Uzbek audio file
47
+ 4. Click "Transcribe & Correct" to process the audio
48
+
49
+ ### Using the Python API
50
+
51
+ ```python
52
+ from model import UzbekSTT
53
+
54
+ # Initialize the pipeline
55
+ stt = UzbekSTT()
56
+
57
+ # Transcribe an audio file
58
+ transcription = stt.transcribe("path/to/your/audio.wav")
59
+ print(transcription)
60
+ ```
61
+
62
+ ## Requirements
63
+
64
+ - Python 3.8+
65
+ - PyTorch
66
+ - Transformers
67
+ - Librosa
68
+ - Streamlit
69
+ - LangChain
70
+ - Groq API access
71
+
72
+ ## Model Details
73
+
74
+ The pipeline uses two main components:
75
+
76
+ 1. **Speech Recognition**: Based on the `oyqiz/uzbek_stt` Wav2Vec2 model fine-tuned for Uzbek
77
+ 2. **Grammar Correction**: Powered by Groq's LLaMA 3.3 70B model with Uzbek language expertise
78
+
79
+ ## Environment Variables
80
+
81
+ Required environment variables:
82
+
83
+ - `GROQ_API_KEY`: Your Groq API key for accessing the LLM service
84
+
85
+ ## Error Handling
86
+
87
+ The pipeline includes comprehensive error handling for:
88
+
89
+ - Missing or invalid audio files
90
+ - Model loading failures
91
+ - Transcription errors
92
+ - API communication issues
93
+ - Invalid environment configurations
94
+
95
+ ## Logging
96
+
97
+ Logging is configured to track:
98
+
99
+ - Model initialization
100
+ - Audio processing steps
101
+ - Grammar correction progress
102
+ - Error messages and stack traces
103
+
104
+ ## Contributing
105
+
106
+ 1. Fork the repository
107
+ 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
108
+ 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
109
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
110
+ 5. Open a Pull Request
111
+
112
+ ## License
113
+
114
+ This project is released under the MIT License.
115
+
116
+ ## Acknowledgments
117
+
118
+ - Thanks to the Wav2Vec2 team for the base model architecture
119
+ - Groq for providing the LLM API access
120
+ - Contributors to the Uzbek language model training data
model.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import logging
4
+ import librosa
5
+ from typing import Union, BinaryIO
6
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
7
+ from langchain_groq import ChatGroq
8
+ import streamlit as st
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
+
13
+
14
+ # Configure logging
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class UzbekSTT:
    """Uzbek speech-to-text pipeline with LLM-based grammar correction.

    Audio is transcribed with a Wav2Vec2 CTC model; the raw transcript is then
    sent to a Groq-hosted chat model that returns a grammar-corrected version.
    """

    # Hugging Face model id used for both the processor and the CTC model.
    base_model_name = "oyqiz/uzbek_stt"

    def __init__(self):
        """Create the pipeline and eagerly load every model it needs.

        Raises:
            ValueError: If the ``GROQ_API_KEY`` environment variable is unset.
            Exception: Any error raised while loading the models is re-raised.
        """
        self.processor = None
        self.model = None
        self.groq_client = None
        self.load_models()

    def load_models(self) -> None:
        """Load the Wav2Vec2 processor/model and create the Groq chat client.

        Raises:
            ValueError: If ``GROQ_API_KEY`` is missing or empty.
            Exception: Any loading error is logged with its traceback and
                re-raised unchanged.
        """
        try:
            # Fail fast: check the cheap precondition before loading the
            # speech model weights.
            groq_api_key = os.getenv("GROQ_API_KEY")
            if not groq_api_key:
                raise ValueError("GROQ_API_KEY environment variable is required")

            logger.info("Loading base Uzbek STT model: %s", self.base_model_name)
            self.processor = Wav2Vec2Processor.from_pretrained(self.base_model_name)
            self.model = Wav2Vec2ForCTC.from_pretrained(self.base_model_name)

            # NOTE(review): the key is not passed explicitly; ChatGroq is
            # presumably reading GROQ_API_KEY from the environment - confirm.
            self.groq_client = ChatGroq(
                model="llama-3.3-70b-versatile", temperature=0.3
            )
            logger.info("Models loaded successfully")
        except Exception as e:
            # logger.exception records the traceback as well as the message.
            logger.exception("Failed to initialize models: %s", e)
            raise

    def correct_grammar(self, text: str) -> str:
        """Return a grammar-corrected version of an Uzbek text.

        Correction is best-effort: the input is returned unchanged when it is
        empty or whitespace-only (no request is made), when the request fails,
        or when the model replies with nothing but whitespace.

        Args:
            text: Raw Uzbek text, typically a fresh transcription.

        Returns:
            The corrected text with surrounding whitespace stripped, or
            ``text`` itself in the fallback cases described above.
        """
        # Nothing to correct: skip the API call instead of sending an empty
        # prompt and getting unrelated text back.
        if not text or not text.strip():
            return text

        try:
            messages = [
                (
                    "system",
                    # Roughly: "You are an expert in Uzbek. Your task is to
                    # correct the grammar of the given Uzbek text. Give no
                    # comments, translation or extra information. Return only
                    # the corrected Uzbek text."
                    "Siz o'zbek tilida mutaxassissiz. Sizning vazifangiz berilgan o'zbek matnining grammatikasini to'g'rilash. Hech qanday izoh, tarjima yoki qo'shimcha ma'lumot bermang. Faqat to'g'rilangan o'zbek matnini qaytaring.",
                ),
                ("human", text),
            ]
            response = self.groq_client.invoke(messages)
            raw = response.content if hasattr(response, "content") else str(response)
            corrected = raw.strip()
        except Exception as e:
            logger.exception("Grammar correction failed: %s", e)
            return text

        # An empty reply must not wipe out the transcription.
        return corrected or text

    def transcribe(self, audio_file: Union[str, BinaryIO]) -> str:
        """Transcribe Uzbek speech to text and apply grammar correction.

        Args:
            audio_file: Path to an audio file, or a file-like object. It is
                handed to ``librosa.load`` and resampled to 16 kHz.

        Returns:
            The transcribed and grammar-corrected text.

        Raises:
            FileNotFoundError: If ``audio_file`` is a path that does not exist.
            Exception: Any decoding or inference error is logged with its
                traceback and re-raised unchanged.
        """
        try:
            # Only string paths can be checked up front; file-like objects
            # are passed straight through.
            if isinstance(audio_file, str) and not os.path.exists(audio_file):
                raise FileNotFoundError(f"Audio file not found: {audio_file}")

            logger.info("Processing audio file...")
            audio, _ = librosa.load(audio_file, sr=16000)
            input_values = self.processor(
                audio, return_tensors="pt", padding="longest", sampling_rate=16000
            ).input_values

            # Inference only: no gradients are needed.
            with torch.no_grad():
                logits = self.model(input_values).logits
                predicted_ids = torch.argmax(logits, dim=-1)

            # A single clip was passed, so take the first decoded item.
            transcription = self.processor.batch_decode(predicted_ids)[0]

            logger.info("Applying grammar correction...")
            return self.correct_grammar(transcription)
        except Exception as e:
            logger.exception("Transcription failed: %s", e)
            raise

    @classmethod
    def from_pretrained(cls, model_name: str = "mustafoyev202/uzbek_stt") -> "UzbekSTT":
        """Factory method mirroring the 🤗 Transformers ``from_pretrained`` API.

        Args:
            model_name: Accepted for API compatibility only. The pipeline
                always loads ``base_model_name``; any value other than the
                default just triggers a warning.

        Returns:
            A fully initialised ``UzbekSTT`` instance.
        """
        if model_name != "mustafoyev202/uzbek_stt":
            logger.warning(
                "Using base model %s regardless of specified model name",
                cls.base_model_name,
            )
        return cls()
117
+
118
+
119
+ # ----------------- Streamlit App ----------------- #
120
+
121
+
122
@st.cache_resource(show_spinner=False)
def _load_pipeline() -> "UzbekSTT":
    """Build the UzbekSTT pipeline once and reuse it on later calls.

    Avoids reloading the speech model on every button click.
    NOTE(review): the cached instance is shared by whoever calls this helper;
    confirm the pipeline is safe to share across concurrent sessions.
    """
    return UzbekSTT()


def main():
    """Render the Streamlit app: upload audio, transcribe it, show the text.

    The uploaded audio is written to a uniquely named temporary file only
    when the user clicks "Transcribe", and that file is always removed
    afterwards. Any pipeline error is shown in the page instead of raised.
    """
    # Local import: only this UI entry point needs tempfile.
    import tempfile

    # Set Streamlit page configuration
    st.set_page_config(
        page_title="Uzbek STT with Grammar Correction",
        page_icon="🗣️",
        layout="centered",
        initial_sidebar_state="auto",
    )

    # Inject custom CSS for the page styling
    st.markdown(
        """
        <style>
        body {
            background-color: #f0f2f6;
        }
        .main {
            font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
        }
        .stButton>button {
            background-color: #4CAF50;
            color: white;
            padding: 10px 24px;
            border: none;
            border-radius: 4px;
            cursor: pointer;
            font-size: 16px;
        }
        .stButton>button:hover {
            background-color: #45a049;
        }
        .header {
            text-align: center;
            color: #2c3e50;
            margin-bottom: 30px;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    # App header
    st.markdown(
        "<h1 class='header'>🗣️ Uzbek Speech-to-Text & Grammar Correction</h1>",
        unsafe_allow_html=True,
    )
    st.markdown(
        """
        Welcome to the **Uzbek STT** application, where cutting-edge technology meets
        linguistic precision. Upload an Uzbek audio file, and let our model transcribe and
        correct your text in real time!
        """
    )

    # File uploader for audio files
    uploaded_file = st.file_uploader(
        "Upload your Uzbek audio file", type=["wav", "mp3", "m4a", "ogg"]
    )
    if uploaded_file is None:
        return

    # Play back the upload with its real MIME type; the uploader accepts more
    # than WAV, so "audio/wav" is only a fallback.
    st.audio(uploaded_file, format=uploaded_file.type or "audio/wav")

    if not st.button("Transcribe"):
        return

    # Keep the original extension on the temp file rather than always ".wav".
    suffix = os.path.splitext(uploaded_file.name)[1] or ".wav"
    temp_audio_path = None

    with st.spinner("Processing your audio file..."):
        try:
            # A unique temp file per request avoids clashes between sessions.
            # delete=False lets the file be closed first and then reopened by
            # path during transcription.
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                temp_audio_path = tmp.name
                # getvalue() returns the whole upload whatever the current
                # read position is.
                tmp.write(uploaded_file.getvalue())

            uzbek_stt = _load_pipeline()
            # Transcribe and correct the audio
            transcription = uzbek_stt.transcribe(temp_audio_path)
            st.success("Transcription complete!")
            st.markdown("### Transcribed Text:")
            st.write(transcription)
        except Exception as e:
            # Top-level UI boundary: report the failure in the page.
            st.error(f"An error occurred: {str(e)}")
        finally:
            # Clean up the temporary audio file
            if temp_audio_path and os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)
206
+
207
+
208
+ if __name__ == "__main__":
209
+ main()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ huggingface_hub
2
+ torch
3
+ transformers
4
+ librosa
5
+ langchain_groq
6
+ python-dotenv