Initial upload: Arabic GEC Gemma 3 1B v1
Browse files- .gitattributes +1 -0
- README.md +206 -0
- added_tokens.json +3 -0
- chat_template.jinja +5 -0
- config.json +36 -0
- generation_config.json +14 -0
- model.safetensors +3 -0
- special_tokens_map.json +33 -0
- tokenizer.json +3 -0
- tokenizer.model +3 -0
- tokenizer_config.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Gemma 3 1B Arabic Grammatical Error Correction v1
|
2 |
+
|
3 |
+
## Model Description
|
4 |
+
|
5 |
+
This model is a fine-tuned version of Google's Gemma 3 1B model, specifically trained for Arabic Grammatical Error Correction (GEC) by Alnnahwi. The model takes Arabic sentences as input and outputs their grammatically corrected versions.
|
6 |
+
|
7 |
+
**Developed by**: Bahjat Al Mostafa (Alnnahwi)
|
8 |
+
**Base Model:** google/gemma-3-1b
|
9 |
+
**Task:** Grammatical Error Correction
|
10 |
+
**Language:** Arabic
|
11 |
+
**Version:** 1.0.0
|
12 |
+
**Organization**: [Alnnahwi](https://alnnahwi.com/)
|
13 |
+
|
14 |
+
## Quick Start
|
15 |
+
|
16 |
+
### Installation
|
17 |
+
|
18 |
+
```bash
|
19 |
+
pip install transformers torch
|
20 |
+
```
|
21 |
+
|
22 |
+
### Basic Usage
|
23 |
+
|
24 |
+
```python
|
25 |
+
from transformers import pipeline, AutoTokenizer
|
26 |
+
import torch
|
27 |
+
|
28 |
+
MODEL_NAME = "alnnahwi/gemma-3-1b-arabic-gec-v1"
|
29 |
+
|
30 |
+
def extract_model_response(generated_text):
|
31 |
+
"""Extract just the model's response from the full generated text."""
|
32 |
+
# Find the position after "model" marker
|
33 |
+
model_marker = "\nmodel\n"
|
34 |
+
if model_marker in generated_text:
|
35 |
+
response_start = generated_text.find(model_marker) + len(model_marker)
|
36 |
+
return generated_text[response_start:].strip()
|
37 |
+
|
38 |
+
# Alternative format (in case formatting changes)
|
39 |
+
alt_marker = "model\n"
|
40 |
+
if alt_marker in generated_text:
|
41 |
+
response_start = generated_text.find(alt_marker) + len(alt_marker)
|
42 |
+
return generated_text[response_start:].strip()
|
43 |
+
|
44 |
+
# If markers not found, return the original text
|
45 |
+
return generated_text
|
46 |
+
|
47 |
+
# Initialize the tokenizer
|
48 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
49 |
+
# Add Gemma chat template manually
|
50 |
+
tokenizer.chat_template = """{% for message in messages %}{{'<start_of_turn>' + message['role'] + '\n' + message['content'] + '<end_of_turn>\n'}}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"""
|
51 |
+
|
52 |
+
# Device selection
|
53 |
+
if torch.backends.mps.is_available():
|
54 |
+
device = "mps"
|
55 |
+
elif torch.cuda.is_available():
|
56 |
+
device = "cuda"
|
57 |
+
else:
|
58 |
+
device = "cpu"
|
59 |
+
|
60 |
+
# Create pipeline
|
61 |
+
pipe = pipeline(
|
62 |
+
"text-generation",
|
63 |
+
model=MODEL_NAME,
|
64 |
+
tokenizer=tokenizer,
|
65 |
+
device=device,
|
66 |
+
)
|
67 |
+
|
68 |
+
def correct_arabic_text(text):
|
69 |
+
"""Correct Arabic text using the fine-tuned model."""
|
70 |
+
messages = [{"role": "user", "content": text}]
|
71 |
+
prompt = tokenizer.apply_chat_template(
|
72 |
+
messages, tokenize=False, add_generation_prompt=True
|
73 |
+
)
|
74 |
+
|
75 |
+
outputs = pipe(
|
76 |
+
prompt,
|
77 |
+
max_new_tokens=512,
|
78 |
+
do_sample=True,
|
79 |
+
temperature=0.7,
|
80 |
+
top_p=0.9,
|
81 |
+
)
|
82 |
+
|
83 |
+
full_text = outputs[0]["generated_text"]
|
84 |
+
return extract_model_response(full_text)
|
85 |
+
|
86 |
+
# Example usage with real outputs
|
87 |
+
test_inputs = [
|
88 |
+
"كيف حالكي اليوم؟",
|
89 |
+
"وجدنا سبعون حالة",
|
90 |
+
"جاء في تسعة و سبعين سورة.",
|
91 |
+
"لاكن ما رايكم",
|
92 |
+
]
|
93 |
+
|
94 |
+
for text in test_inputs:
|
95 |
+
corrected = correct_arabic_text(text)
|
96 |
+
print(f"Original: {text}")
|
97 |
+
print(f"Corrected: {corrected}")
|
98 |
+
print("-" * 50)
|
99 |
+
|
100 |
+
# Expected output (sampling is enabled above, so exact results may vary between runs):
|
101 |
+
# Original: كيف حالكي اليوم؟
|
102 |
+
# Corrected: كيف حالك اليوم؟
|
103 |
+
# --------------------------------------------------
|
104 |
+
# Original: وجدنا سبعون حالة
|
105 |
+
# Corrected: وجدنا سبعين حالة
|
106 |
+
# --------------------------------------------------
|
107 |
+
# Original: جاء في تسعة و سبعين سورة.
|
108 |
+
# Corrected: جاء في تسع وسبعين سورة.
|
109 |
+
# --------------------------------------------------
|
110 |
+
# Original: لاكن ما رايكم
|
111 |
+
# Corrected: لكن ما رأيكم؟
|
112 |
+
# --------------------------------------------------
|
113 |
+
```
|
114 |
+
|
115 |
+
### Example Corrections
|
116 |
+
|
117 |
+
| Input (Incorrect) | Output (Corrected) | Error Type |
|
118 |
+
|---|---|---|
|
119 |
+
| كيف حالكي اليوم؟ | كيف حالك اليوم؟ | Spelling (feminine pronoun suffix written with ي) |
|
120 |
+
| وجدنا سبعون حالة | وجدنا سبعين حالة | Number declension |
|
121 |
+
| جاء في تسعة و سبعين سورة. | جاء في تسع وسبعين سورة. | Number gender + spacing |
|
122 |
+
| لاكن ما رايكم | لكن ما رأيكم؟ | Spelling + punctuation |
|
123 |
+
|
124 |
+
## Model Details
|
125 |
+
|
126 |
+
### Training Data
|
127 |
+
|
128 |
+
- **Dataset**: Custom Arabic GEC dataset
|
129 |
+
- **Training Epochs**: 7
|
130 |
+
- **Base Architecture**: Gemma 3 1B parameters
|
131 |
+
|
132 |
+
### Performance
|
133 |
+
|
134 |
+
- Designed for Modern Standard Arabic (MSA).
|
135 |
+
- Handles common grammatical and orthographic errors, such as number-noun agreement, case declension, spelling, and punctuation.
|
136 |
+
|
137 |
+
### Limitations
|
138 |
+
|
139 |
+
- Primarily trained on Modern Standard Arabic
|
140 |
+
- May not handle dialectal Arabic varieties optimally
|
141 |
+
- Performance may vary with very long texts (>512 tokens)
|
142 |
+
- Context-dependent corrections may sometimes be imperfect
|
143 |
+
|
144 |
+
## Use Cases
|
145 |
+
|
146 |
+
- **Educational Tools**: Helping Arabic learners with gender agreement and number declension
|
147 |
+
- **Content Creation**: Proofreading Arabic content for grammatical accuracy
|
148 |
+
- **Text Processing**: Preprocessing Arabic text for downstream NLP tasks
|
149 |
+
- **Writing Assistance**: Supporting writers with:
|
150 |
+
- Proper number-noun agreement
|
151 |
+
- Correct case declensions
|
152 |
+
- Spelling standardization
|
153 |
+
- Punctuation normalization
|
154 |
+
- **Academic Writing**: Ensuring grammatical correctness in formal Arabic texts
|
155 |
+
|
156 |
+
## Training Details
|
157 |
+
|
158 |
+
- **Fine-tuning Framework**: Unsloth
|
159 |
+
- **Base Model**: Gemma 3 1B
|
160 |
+
- **Training Epochs**: 7
|
161 |
+
- **Optimization**: Memory-efficient fine-tuning techniques
|
162 |
+
|
163 |
+
## Citation
|
164 |
+
|
165 |
+
If you use this model in your research or applications, please cite:
|
166 |
+
|
167 |
+
```bibtex
|
168 |
+
@misc{gemma3-arabic-gec-v1,
|
169 |
+
title={Gemma 3 1B Arabic Grammatical Error Correction v1},
|
170 |
+
author={Bahjat Al Mostafa},
|
171 |
+
organization={Alnnahwi},
|
172 |
+
year={2025},
|
173 |
+
publisher={Hugging Face},
|
174 |
+
url={https://huggingface.co/alnnahwi/gemma-3-1b-arabic-gec-v1},
|
175 |
+
website={https://alnnahwi.com/}
|
176 |
+
}
|
177 |
+
```
|
178 |
+
|
179 |
+
## License
|
180 |
+
|
181 |
+
This model is released under the same license as the base Gemma model. Please refer to Google's Gemma license for usage terms and conditions.
|
182 |
+
|
183 |
+
**Important**: This model is a derivative of Google's Gemma and is subject to the Gemma Terms of Use, including the Gemma Prohibited Use Policy.
|
184 |
+
|
185 |
+
## Acknowledgments
|
186 |
+
|
187 |
+
- Built upon Google's Gemma 3 1B model
|
188 |
+
- Fine-tuned using Unsloth framework
|
189 |
+
- Trained for Arabic Grammatical Error Correction
|
190 |
+
- Developed by Bahjat Al Mostafa at Alnnahwi
|
191 |
+
- Visit [Alnnahwi](https://alnnahwi.com/) for more Arabic NLP resources
|
192 |
+
|
193 |
+
## Contact
|
194 |
+
|
195 |
+
**Author**: Bahjat Al Mostafa
|
196 |
+
**Email**: <[email protected]>
|
197 |
+
**Organization**: Alnnahwi
|
198 |
+
**Website**: [https://alnnahwi.com/](https://alnnahwi.com/)
|
199 |
+
|
200 |
+
For questions, issues, or collaboration opportunities, please open an issue in this repository or visit our website.
|
201 |
+
|
202 |
+
---
|
203 |
+
|
204 |
+
**Model Version**: v1.0.0
|
205 |
+
**Last Updated**: May 2025
|
206 |
+
**Model Size**: ~2.0 GB (≈1.86 GiB, bfloat16 weights)
|
added_tokens.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"<image_soft_token>": 262144
|
3 |
+
}
|
chat_template.jinja
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{% for message in messages %}{% if message['role'] == 'user' %}<start_of_turn>user
|
2 |
+
{{ message['content'] }}<end_of_turn>
|
3 |
+
{% elif message['role'] == 'model' %}<start_of_turn>model
|
4 |
+
{{ message['content'] }}<end_of_turn>
|
5 |
+
{% endif %}{% endfor %}
|
config.json
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"Gemma3ForCausalLM"
|
4 |
+
],
|
5 |
+
"attention_bias": false,
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"attn_logit_softcapping": null,
|
8 |
+
"bos_token_id": 2,
|
9 |
+
"cache_implementation": "hybrid",
|
10 |
+
"eos_token_id": 1,
|
11 |
+
"final_logit_softcapping": null,
|
12 |
+
"head_dim": 256,
|
13 |
+
"hidden_activation": "gelu_pytorch_tanh",
|
14 |
+
"hidden_size": 1152,
|
15 |
+
"initializer_range": 0.02,
|
16 |
+
"intermediate_size": 6912,
|
17 |
+
"max_position_embeddings": 32768,
|
18 |
+
"model_type": "gemma3_text",
|
19 |
+
"num_attention_heads": 4,
|
20 |
+
"num_hidden_layers": 26,
|
21 |
+
"num_key_value_heads": 1,
|
22 |
+
"pad_token_id": 0,
|
23 |
+
"query_pre_attn_scalar": 256,
|
24 |
+
"rms_norm_eps": 1e-06,
|
25 |
+
"rope_local_base_freq": 10000,
|
26 |
+
"rope_scaling": null,
|
27 |
+
"rope_theta": 1000000,
|
28 |
+
"sliding_window": 512,
|
29 |
+
"sliding_window_pattern": 6,
|
30 |
+
"torch_dtype": "bfloat16",
|
31 |
+
"transformers_version": "4.52.4",
|
32 |
+
"unsloth_fixed": true,
|
33 |
+
"unsloth_version": "2025.5.9",
|
34 |
+
"use_cache": true,
|
35 |
+
"vocab_size": 262144
|
36 |
+
}
|
generation_config.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token_id": 2,
|
3 |
+
"cache_implementation": "hybrid",
|
4 |
+
"do_sample": true,
|
5 |
+
"eos_token_id": [
|
6 |
+
1,
|
7 |
+
106
|
8 |
+
],
|
9 |
+
"max_length": 32768,
|
10 |
+
"pad_token_id": 0,
|
11 |
+
"top_k": 64,
|
12 |
+
"top_p": 0.95,
|
13 |
+
"transformers_version": "4.52.4"
|
14 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6175daad5548277650840a3f4d25e4967a8ca3d4b87393eefdb6ea8b8f6bc6df
|
3 |
+
size 1999811208
|
special_tokens_map.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"boi_token": "<start_of_image>",
|
3 |
+
"bos_token": {
|
4 |
+
"content": "<bos>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false
|
9 |
+
},
|
10 |
+
"eoi_token": "<end_of_image>",
|
11 |
+
"eos_token": {
|
12 |
+
"content": "<eos>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false
|
17 |
+
},
|
18 |
+
"image_token": "<image_soft_token>",
|
19 |
+
"pad_token": {
|
20 |
+
"content": "<pad>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false
|
25 |
+
},
|
26 |
+
"unk_token": {
|
27 |
+
"content": "<unk>",
|
28 |
+
"lstrip": false,
|
29 |
+
"normalized": false,
|
30 |
+
"rstrip": false,
|
31 |
+
"single_word": false
|
32 |
+
}
|
33 |
+
}
|
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a872e3bb510a751b26bd65f61aad05f948c9cf78fe4f787aebd197b393cc4081
|
3 |
+
size 33384667
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
|
3 |
+
size 4689074
|
tokenizer_config.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|