#!/usr/bin/env python3
"""
GPT-OSS Model Push Script
Specialized script for pushing GPT-OSS models to Hugging Face Hub
Handles LoRA weight merging and model card generation
"""

import os
import sys
import argparse
import shutil
import tempfile
from datetime import datetime

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

def merge_lora_weights(checkpoint_path, base_model_name, output_path):
    """Merge LoRA weights with base model for inference"""
    
    print(f"Loading base model: {base_model_name}")
    
    # Load base model; device_map="auto" handles placement, so no explicit .cuda()
    model_kwargs = {
        "attn_implementation": "eager",
        "torch_dtype": "auto",
        "use_cache": True,
        "device_map": "auto",
    }
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name, **model_kwargs)
    
    print(f"Loading LoRA weights from: {checkpoint_path}")
    
    # Load and merge LoRA weights
    model = PeftModel.from_pretrained(base_model, checkpoint_path)
    model = model.merge_and_unload()
    
    print(f"Saving merged model to: {output_path}")
    model.save_pretrained(output_path)
    
    # Save tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.save_pretrained(output_path)
    
    return model, tokenizer
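
# Optional post-merge sanity check: a minimal generation sketch. It is not called
# by the push pipeline; the prompt and decoding settings are illustrative.
def smoke_test_merged_model(model, tokenizer):
    """Run one short generation to verify the merged model loads and decodes."""
    messages = [
        {"role": "system", "content": "reasoning language: English"},
        {"role": "user", "content": "What is the capital of Australia?"},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    output_ids = model.generate(input_ids, max_new_tokens=64)
    return tokenizer.batch_decode(output_ids)[0]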

def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo,
                              author_name, model_description, training_config_type=None,
                              dataset_name=None, batch_size=None, learning_rate=None,
                              max_epochs=None, max_seq_length=None, trainer_type=None):
    """Create a comprehensive model card for GPT-OSS models using generate_model_card.py"""
    
    try:
        # Make the model card generator importable from this script's directory
        sys.path.append(os.path.dirname(os.path.abspath(__file__)))
        from generate_model_card import ModelCardGenerator, create_default_variables
        
        # Create generator
        generator = ModelCardGenerator()
        
        # Create variables for the model card
        variables = create_default_variables()
        
        # Update with GPT-OSS specific values
        variables.update({
            "repo_name": model_name,
            "model_name": model_name.split('/')[-1],
            "experiment_name": experiment_name or "gpt_oss_finetune",
            "dataset_repo": dataset_repo,
            "author_name": author_name or "GPT-OSS Fine-tuner",
            "model_description": model_description or "A fine-tuned version of OpenAI's GPT-OSS-20B model for multilingual reasoning tasks.",
            "training_config_type": training_config_type or "GPT-OSS Configuration",
            "base_model": "openai/gpt-oss-20b",
            "dataset_name": dataset_name or "HuggingFaceH4/Multilingual-Thinking",
            "trainer_type": trainer_type or "SFTTrainer",
            "batch_size": str(batch_size) if batch_size else "4",
            "learning_rate": str(learning_rate) if learning_rate else "2e-4",
            "max_epochs": str(max_epochs) if max_epochs else "1",
            "max_seq_length": str(max_seq_length) if max_seq_length else "2048",
            "hardware_info": "GPU (H100/A100)",
            "trackio_url": trackio_url or "N/A",
            "training_loss": "N/A",
            "validation_loss": "N/A",
            "perplexity": "N/A",
            "quantized_models": False
        })
        
        # Generate the model card
        model_card_content = generator.generate_model_card(variables)
        
        print("✅ Model card generated using generate_model_card.py")
        return model_card_content
        
    except Exception as e:
        print(f"❌ Failed to generate model card with generator: {e}")
        print("🔄 Falling back to original GPT-OSS model card")
        return _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description)

def _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
    """Create the original GPT-OSS model card as fallback"""
    
    card_content = f"""---
language:
- en
- es
- fr
- it
- de
- zh
- hi
- ja
- ko
- ar
license: mit
tags:
- gpt-oss
- multilingual
- reasoning
- chain-of-thought
- fine-tuned
---

# {model_name}

## Model Description

{model_description}

This model is a fine-tuned version of OpenAI's GPT-OSS-20B model, optimized for multilingual reasoning tasks. It has been trained on the Multilingual-Thinking dataset to generate chain-of-thought reasoning in multiple languages.

## Training Details

- **Base Model**: openai/gpt-oss-20b
- **Training Dataset**: HuggingFaceH4/Multilingual-Thinking
- **Training Method**: LoRA (Low-Rank Adaptation)
- **Quantization**: MXFP4
- **Experiment**: {experiment_name}
- **Monitoring**: {trackio_url}

## Usage

### Basic Usage

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("{model_name}")
model = AutoModelForCausalLM.from_pretrained("{model_name}")

# Example: Reasoning in Spanish
messages = [
    {{"role": "system", "content": "reasoning language: Spanish"}},
    {{"role": "user", "content": "What is the capital of Australia?"}}
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=512)
response = tokenizer.batch_decode(output_ids)[0]
print(response)
```

### Multilingual Reasoning

The model supports reasoning in multiple languages:

- English
- Spanish (Español)
- French (Français)
- Italian (Italiano)
- German (Deutsch)
- Chinese (中文)
- Hindi (हिन्दी)
- Japanese (日本語)
- Korean (한국어)
- Arabic (العربية)

### System Prompt Format

To control the reasoning language, use the system prompt:

```
reasoning language: [LANGUAGE]
```

Example:
```
reasoning language: German
```

## Training Configuration

- **LoRA Rank**: 8
- **LoRA Alpha**: 16
- **Target Modules**: all-linear
- **Learning Rate**: 2e-4
- **Batch Size**: 4
- **Sequence Length**: 2048
- **Mixed Precision**: bf16
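
The adapter settings above correspond roughly to the following PEFT configuration
(a sketch for reference; the exact values used in training may differ):

```python
from peft import LoraConfig

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)
```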

## Dataset Information

The model was trained on the Multilingual-Thinking dataset, which contains 1,000 examples of chain-of-thought reasoning translated into multiple languages.

## Limitations

- The model is designed for reasoning tasks and may not perform optimally on other tasks
- Reasoning quality may vary across languages
- The model inherits limitations from the base GPT-OSS-20B model

## Citation

If you use this model in your research, please cite:

```bibtex
@misc{{{model_name.replace("/", "_").replace("-", "_")},
  author = {{{author_name}}},
  title = {{{model_name}}},
  year = {{{datetime.now().year}}},
  publisher = {{Hugging Face}},
  journal = {{Hugging Face repository}},
  howpublished = {{\\url{{https://huggingface.co/{model_name}}}}}
}}
```

## License

This model is licensed under the MIT License.

## Training Resources

- **Training Dataset**: https://huggingface.co/datasets/{dataset_repo}
- **Training Monitoring**: {trackio_url}
- **Base Model**: https://huggingface.co/openai/gpt-oss-20b

## Model Information

- **Architecture**: GPT-OSS-20B with LoRA adapters
- **Parameters**: 20B base + LoRA adapters
- **Context Length**: 2048 tokens
- **Languages**: 10+ languages supported
- **Task**: Multilingual reasoning and chain-of-thought generation
"""
    
    return card_content

def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name,
                       dataset_repo, author_name, model_description, training_config_type=None,
                       model_name=None, dataset_name=None, batch_size=None, learning_rate=None,
                       max_epochs=None, max_seq_length=None, trainer_type=None):
    """Push GPT-OSS model to Hugging Face Hub"""
    
    print("=== GPT-OSS Model Push Pipeline ===")
    print(f"Checkpoint: {checkpoint_path}")
    print(f"Repository: {repo_name}")
    print(f"Experiment: {experiment_name}")
    print(f"Author: {author_name}")
    
    # Validate checkpoint path
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint path not found: {checkpoint_path}")
    
    # Create a temporary directory for the merged model
    temp_output = os.path.join(
        tempfile.gettempdir(),
        f"gpt_oss_merged_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    )
    os.makedirs(temp_output, exist_ok=True)
    
    try:
        # Merge LoRA weights with base model
        print("Merging LoRA weights with base model...")
        model, tokenizer = merge_lora_weights(
            checkpoint_path=checkpoint_path,
            base_model_name=model_name or "openai/gpt-oss-20b",
            output_path=temp_output
        )
        
        # Create model card
        print("Creating model card...")
        model_card_content = create_gpt_oss_model_card(
            model_name=repo_name,
            experiment_name=experiment_name,
            trackio_url=trackio_url,
            dataset_repo=dataset_repo,
            author_name=author_name,
            model_description=model_description,
            training_config_type=training_config_type,
            dataset_name=dataset_name,
            batch_size=batch_size,
            learning_rate=learning_rate,
            max_epochs=max_epochs,
            max_seq_length=max_seq_length,
            trainer_type=trainer_type
        )
        
        # Save model card
        model_card_path = os.path.join(temp_output, "README.md")
        with open(model_card_path, "w", encoding="utf-8") as f:
            f.write(model_card_content)
        
        # Push to Hugging Face Hub
        print(f"Pushing model to: {repo_name}")
        
        # Authenticate with the provided token (env var kept for downstream calls)
        os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token
        
        # Push using huggingface_hub
        from huggingface_hub import HfApi
        api = HfApi(token=hf_token)
        
        # Create repository if it doesn't exist
        try:
            api.create_repo(repo_name, private=False, exist_ok=True)
        except Exception as e:
            print(f"Warning: Could not create repository: {e}")
        
        # Upload files
        print("Uploading model files...")
        api.upload_folder(
            folder_path=temp_output,
            repo_id=repo_name,
            repo_type="model"
        )
        
        print("✅ GPT-OSS model pushed successfully!")
        print(f"Model URL: https://huggingface.co/{repo_name}")
        
        # Clean up
        import shutil
        shutil.rmtree(temp_output)
        
        return True
        
    except Exception as e:
        print(f"❌ Error pushing GPT-OSS model: {e}")
        
        # Clean up on error
        if os.path.exists(temp_output):
            import shutil
            shutil.rmtree(temp_output)
        
        return False
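
# Programmatic use (a minimal sketch; repo id, paths, and description are placeholders):
#   success = push_gpt_oss_model(
#       checkpoint_path="outputs/checkpoint-1000",
#       repo_name="your-username/gpt-oss-20b-multilingual",
#       hf_token=os.environ["HF_TOKEN"],
#       trackio_url=None,
#       experiment_name="gpt_oss_finetune",
#       dataset_repo="HuggingFaceH4/Multilingual-Thinking",
#       author_name="Your Name",
#       model_description="A fine-tuned GPT-OSS-20B for multilingual reasoning.",
#   )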

def main():
    parser = argparse.ArgumentParser(description="Push GPT-OSS model to Hugging Face Hub")
    parser.add_argument("checkpoint_path", help="Path to model checkpoint")
    parser.add_argument("repo_name", help="Hugging Face repository name")
    parser.add_argument("--token", required=True, help="Hugging Face token")
    parser.add_argument("--trackio-url", help="Trackio URL for model card")
    parser.add_argument("--experiment-name", help="Experiment name")
    parser.add_argument("--dataset-repo", help="Dataset repository")
    parser.add_argument("--author-name", help="Author name")
    parser.add_argument("--model-description", help="Model description")
    parser.add_argument("--training-config-type", help="Training configuration type")
    parser.add_argument("--model-name", help="Base model name")
    parser.add_argument("--dataset-name", help="Dataset name")
    parser.add_argument("--batch-size", help="Batch size")
    parser.add_argument("--learning-rate", help="Learning rate")
    parser.add_argument("--max-epochs", help="Maximum epochs")
    parser.add_argument("--max-seq-length", help="Maximum sequence length")
    parser.add_argument("--trainer-type", help="Trainer type")
    
    args = parser.parse_args()
    
    # Set defaults
    experiment_name = args.experiment_name or "gpt_oss_finetune"
    dataset_repo = args.dataset_repo or "HuggingFaceH4/Multilingual-Thinking"
    author_name = args.author_name or "GPT-OSS Fine-tuner"
    model_description = args.model_description or "A fine-tuned version of OpenAI's GPT-OSS-20B model for multilingual reasoning tasks."
    
    success = push_gpt_oss_model(
        checkpoint_path=args.checkpoint_path,
        repo_name=args.repo_name,
        hf_token=args.token,
        trackio_url=args.trackio_url,
        experiment_name=experiment_name,
        dataset_repo=dataset_repo,
        author_name=author_name,
        model_description=model_description,
        training_config_type=args.training_config_type,
        model_name=args.model_name,
        dataset_name=args.dataset_name,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        max_epochs=args.max_epochs,
        max_seq_length=args.max_seq_length,
        trainer_type=args.trainer_type
    )
    
    sys.exit(0 if success else 1)

if __name__ == "__main__":
    main()