Model save
- .gitattributes +1 -0
- README.md +58 -0
- added_tokens.json +3 -0
- chat_template.jinja +47 -0
- config.json +54 -0
- edl_lecture_01.ipynb +0 -0
- final_raw_result.csv +0 -0
- gemma3.ipynb +0 -0
- generation_config.json +11 -0
- gpt-oss-20b.ipynb +0 -0
- gpt-oss-base.ipynb +0 -0
- gpt_oss_submission.csv +0 -0
- gpu-101.ipynb +1 -0
- kimi_submission.csv +0 -0
- kimi_submission_k2.csv +0 -0
- model.safetensors +3 -0
- pytorch_cuda_102.ipynb +1 -0
- qwen-3.ipynb +0 -0
- qwen3-4b.ipynb +1 -0
- runs/Aug15_06-27-56_3ddbd7fd5744/events.out.tfevents.1755239294.3ddbd7fd5744.339.0 +3 -0
- special_tokens_map.json +33 -0
- token_analysis.ipynb +1 -0
- tokenizer.json +3 -0
- tokenizer.model +3 -0
- tokenizer_config.json +0 -0
- training_args.bin +3 -0
- vlmu_dialog_v1.zip +3 -0
- vlmu_drop_v1.zip +3 -0
- vlmu_mqa_v1.5.zip +3 -0
- vlmu_squad_v1.zip +3 -0
- vmlu.ipynb +1 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,58 @@
---
base_model: google/gemma-3-270m-it
library_name: transformers
model_name: llm
tags:
- generated_from_trainer
- trl
- sft
licence: license
---

# Model Card for llm

This model is a fine-tuned version of [google/gemma-3-270m-it](https://huggingface.co/google/gemma-3-270m-it).
It has been trained using [TRL](https://github.com/huggingface/trl).

## Quick start

```python
from transformers import pipeline

question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
generator = pipeline("text-generation", model="laampt/llm", device="cuda")
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
print(output["generated_text"])
```

## Training procedure

This model was trained with SFT.

### Framework versions

- TRL: 0.21.0
- Transformers: 4.55.0
- Pytorch: 2.6.0+cu124
- Datasets: 4.0.0
- Tokenizers: 0.21.4

## Citations

Cite TRL as:

```bibtex
@misc{vonwerra2022trl,
    title = {{TRL: Transformer Reinforcement Learning}},
    author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
    year = 2020,
    journal = {GitHub repository},
    publisher = {GitHub},
    howpublished = {\url{https://github.com/huggingface/trl}}
}
```
added_tokens.json
ADDED
@@ -0,0 +1,3 @@
{
  "<image_soft_token>": 262144
}
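For reference, the extra token registered above can be checked through the tokenizer. A minimal sketch, assuming the repo id `laampt/llm` used in the README's quick start:

```python
# Minimal check of the token added by added_tokens.json.
# Assumes the repo id "laampt/llm" from the README's Quick start.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("laampt/llm")
print(tok.convert_tokens_to_ids("<image_soft_token>"))  # expected: 262144
```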
chat_template.jinja
ADDED
@@ -0,0 +1,47 @@
{{ bos_token }}
{%- if messages[0]['role'] == 'system' -%}
    {%- if messages[0]['content'] is string -%}
        {%- set first_user_prefix = messages[0]['content'] + '

' -%}
    {%- else -%}
        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '

' -%}
    {%- endif -%}
    {%- set loop_messages = messages[1:] -%}
{%- else -%}
    {%- set first_user_prefix = "" -%}
    {%- set loop_messages = messages -%}
{%- endif -%}
{%- for message in loop_messages -%}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
    {%- endif -%}
    {%- if (message['role'] == 'assistant') -%}
        {%- set role = "model" -%}
    {%- else -%}
        {%- set role = message['role'] -%}
    {%- endif -%}
    {{ '<start_of_turn>' + role + '
' + (first_user_prefix if loop.first else "") }}
    {%- if message['content'] is string -%}
        {{ message['content'] | trim }}
    {%- elif message['content'] is iterable -%}
        {%- for item in message['content'] -%}
            {%- if item['type'] == 'image' -%}
                {{ '<start_of_image>' }}
            {%- elif item['type'] == 'text' -%}
                {{ item['text'] | trim }}
            {%- endif -%}
        {%- endfor -%}
    {%- else -%}
        {{ raise_exception("Invalid content type") }}
    {%- endif -%}
    {{ '<end_of_turn>
' }}
{%- endfor -%}
{%- if add_generation_prompt -%}
    {{'<start_of_turn>model
'}}
{%- endif -%}
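The template above is what `tokenizer.apply_chat_template` renders: system text is folded into the first user turn, the assistant role is mapped to `model`, and every turn is wrapped in `<start_of_turn>`/`<end_of_turn>`. A minimal sketch of rendering it without running the model, assuming the repo id `laampt/llm` from the README:

```python
# Render the chat template above without generating.
# Assumes the tokenizer files in this repo, loaded via the README's id "laampt/llm".
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("laampt/llm")
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# Expected shape per the template:
# <bos><start_of_turn>user
# You are a concise assistant.
#
# Hello!<end_of_turn>
# <start_of_turn>model
```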
config.json
ADDED
@@ -0,0 +1,54 @@
{
  "_sliding_window_pattern": 6,
  "architectures": [
    "Gemma3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": null,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "final_logit_softcapping": null,
  "head_dim": 256,
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 640,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_types": [
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention"
  ],
  "max_position_embeddings": 32768,
  "model_type": "gemma3_text",
  "num_attention_heads": 4,
  "num_hidden_layers": 18,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_local_base_freq": 10000.0,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": 512,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.55.0",
  "use_bidirectional_attention": false,
  "use_cache": true,
  "vocab_size": 262144
}
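Reading this config in code makes the architecture easier to see at a glance: an 18-layer, 640-dim Gemma 3 text model with one KV head (multi-query attention) and mostly 512-token sliding-window layers interleaved with a few global ones. A minimal sketch, assuming the `laampt/llm` repo id and a transformers version that knows `gemma3_text` (4.55.0 is recorded above):

```python
# Inspect the architecture described by config.json.
# Assumes the repo id "laampt/llm" from the README and transformers >= 4.55.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("laampt/llm")
print(cfg.model_type)                                    # gemma3_text
print(cfg.hidden_size, cfg.num_hidden_layers)            # 640 18
print(cfg.num_attention_heads, cfg.num_key_value_heads)  # 4 1  (multi-query attention)
print(cfg.layer_types.count("full_attention"))           # 3 global layers; the rest use a 512-token sliding window
```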
edl_lecture_01.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
final_raw_result.csv
ADDED
The diff for this file is too large to render.
See raw diff
gemma3.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
generation_config.json
ADDED
@@ -0,0 +1,11 @@
{
  "cache_implementation": "hybrid",
  "do_sample": true,
  "eos_token_id": [
    1,
    106
  ],
  "top_k": 64,
  "top_p": 0.95,
  "transformers_version": "4.55.0"
}
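These defaults (sampling on, `top_k=64`, `top_p=0.95`, and stop ids 1 and 106) are picked up automatically by `generate()` when the model is loaded from this repo. A minimal end-to-end sketch, assuming the same `laampt/llm` repo id and a CUDA device:

```python
# Generation using the defaults recorded in generation_config.json.
# Assumes the repo id "laampt/llm" from the README and a CUDA device.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("laampt/llm")
model = AutoModelForCausalLM.from_pretrained(
    "laampt/llm", torch_dtype=torch.bfloat16, device_map="auto"
)

input_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Write a haiku about GPUs."}],
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# do_sample=True, top_k=64, top_p=0.95 come from generation_config.json;
# generation stops on eos_token_id 1 or 106 (<end_of_turn> in the Gemma vocabulary).
output_ids = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True))
```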
gpt-oss-20b.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
gpt-oss-base.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
gpt_oss_submission.csv
ADDED
The diff for this file is too large to render.
See raw diff
gpu-101.ipynb
ADDED
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"gpuType":"T4","authorship_tag":"ABX9TyNmLoeMm2V9yoyQDaneLvbr"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"B3ulPFQCjm-P","executionInfo":{"status":"ok","timestamp":1755174590649,"user_tz":-420,"elapsed":149,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"c9406e64-d9ff-4a06-d16d-f63a8c3b1d6e"},"outputs":[{"output_type":"stream","name":"stdout","text":["Thu Aug 14 12:29:50 2025 \n","+-----------------------------------------------------------------------------------------+\n","| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n","|-----------------------------------------+------------------------+----------------------+\n","| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n","| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n","| | | MIG M. |\n","|=========================================+========================+======================|\n","| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n","| N/A 35C P8 9W / 70W | 0MiB / 15360MiB | 0% Default |\n","| | | N/A |\n","+-----------------------------------------+------------------------+----------------------+\n"," \n","+-----------------------------------------------------------------------------------------+\n","| Processes: |\n","| GPU GI CI PID Type Process name GPU Memory |\n","| ID ID Usage |\n","|=========================================================================================|\n","| No running processes found |\n","+-----------------------------------------------------------------------------------------+\n"]}],"source":["!nvidia-smi"]},{"cell_type":"code","source":["import torch"],"metadata":{"id":"e5Wc9ksVjroy","executionInfo":{"status":"ok","timestamp":1755174600684,"user_tz":-420,"elapsed":5359,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["class PyTorchCUDA101:\n"," def __init__(self) -> None:\n"," if not torch.cuda.is_available():\n"," raise RuntimeError(\"CUDA not available\")\n"," self.device = torch.device(\"cuda\")\n","\n"," print(f\"Using GPU: {torch.cuda.get_device_name()}\")\n"," print(f\"CUDA version: {torch.version.cuda}\")\n"," print(f\"PyTorch version: {torch.__version__}\")\n","\n"," print(\"\\n\" + \"=\"*60)\n"," print(\"GPU MEMORY BASELINE: Understanding CUDA Overhead\")\n"," print(\"=\"* 60)\n","\n"," torch.cuda.empty_cache()\n"," torch.cuda.reset_peak_memory_stats()\n"," baseline_mem = torch.cuda.memory_allocated() / 1024**2\n","\n"," min_tensor = torch.ones((1,1), device=\"cuda\")\n"," cuda_overhead = torch.cuda.memory_allocated() / 1024**2\n","\n"," print(f\"📊 Memory before CUDA init: {baseline_mem:.1f} MB\")\n"," print(f\"📊 Memory after CUDA init: {cuda_overhead:.1f} MB\")\n"," print(f\"🎯 CUDA kernel overhead: {cuda_overhead - baseline_mem:.1f} MB\")\n","\n"," print(f\"\\n💡 Key Reality Check: CUDA kernels consume 1-2 GB regardless of your model size!\")\n"," print(f\" This overhead is constant and unavoidable for any GPU computation.\")\n"," print(f\" Additional memory used for buffers, intermediate results, and fragmentation\")\n"," print(f\" makes precise memory calculations challenging - focus on relative 
improvements.\")"],"metadata":{"id":"9scPSfzCjulL","executionInfo":{"status":"ok","timestamp":1755175093077,"user_tz":-420,"elapsed":4,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}}},"execution_count":9,"outputs":[]},{"cell_type":"code","source":["p = PyTorchCUDA101()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cRZ-5LW5komv","executionInfo":{"status":"ok","timestamp":1755175093836,"user_tz":-420,"elapsed":14,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"01a5bd7c-3739-4eec-9247-9b98bdb82bba"},"execution_count":10,"outputs":[{"output_type":"stream","name":"stdout","text":["Using GPU: Tesla T4\n","CUDA version: 12.4\n","PyTorch version: 2.6.0+cu124\n","\n","============================================================\n","GPU MEMORY BASELINE: Understanding CUDA Overhead\n","============================================================\n","📊 Memory before CUDA init: 0.0 MB\n","📊 Memory after CUDA init: 0.0 MB\n","🎯 CUDA kernel overhead: 0.0 MB\n","\n","💡 Key Reality Check: CUDA kernels consume 1-2 GB regardless of your model size!\n"," This overhead is constant and unavoidable for any GPU computation.\n"," Additional memory used for buffers, intermediate results, and fragmentation\n"," makes precise memory calculations challenging - focus on relative improvements.\n"]}]},{"cell_type":"code","source":["!nvidia-smi"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"oTZ0xVRnkqr-","executionInfo":{"status":"ok","timestamp":1755175117694,"user_tz":-420,"elapsed":106,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"9bae77de-5e0a-4f66-8a70-d51bc04e00fa"},"execution_count":11,"outputs":[{"output_type":"stream","name":"stdout","text":["Thu Aug 14 12:38:37 2025 \n","+-----------------------------------------------------------------------------------------+\n","| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n","|-----------------------------------------+------------------------+----------------------+\n","| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n","| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n","| | | MIG M. |\n","|=========================================+========================+======================|\n","| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n","| N/A 44C P0 25W / 70W | 120MiB / 15360MiB | 0% Default |\n","| | | N/A |\n","+-----------------------------------------+------------------------+----------------------+\n"," \n","+-----------------------------------------------------------------------------------------+\n","| Processes: |\n","| GPU GI CI PID Type Process name GPU Memory |\n","| ID ID Usage |\n","|=========================================================================================|\n","+-----------------------------------------------------------------------------------------+\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"kDRiSDIvluHU"},"execution_count":null,"outputs":[]}]}
kimi_submission.csv
ADDED
The diff for this file is too large to render.
See raw diff
kimi_submission_k2.csv
ADDED
The diff for this file is too large to render.
See raw diff
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c9af5f462bef0c26aeea2d004217130e01641a393e4c7a8f6e2566f1c449c79
size 536223056
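As a sanity check on the pointer above: config.json records `torch_dtype: bfloat16`, so dividing the file size by two bytes per parameter gives a rough parameter count, consistent with the 270M-class base model (safetensors adds a small header, so this is approximate):

```python
# Rough parameter count implied by the LFS pointer (bfloat16 = 2 bytes/param).
size_bytes = 536_223_056          # "size" field from the pointer above
approx_params = size_bytes / 2    # ignores the small safetensors header overhead
print(f"~{approx_params / 1e6:.0f}M parameters")  # ~268M, consistent with the gemma-3-270m base
```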
pytorch_cuda_102.ipynb
ADDED
@@ -0,0 +1 @@
{"cells":[{"cell_type":"markdown","metadata":{"id":"A2kvRfGnp_kt"},"source":["# PyTorch CUDA 101: GPU Optimization Mastery\n","\n","**From First Principles to Tensor Cores**\n","\n","This notebook demonstrates essential CUDA patterns in PyTorch, based on performance principles revealed by GPU microbenchmarking.\n","\n","## Key Principles:\n","1. Minimize GPU-CPU data transfers\n","2. Choose appropriate data types (float32 vs float64)\n","3. Batch operations to increase arithmetic intensity\n","4. Use in-place operations when possible\n","5. Leverage tensor cores for matrix operations\n","6. Understand memory access patterns\n","7. Profile to identify bottlenecks\n","\n","---"]},{"cell_type":"code","source":["!nvidia-smi"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"DD3gMGbPqN4P","executionInfo":{"status":"ok","timestamp":1755176308057,"user_tz":-420,"elapsed":119,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"875df861-8118-417f-f597-371b12037fa3"},"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["Thu Aug 14 12:58:27 2025 \n","+-----------------------------------------------------------------------------------------+\n","| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n","|-----------------------------------------+------------------------+----------------------+\n","| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n","| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n","| | | MIG M. |\n","|=========================================+========================+======================|\n","| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n","| N/A 53C P8 10W / 70W | 0MiB / 15360MiB | 0% Default |\n","| | | N/A |\n","+-----------------------------------------+------------------------+----------------------+\n"," \n","+-----------------------------------------------------------------------------------------+\n","| Processes: |\n","| GPU GI CI PID Type Process name GPU Memory |\n","| ID ID Usage |\n","|=========================================================================================|\n","| No running processes found |\n","+-----------------------------------------------------------------------------------------+\n"]}]},{"cell_type":"markdown","metadata":{"id":"vltvInI_p_ku"},"source":["## Setup and Imports"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9d38y-0tp_ku","executionInfo":{"status":"ok","timestamp":1755176315357,"user_tz":-420,"elapsed":6345,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"f4000e08-da34-423c-a035-994bd4911170"},"outputs":[{"output_type":"stream","name":"stdout","text":["✅ Using GPU: Tesla T4\n","✅ CUDA Version: 12.4\n","✅ PyTorch Version: 2.6.0+cu124\n"]}],"source":["import torch\n","import time\n","import math\n","from typing import Tuple, Optional\n","\n","def benchmark_operation(func, *args, num_iters=1000, warmup=100):\n"," \"\"\"Benchmark a PyTorch operation with proper CUDA synchronization.\"\"\"\n"," # Warmup to eliminate kernel compilation overhead\n"," for _ in range(warmup):\n"," func(*args)\n"," torch.cuda.synchronize()\n","\n"," # Actual timing\n"," start = time.perf_counter()\n"," for _ in range(num_iters):\n"," result = func(*args)\n"," torch.cuda.synchronize()\n","\n"," elapsed = time.perf_counter() - start\n"," return (elapsed / num_iters) * 1000 # Convert to milliseconds\n","\n","# Check CUDA availability\n","if not 
torch.cuda.is_available():\n"," raise RuntimeError(\"CUDA not available - GPU required for tutorial\")\n","\n","device = torch.device('cuda')\n","print(f'✅ Using GPU: {torch.cuda.get_device_name()}')\n","print(f'✅ CUDA Version: {torch.version.cuda}')\n","print(f'✅ PyTorch Version: {torch.__version__}')"]},{"cell_type":"markdown","metadata":{"id":"H-Ip68DFp_kv"},"source":["# Lesson 0: GPU Memory Baseline - Understanding CUDA Overhead\n","\n","**Key Reality Check:** CUDA kernels consume 1-2 GB regardless of your model size!\n","\n","You might think you could compute memory requirements exactly, but CUDA kernels require substantial overhead that makes precise calculations challenging."]},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"MzC_ZSalp_kv","executionInfo":{"status":"ok","timestamp":1755176322060,"user_tz":-420,"elapsed":324,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"b367dacd-246e-4005-fee0-a34b6f574546"},"outputs":[{"output_type":"stream","name":"stdout","text":["📊 Memory before CUDA initialization: 0.0 MB\n","📊 Memory after CUDA initialization: 0.0 MB\n","🎯 CUDA kernel overhead: 0.0 MB\n","\n","💡 This overhead is constant and unavoidable for any GPU computation!\n"," Additional memory used for buffers, intermediate results, and fragmentation\n"," makes precise memory calculations challenging - focus on relative improvements.\n"]}],"source":["# Demonstrate CUDA kernel memory overhead\n","torch.cuda.empty_cache()\n","torch.cuda.reset_peak_memory_stats()\n","baseline_memory = torch.cuda.memory_allocated() / 1024**2\n","\n","# Create minimal tensor to initialize CUDA context\n","minimal_tensor = torch.ones((1, 1), device='cuda')\n","cuda_overhead = torch.cuda.memory_allocated() / 1024**2\n","\n","print(f'📊 Memory before CUDA initialization: {baseline_memory:.1f} MB')\n","print(f'📊 Memory after CUDA initialization: {cuda_overhead:.1f} MB')\n","print(f'🎯 CUDA kernel overhead: {cuda_overhead - baseline_memory:.1f} MB')\n","print(f'\\n💡 This overhead is constant and unavoidable for any GPU computation!')\n","print(f' Additional memory used for buffers, intermediate results, and fragmentation')\n","print(f' makes precise memory calculations challenging - focus on relative improvements.')"]},{"cell_type":"markdown","metadata":{"id":"jCCwZc1Yp_kv"},"source":["# Lesson 1: Device Management & Tensor Creation\n","\n","**Principle:** Memory allocation location is immutable post-creation. 
CPU→GPU transfer involves PCIe bandwidth (~16GB/s) vs GPU memory bandwidth (~1500GB/s)."]},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vJ0s8NWtp_kw","executionInfo":{"status":"ok","timestamp":1755176335403,"user_tz":-420,"elapsed":1863,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"5645c438-8e8d-4f4f-93bb-d10da266bde9"},"outputs":[{"output_type":"stream","name":"stdout","text":["❌ BAD: Creating on CPU then moving to GPU\n","✅ GOOD: Creating directly on GPU\n","Bad approach: 7.84 ms\n","Good approach: 0.04 ms\n","Speedup: 206.32x\n","\n","🎯 Key Takeaway: Always create tensors directly on the target device\n"," Use device=\"cuda\" parameter in tensor creation functions\n"]}],"source":["print('❌ BAD: Creating on CPU then moving to GPU')\n","def bad_tensor_creation(size):\n"," x = torch.randn(size, size) # Created on CPU\n"," x = x.cuda() # Expensive CPU->GPU transfer\n"," return x\n","\n","print('✅ GOOD: Creating directly on GPU')\n","def good_tensor_creation(size):\n"," x = torch.randn(size, size, device='cuda') # Created directly on GPU\n"," return x\n","\n","size = 1024\n","bad_time = benchmark_operation(bad_tensor_creation, size, num_iters=100)\n","good_time = benchmark_operation(good_tensor_creation, size, num_iters=100)\n","\n","print(f'Bad approach: {bad_time:.2f} ms')\n","print(f'Good approach: {good_time:.2f} ms')\n","print(f'Speedup: {bad_time/good_time:.2f}x')\n","\n","print('\\n🎯 Key Takeaway: Always create tensors directly on the target device')\n","print(' Use device=\"cuda\" parameter in tensor creation functions')"]},{"cell_type":"markdown","metadata":{"id":"CWEYcjqap_kw"},"source":["# Lesson 2: Data Type Optimization\n","\n","**Surprising Implication:** Float16 isn't just 2x faster—it enables Tensor Cores (312 TFLOPS vs 19.5 TFLOPS). 
This demonstrates a 16x performance cliff, not gradual degradation."]},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"C2TBbxWbp_kw","executionInfo":{"status":"ok","timestamp":1755176398588,"user_tz":-420,"elapsed":12064,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"b708001f-89ce-45dc-80b0-d2991d744a90"},"outputs":[{"output_type":"stream","name":"stdout","text":["Float64: 71.54 ms\n","Float32: 4.58 ms (15.63x faster)\n","Float16: 1.15 ms (62.09x faster)\n","\n","🎯 Key Takeaway: Use float32 unless you need float64 precision\n"," Float16 is even faster but may have numerical stability issues\n","\n","Memory usage:\n","Float64: 7.6 MB\n","Float32: 3.8 MB\n"]}],"source":["size = 2048\n","\n","def matmul_float64():\n"," A = torch.randn(size, size, dtype=torch.float64, device='cuda')\n"," B = torch.randn(size, size, dtype=torch.float64, device='cuda')\n"," return torch.mm(A, B)\n","\n","def matmul_float32():\n"," A = torch.randn(size, size, dtype=torch.float32, device='cuda')\n"," B = torch.randn(size, size, dtype=torch.float32, device='cuda')\n"," return torch.mm(A, B)\n","\n","def matmul_float16():\n"," A = torch.randn(size, size, dtype=torch.float16, device='cuda')\n"," B = torch.randn(size, size, dtype=torch.float16, device='cuda')\n"," return torch.mm(A, B)\n","\n","time_f64 = benchmark_operation(matmul_float64, num_iters=50)\n","time_f32 = benchmark_operation(matmul_float32, num_iters=50)\n","time_f16 = benchmark_operation(matmul_float16, num_iters=50)\n","\n","print(f'Float64: {time_f64:.2f} ms')\n","print(f'Float32: {time_f32:.2f} ms ({time_f64/time_f32:.2f}x faster)')\n","print(f'Float16: {time_f16:.2f} ms ({time_f64/time_f16:.2f}x faster)')\n","\n","print('\\n🎯 Key Takeaway: Use float32 unless you need float64 precision')\n","print(' Float16 is even faster but may have numerical stability issues')\n","\n","# Memory usage comparison\n","f64_tensor = torch.randn(1000, 1000, dtype=torch.float64, device='cuda')\n","f32_tensor = torch.randn(1000, 1000, dtype=torch.float32, device='cuda')\n","\n","print(f'\\nMemory usage:')\n","print(f'Float64: {f64_tensor.element_size() * f64_tensor.numel() / 1024**2:.1f} MB')\n","print(f'Float32: {f32_tensor.element_size() * f32_tensor.numel() / 1024**2:.1f} MB')"]},{"cell_type":"markdown","metadata":{"id":"nMSyOSqnp_kw"},"source":["# Lesson 3: CPU-GPU Transfer Optimization\n","\n","**Hidden Cost:** Each transfer incurs ~10μs latency + bandwidth cost. 
For small operations, latency dominates—you're paying milliseconds to save microseconds."]},{"cell_type":"code","execution_count":6,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"dJ_6DtHzp_kw","executionInfo":{"status":"ok","timestamp":1755176416931,"user_tz":-420,"elapsed":350,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"0aaf1e5f-c83c-40a3-edb8-a5cf271b71dd"},"outputs":[{"output_type":"stream","name":"stdout","text":["❌ BAD: Frequent CPU-GPU transfers\n","✅ GOOD: Keep operations on GPU\n","Bad approach: 1.64 ms\n","Good approach: 0.02 ms\n","Speedup: 75.2x\n","\n","🎯 Key Takeaway: Keep data on GPU as long as possible\n"," Use PyTorch operations instead of numpy when possible\n"]}],"source":["x = torch.randn(1000, 1000, device='cuda')\n","\n","print('❌ BAD: Frequent CPU-GPU transfers')\n","def bad_cpu_gpu_pattern():\n"," # Convert to CPU, do numpy operation, back to GPU\n"," x_cpu = x.cpu().numpy() # GPU -> CPU\n"," result_cpu = x_cpu.sum() # CPU operation\n"," result_gpu = torch.tensor(result_cpu, device='cuda') # CPU -> GPU\n"," return result_gpu\n","\n","print('✅ GOOD: Keep operations on GPU')\n","def good_gpu_pattern():\n"," result = x.sum() # All on GPU\n"," return result\n","\n","bad_time = benchmark_operation(bad_cpu_gpu_pattern, num_iters=100)\n","good_time = benchmark_operation(good_gpu_pattern, num_iters=100)\n","\n","print(f'Bad approach: {bad_time:.2f} ms')\n","print(f'Good approach: {good_time:.2f} ms')\n","print(f'Speedup: {bad_time/good_time:.1f}x')\n","\n","print('\\n🎯 Key Takeaway: Keep data on GPU as long as possible')\n","print(' Use PyTorch operations instead of numpy when possible')"]},{"cell_type":"markdown","metadata":{"id":"WsLFJphvp_kw"},"source":["# Lesson 4: Batching for Arithmetic Intensity\n","\n","**First Principles:** Single operations have low arithmetic intensity (FLOPS/memory_access). 
Batching increases intensity from O(n²) to O(n³) for matrix operations."]},{"cell_type":"code","execution_count":7,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UkzNkIjip_kx","executionInfo":{"status":"ok","timestamp":1755176426734,"user_tz":-420,"elapsed":253,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"6c305f14-5f06-4df4-8968-c3554fa9abce"},"outputs":[{"output_type":"stream","name":"stdout","text":["❌ BAD: Processing one sample at a time\n","✅ GOOD: Batch processing\n","Bad approach: 1.71 ms\n","Good approach: 0.34 ms\n","Speedup: 5.03x\n","\n","🎯 Key Takeaway: Batch operations whenever possible\n"," Use bmm(), batch matrix operations, and higher-dimensional tensors\n"]}],"source":["print('❌ BAD: Processing one sample at a time')\n","def bad_sequential_processing():\n"," samples = [torch.randn(256, 256, device='cuda') for _ in range(32)]\n"," results = []\n"," for sample in samples:\n"," result = torch.mm(sample, sample.T) # Individual matrix multiply\n"," results.append(result)\n"," return torch.stack(results)\n","\n","print('✅ GOOD: Batch processing')\n","def good_batch_processing():\n"," # Create batched tensor directly\n"," batch = torch.randn(32, 256, 256, device='cuda')\n"," # Batched matrix multiply - much more efficient\n"," result = torch.bmm(batch, batch.transpose(-2, -1))\n"," return result\n","\n","bad_time = benchmark_operation(bad_sequential_processing, num_iters=10)\n","good_time = benchmark_operation(good_batch_processing, num_iters=10)\n","\n","print(f'Bad approach: {bad_time:.2f} ms')\n","print(f'Good approach: {good_time:.2f} ms')\n","print(f'Speedup: {bad_time/good_time:.2f}x')\n","\n","print('\\n🎯 Key Takeaway: Batch operations whenever possible')\n","print(' Use bmm(), batch matrix operations, and higher-dimensional tensors')"]},{"cell_type":"markdown","metadata":{"id":"oI9DFq-0p_kx"},"source":["# Lesson 5: In-place Operations\n","\n","**Memory Allocator Tax:** Each allocation involves GPU memory manager overhead. 
In-place operations eliminate allocation/deallocation cycles entirely."]},{"cell_type":"code","execution_count":8,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"3Jl88InLp_kx","executionInfo":{"status":"ok","timestamp":1755176436411,"user_tz":-420,"elapsed":256,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"f3ae7d30-bb88-467f-b401-11711dd1f3a0"},"outputs":[{"output_type":"stream","name":"stdout","text":["❌ BAD: Creating new tensors\n","✅ GOOD: In-place operations\n","Bad approach: 0.59 ms, 103.8 MB peak\n","Good approach: 0.52 ms, 71.8 MB peak\n","Speedup: 1.13x\n","Memory reduction: 1.45x\n","\n","🎯 Key Takeaway: Use in-place operations (add_, mul_, etc.)\n"," Reduces memory allocation and garbage collection overhead\n"]}],"source":["size = (2048, 2048)\n","\n","print('❌ BAD: Creating new tensors')\n","def bad_memory_allocation():\n"," x = torch.randn(*size, device='cuda')\n"," y = torch.randn(*size, device='cuda')\n"," z = x + y # Creates new tensor\n"," w = z * 2 # Creates another new tensor\n"," return w\n","\n","print('✅ GOOD: In-place operations')\n","def good_inplace_operations():\n"," x = torch.randn(*size, device='cuda')\n"," y = torch.randn(*size, device='cuda')\n"," x.add_(y) # In-place addition\n"," x.mul_(2) # In-place multiplication\n"," return x\n","\n","# Monitor memory usage\n","torch.cuda.empty_cache()\n","torch.cuda.reset_peak_memory_stats()\n","\n","bad_time = benchmark_operation(bad_memory_allocation, num_iters=50)\n","bad_memory = torch.cuda.max_memory_allocated() / 1024**2\n","\n","torch.cuda.empty_cache()\n","torch.cuda.reset_peak_memory_stats()\n","\n","good_time = benchmark_operation(good_inplace_operations, num_iters=50)\n","good_memory = torch.cuda.max_memory_allocated() / 1024**2\n","\n","print(f'Bad approach: {bad_time:.2f} ms, {bad_memory:.1f} MB peak')\n","print(f'Good approach: {good_time:.2f} ms, {good_memory:.1f} MB peak')\n","print(f'Speedup: {bad_time/good_time:.2f}x')\n","print(f'Memory reduction: {bad_memory/good_memory:.2f}x')\n","\n","print('\\n🎯 Key Takeaway: Use in-place operations (add_, mul_, etc.)')\n","print(' Reduces memory allocation and garbage collection overhead')"]},{"cell_type":"markdown","metadata":{"id":"7ISx7KIGp_kx"},"source":["# Lesson 6: Tensor Core Optimization\n","\n","**Hardware Constraint:** Tensor Cores operate on 4×4 matrices of float16. 
Misaligned dimensions force fallback to CUDA cores—a 16x performance penalty."]},{"cell_type":"code","execution_count":9,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"3oQ53k8pp_kx","executionInfo":{"status":"ok","timestamp":1755176445833,"user_tz":-420,"elapsed":246,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"2dd83185-735b-4129-9e85-771846cb998e"},"outputs":[{"output_type":"stream","name":"stdout","text":["Matrix multiply performance depends on tensor core compatibility:\n","Size Time (ms) TFLOPS Notes\n","--------------------------------------------------\n","512 0.08 3.36 ✅ TC-friendly\n","768 0.11 7.90 ✅ TC-friendly\n","1024 0.22 9.67 ✅ TC-friendly\n","1536 0.55 13.14 ✅ TC-friendly\n","2048 0.92 18.70 ✅ TC-friendly\n","\n","🎯 Key Takeaway: Use float16 and dimensions divisible by 16\n"," This maximizes tensor core utilization on modern GPUs\n"]}],"source":["print('Matrix multiply performance depends on tensor core compatibility:')\n","\n","# Test different matrix sizes - tensor cores prefer certain dimensions\n","sizes = [512, 768, 1024, 1536, 2048]\n","\n","print(f'{\"Size\":<8} {\"Time (ms)\":<10} {\"TFLOPS\":<10} {\"Notes\"}')\n","print('-' * 50)\n","\n","for size in sizes:\n"," def matmul_test():\n"," A = torch.randn(size, size, dtype=torch.float16, device='cuda')\n"," B = torch.randn(size, size, dtype=torch.float16, device='cuda')\n"," return torch.mm(A, B)\n","\n"," time_ms = benchmark_operation(matmul_test, num_iters=20)\n"," flops = 2 * size**3 # Matrix multiply FLOPS\n"," tflops = (flops / (time_ms * 1e-3)) / 1e12\n","\n"," # Tensor cores work best with dimensions divisible by 8/16\n"," tc_friendly = '✅ TC-friendly' if size % 16 == 0 else '⚠️ Sub-optimal'\n","\n"," print(f'{size:<8} {time_ms:<10.2f} {tflops:<10.2f} {tc_friendly}')\n","\n","print('\\n🎯 Key Takeaway: Use float16 and dimensions divisible by 16')\n","print(' This maximizes tensor core utilization on modern GPUs')"]},{"cell_type":"markdown","metadata":{"id":"Iz9G6I4fp_kx"},"source":["# Lesson 7: Memory Access Patterns\n","\n","**Memory Layout Principle:** GPU threads access memory in coalesced patterns. 
Non-contiguous access forces multiple memory transactions instead of single wide loads."]},{"cell_type":"code","execution_count":10,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"DRaeNq3Gp_kx","executionInfo":{"status":"ok","timestamp":1755176454772,"user_tz":-420,"elapsed":406,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"32d4ea35-0ffe-450a-d3f2-b2e297cd77e0"},"outputs":[{"output_type":"stream","name":"stdout","text":["❌ BAD: Non-contiguous memory access\n","✅ GOOD: Contiguous memory access\n","Bad approach: 0.27 ms\n","Good approach: 1.32 ms\n","Speedup: 0.21x\n","\n","Memory layout check:\n","Original tensor is_contiguous: True\n","Transposed tensor is_contiguous: False\n","After .contiguous(): True\n","\n","🎯 Key Takeaway: Use .contiguous() after shape operations\n"," Check .is_contiguous() and call .contiguous() when needed\n"]}],"source":["size = (4096, 4096)\n","x = torch.randn(*size, device='cuda')\n","\n","print('❌ BAD: Non-contiguous memory access')\n","def bad_memory_pattern():\n"," # Transpose creates a view with different strides\n"," x_t = x.T\n"," return torch.sum(x_t, dim=0) # Non-contiguous access\n","\n","print('✅ GOOD: Contiguous memory access')\n","def good_memory_pattern():\n"," # Make contiguous first\n"," x_t = x.T.contiguous()\n"," return torch.sum(x_t, dim=0) # Contiguous access\n","\n","bad_time = benchmark_operation(bad_memory_pattern, num_iters=100)\n","good_time = benchmark_operation(good_memory_pattern, num_iters=100)\n","\n","print(f'Bad approach: {bad_time:.2f} ms')\n","print(f'Good approach: {good_time:.2f} ms')\n","print(f'Speedup: {bad_time/good_time:.2f}x')\n","\n","print(f'\\nMemory layout check:')\n","print(f'Original tensor is_contiguous: {x.is_contiguous()}')\n","print(f'Transposed tensor is_contiguous: {x.T.is_contiguous()}')\n","print(f'After .contiguous(): {x.T.contiguous().is_contiguous()}')\n","\n","print('\\n🎯 Key Takeaway: Use .contiguous() after shape operations')\n","print(' Check .is_contiguous() and call .contiguous() when needed')"]},{"cell_type":"markdown","metadata":{"id":"V1E-TpW1p_kx"},"source":["# Lesson 8: Performance Profiling\n","\n","**Measurement Principle:** You cannot optimize what you cannot measure. 
The profiler reveals the actual bottleneck—often surprising compared to intuition."]},{"cell_type":"code","execution_count":11,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":382},"id":"ZQYGvFQdp_kx","executionInfo":{"status":"error","timestamp":1755176465409,"user_tz":-420,"elapsed":247,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"0134a7c8-94ef-4cfc-f62a-d0cba546cd64"},"outputs":[{"output_type":"stream","name":"stdout","text":["Running profiler example...\n"]},{"output_type":"error","ename":"RuntimeError","evalue":"element 0 of tensors does not require grad and does not have a grad_fn","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)","\u001b[0;32m/tmp/ipython-input-1548132624.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 25\u001b[0m ) as prof:\n\u001b[1;32m 26\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m \u001b[0mexample_neural_network\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 28\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;31m# Print profiling results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/tmp/ipython-input-1548132624.py\u001b[0m in \u001b[0;36mexample_neural_network\u001b[0;34m()\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;31m# Backward pass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 624\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 625\u001b[0m )\n\u001b[0;32m--> 626\u001b[0;31m torch.autograd.backward(\n\u001b[0m\u001b[1;32m 627\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 628\u001b[0m )\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;31m# some Python versions print out the first line of a multi-line function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0;31m# calls in the traceback and 
some print out the last line\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 347\u001b[0;31m _engine_run_backward(\n\u001b[0m\u001b[1;32m 348\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py\u001b[0m in \u001b[0;36m_engine_run_backward\u001b[0;34m(t_outputs, *args, **kwargs)\u001b[0m\n\u001b[1;32m 821\u001b[0m \u001b[0munregister_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_register_logging_hooks_on_whole_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt_outputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 822\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 823\u001b[0;31m return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n\u001b[0m\u001b[1;32m 824\u001b[0m \u001b[0mt_outputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 825\u001b[0m ) # Calls into the C++ engine to run the backward pass\n","\u001b[0;31mRuntimeError\u001b[0m: element 0 of tensors does not require grad and does not have a grad_fn"]}],"source":["def example_neural_network():\n"," # Simple neural network operations\n"," x = torch.randn(1024, 512, device='cuda')\n"," W1 = torch.randn(512, 256, device='cuda')\n"," W2 = torch.randn(256, 10, device='cuda')\n","\n"," # Forward pass\n"," h1 = torch.mm(x, W1)\n"," h1 = torch.relu(h1)\n"," output = torch.mm(h1, W2)\n"," loss = torch.sum(output**2)\n","\n"," # Backward pass\n"," loss.backward()\n","\n"," return loss\n","\n","print('Running profiler example...')\n","\n","# Profile the neural network\n","with torch.profiler.profile(\n"," activities=[torch.profiler.ProfilerActivity.CPU,\n"," torch.profiler.ProfilerActivity.CUDA],\n"," record_shapes=True,\n",") as prof:\n"," for _ in range(10):\n"," example_neural_network()\n","\n","# Print profiling results\n","print('\\nTop 5 GPU operations by time:')\n","print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=5))\n","\n","print('\\n🎯 Key Takeaway: Use torch.profiler to identify bottlenecks')\n","print(' Focus optimization efforts on the most time-consuming operations')"]},{"cell_type":"markdown","metadata":{"id":"dtm6fq10p_kx"},"source":["# Summary: PyTorch CUDA Best Practices\n","\n","## The Systematic Optimization Framework\n","\n","**P(optimization_success|measurement) >> P(optimization_success|intuition)**\n","\n","### Core Practices:\n","\n","1. **📱 Create tensors directly on GPU** with `device='cuda'`\n","2. **🔢 Use float32** unless float64 precision is required\n","3. **🚫 Minimize CPU-GPU transfers** (`.cpu()`, `.cuda()`)\n","4. **📦 Batch operations** using `bmm()`, 3D+ tensors\n","5. **⚡ Use in-place operations** (`add_`, `mul_`, etc.) to save memory\n","6. **🎯 Leverage tensor cores** with float16 + dims divisible by 16\n","7. **🧠 Ensure memory contiguity** with `.contiguous()`\n","8. **📊 Profile code** to identify actual bottlenecks\n","9. **🔄 Always use `torch.cuda.synchronize()`** for accurate timing\n","10. 
**🎮 Understand hardware limits** (memory vs compute bound)\n","\n","### The Three Performance Regimes:\n","\n","| **Regime** | **Characteristics** | **Solutions** |\n","|------------|--------------------|--------------|\n","| **Overhead-Bound** | Runtime doesn't scale with data size | Tracing, operator fusion, JIT compilation |\n","| **Memory-Bound** | Low FLOPS utilization, high bandwidth | Operator fusion, increase arithmetic intensity |\n","| **Compute-Bound** | High FLOPS utilization | Use Tensor Cores, upgrade hardware |\n","\n","### Key Formulas:\n","\n","- **Arithmetic Intensity** = `FLOPS / Bytes_Accessed`\n","- **Memory Usage** = `batch_size × seq_len × hidden_dim × bytes_per_element`\n","- **P(tensor_core_usage|float16 + aligned_dims) ≈ 1.0**\n","\n","**Remember:** The microbenchmarking results show that performance depends on arithmetic intensity. Optimize based on whether your operations are memory-bound or compute-bound!"]}],"metadata":{"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.0"},"colab":{"provenance":[],"gpuType":"T4"},"accelerator":"GPU"},"nbformat":4,"nbformat_minor":0}
qwen-3.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
qwen3-4b.ipynb
ADDED
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"machine_shape":"hm","gpuType":"L4","authorship_tag":"ABX9TyOl5a9C9lnUUtmpeE7bvvie"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","widgets":{"application/vnd.jupyter.widget-state+json":{"64a5ce5a5f7d41ca92da6c112ca675a5":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_a29960a153514e19a7de1eb413012d05","IPY_MODEL_11456cafe70f411181ec1761bc9f0923","IPY_MODEL_07b4903e48cf4b148b0957a8004a4044"],"layout":"IPY_MODEL_dd02315346ef4da99d15d059956aa0f8"}},"a29960a153514e19a7de1eb413012d05":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_861f738e8c93404593ddaafb14048b96","placeholder":"","style":"IPY_MODEL_bae57c2ea63a41fc816392227a2548ab","value":"tokenizer_config.json: "}},"11456cafe70f411181ec1761bc9f0923":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_ad19f4d6780044588e4a6171aa7c4d2a","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_6845f74a8e0c40e3924114146ae8cc3b","value":1}},"07b4903e48cf4b148b0957a8004a4044":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_96467b66ff944f71bcc186145c2270de","placeholder":"","style":"IPY_MODEL_1cfd960a5cc34f40accdd539651951c7","value":" 10.8k/? 
[00:00<00:00, 1.01MB/s]"}},"dd02315346ef4da99d15d059956aa0f8":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"861f738e8c93404593ddaafb14048b96":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"bae57c2ea63a41fc816392227a2548ab":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ad19f4d6780044588e4a6171aa7c4d2a":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"68
45f74a8e0c40e3924114146ae8cc3b":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"96467b66ff944f71bcc186145c2270de":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1cfd960a5cc34f40accdd539651951c7":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"ec56b2f130714423b18ad5721a52ae74":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_3db7bbb9eeba4f24a10e67ab866456b4","IPY_MODEL_9d20cc9b01be41cf8cd2c90e13c21268","IPY_MODEL_4ac0aa76cf9c4803b28513ccf6d355d7"],"layout":"IPY_MODEL_0e2386e11c074288868b3574806142be"}},"3db7bbb9eeba4f24a10e67ab866456b4":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_84edf8019b094152ac6e0fd3f03bc65e","placeholder":"","style":"IPY_MODEL_900c31d50cb04fbaae78e5d77a92d8ae","value":"vocab.json: 
"}},"9d20cc9b01be41cf8cd2c90e13c21268":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_8cafb694c5484e529ce50d51bde6a76b","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_96a12716da8845818b65f20a0a3edb74","value":1}},"4ac0aa76cf9c4803b28513ccf6d355d7":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_be4a073d495e4311b0b9d6790e5088ac","placeholder":"","style":"IPY_MODEL_96cf8d3454f54a18b94f2736035fa6cb","value":" 2.78M/? [00:00<00:00, 8.01MB/s]"}},"0e2386e11c074288868b3574806142be":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"84edf8019b094152ac6e0fd3f03bc65e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"900c31d50cb04fbaae78e5d77a92d8ae":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"Des
criptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"8cafb694c5484e529ce50d51bde6a76b":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"96a12716da8845818b65f20a0a3edb74":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"be4a073d495e4311b0b9d6790e5088ac":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"96cf8d3454f54a18b94f2736035fa6cb":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"1a949c4edb16410f8d6698a964902f62":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_17df5653418b4db492df9a6750cea42a","IPY_MODEL_7545c43d731e4b81b360b89f2
d548faf","IPY_MODEL_4f620d9dc4914b4bbf765525232e511b"],"layout":"IPY_MODEL_0eebb83c3d43496982e4160c8b1bb300"}},"17df5653418b4db492df9a6750cea42a":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_938d3390ccfb4845b25780f254c0dbce","placeholder":"","style":"IPY_MODEL_1cde36a9370a4acdb569e972e7d44519","value":"merges.txt: "}},"7545c43d731e4b81b360b89f2d548faf":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_3e5fea43577a4a92a3ce9dc990b29c70","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_66a90ef44dee4459832a5f0c6c75e21a","value":1}},"4f620d9dc4914b4bbf765525232e511b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_d1a256282c5f40a1832a2bd2bc815580","placeholder":"","style":"IPY_MODEL_3bd5a0dd220f4b6a99c68ef7af275559","value":" 1.67M/? 
[00:00<00:00, 66.2kB/s]"}},"0eebb83c3d43496982e4160c8b1bb300":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"938d3390ccfb4845b25780f254c0dbce":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"1cde36a9370a4acdb569e972e7d44519":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"3e5fea43577a4a92a3ce9dc990b29c70":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"66
a90ef44dee4459832a5f0c6c75e21a":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"d1a256282c5f40a1832a2bd2bc815580":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"3bd5a0dd220f4b6a99c68ef7af275559":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"49d5b4c2cb5e4f31925dfc9efc35fad8":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_abed0371092b4857983ccb9b9c9155a8","IPY_MODEL_63add0f0320f44b2b73f0304a0646c0a","IPY_MODEL_49103807b86b41e5983b5f0567a54e62"],"layout":"IPY_MODEL_6b626ec5eb3544969865bb37123eefb0"}},"abed0371092b4857983ccb9b9c9155a8":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_72682a8ea8394371a5fb40e041712a91","placeholder":"","style":"IPY_MODEL_e9175d44c63d4826a2bc1b22e1783426","value":"tokenizer.json: 
100%"}},"63add0f0320f44b2b73f0304a0646c0a":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_cebbe7056e76465f8194dcf41660febd","max":11422654,"min":0,"orientation":"horizontal","style":"IPY_MODEL_c621799772174f1d949bc6858bca2abb","value":11422654}},"49103807b86b41e5983b5f0567a54e62":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_2c046cfe937c4750bbf4602b47b2959e","placeholder":"","style":"IPY_MODEL_67dcbef9127c4ed8b1733c523d66e7c8","value":" 11.4M/11.4M [00:01<00:00, 20.5kB/s]"}},"6b626ec5eb3544969865bb37123eefb0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"72682a8ea8394371a5fb40e041712a91":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e9175d44c63d4826a2bc1b22e1783426":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5
.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"cebbe7056e76465f8194dcf41660febd":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c621799772174f1d949bc6858bca2abb":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"2c046cfe937c4750bbf4602b47b2959e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"67dcbef9127c4ed8b1733c523d66e7c8":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"8bab96c4757c41509f1b98b7311498a0":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_f221547373044253a3d91d1e20ff4b39","IPY_MODEL_17ba5
874b8bf4df6b2351426a17a2725","IPY_MODEL_42b59cf9890a436ca1e262fb9a77024f"],"layout":"IPY_MODEL_e6021618b18a4b279e46dcbf2eac37fb"}},"f221547373044253a3d91d1e20ff4b39":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_00d264a6166446dda3a7e585c8f995b0","placeholder":"","style":"IPY_MODEL_6a1f4aef8a9e4dd0b06b568abd370c6f","value":"config.json: "}},"17ba5874b8bf4df6b2351426a17a2725":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_f1d34e2886524c3bbf1be92f4f337a3e","max":1,"min":0,"orientation":"horizontal","style":"IPY_MODEL_ddc460f1ce044601a6e1fd7dbdce71cf","value":1}},"42b59cf9890a436ca1e262fb9a77024f":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_0c1023c1e84046c0bfaa2a6b43523790","placeholder":"","style":"IPY_MODEL_9cdb4f69486541a6b89647805349a0e3","value":" 4.21k/? 
[00:00<00:00, 428kB/s]"}},"e6021618b18a4b279e46dcbf2eac37fb":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"00d264a6166446dda3a7e585c8f995b0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"6a1f4aef8a9e4dd0b06b568abd370c6f":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"f1d34e2886524c3bbf1be92f4f337a3e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":"20px"}},"ddc
460f1ce044601a6e1fd7dbdce71cf":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"0c1023c1e84046c0bfaa2a6b43523790":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"9cdb4f69486541a6b89647805349a0e3":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"82d166965030426f90e3c5f2813683fc":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_e766b89cdb7f47bdb07106ec007f43da","IPY_MODEL_b0a6d34f2e8f446d8ae054b586d59486","IPY_MODEL_7b77a7fd0a1242f188a3395fc83a4836"],"layout":"IPY_MODEL_b15e7170f8694e18bc1cba176497a48c"}},"e766b89cdb7f47bdb07106ec007f43da":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_e9ff7705176f4b3cb8b8d8dec78dc93e","placeholder":"","style":"IPY_MODEL_c3e0938fa8614f9aa489520265dda514","value":"model.safetensors: 
100%"}},"b0a6d34f2e8f446d8ae054b586d59486":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_72684f4c1cfc4c07b99ef3e89be83ef0","max":5190053264,"min":0,"orientation":"horizontal","style":"IPY_MODEL_663488499dc244e29f1a7a3324d8d8d0","value":5190053264}},"7b77a7fd0a1242f188a3395fc83a4836":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_50b1df97b11b4716b548d269fe608c2c","placeholder":"","style":"IPY_MODEL_5734c9ce79644d208223d29f1ca2a386","value":" 5.19G/5.19G [01:48<00:00, 62.2MB/s]"}},"b15e7170f8694e18bc1cba176497a48c":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"e9ff7705176f4b3cb8b8d8dec78dc93e":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c3e0938fa8614f9aa489520265dda514":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":
"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"72684f4c1cfc4c07b99ef3e89be83ef0":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"663488499dc244e29f1a7a3324d8d8d0":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"50b1df97b11b4716b548d269fe608c2c":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"5734c9ce79644d208223d29f1ca2a386":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"80a47421025c49288b38728c3103d9d1":{"model_module":"@jupyter-widgets/controls","model_name":"HBoxModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HBoxModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HBoxView","box_style":"","children":["IPY_MODEL_ff25de5230074347ba7cf6feb43c480e","IPY_MODEL_4
1bef6e0a5b249628b5b4ae2ec56d984","IPY_MODEL_6f566b0d1bf74caf8b215e95fcfcb20b"],"layout":"IPY_MODEL_f3dd0988304348009a1e2babb304a594"}},"ff25de5230074347ba7cf6feb43c480e":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_c714f72305bb4365ab3308766f4deda1","placeholder":"","style":"IPY_MODEL_bca9395dbb904df29502faca115cc985","value":"generation_config.json: 100%"}},"41bef6e0a5b249628b5b4ae2ec56d984":{"model_module":"@jupyter-widgets/controls","model_name":"FloatProgressModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"FloatProgressModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"ProgressView","bar_style":"success","description":"","description_tooltip":null,"layout":"IPY_MODEL_a43107d0d7a84134a717eb41de575555","max":239,"min":0,"orientation":"horizontal","style":"IPY_MODEL_081250bd35184357a995acd3f73d2088","value":239}},"6f566b0d1bf74caf8b215e95fcfcb20b":{"model_module":"@jupyter-widgets/controls","model_name":"HTMLModel","model_module_version":"1.5.0","state":{"_dom_classes":[],"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"HTMLModel","_view_count":null,"_view_module":"@jupyter-widgets/controls","_view_module_version":"1.5.0","_view_name":"HTMLView","description":"","description_tooltip":null,"layout":"IPY_MODEL_3b603589159d4440a721cb9e2390fb85","placeholder":"","style":"IPY_MODEL_edf6623bcd2d4b3cbee58d8b8db38271","value":" 239/239 [00:00<00:00, 
30.4kB/s]"}},"f3dd0988304348009a1e2babb304a594":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"c714f72305bb4365ab3308766f4deda1":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"bca9395dbb904df29502faca115cc985":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}},"a43107d0d7a84134a717eb41de575555":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"081250bd35184357a9
95acd3f73d2088":{"model_module":"@jupyter-widgets/controls","model_name":"ProgressStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"ProgressStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","bar_color":null,"description_width":""}},"3b603589159d4440a721cb9e2390fb85":{"model_module":"@jupyter-widgets/base","model_name":"LayoutModel","model_module_version":"1.2.0","state":{"_model_module":"@jupyter-widgets/base","_model_module_version":"1.2.0","_model_name":"LayoutModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"LayoutView","align_content":null,"align_items":null,"align_self":null,"border":null,"bottom":null,"display":null,"flex":null,"flex_flow":null,"grid_area":null,"grid_auto_columns":null,"grid_auto_flow":null,"grid_auto_rows":null,"grid_column":null,"grid_gap":null,"grid_row":null,"grid_template_areas":null,"grid_template_columns":null,"grid_template_rows":null,"height":null,"justify_content":null,"justify_items":null,"left":null,"margin":null,"max_height":null,"max_width":null,"min_height":null,"min_width":null,"object_fit":null,"object_position":null,"order":null,"overflow":null,"overflow_x":null,"overflow_y":null,"padding":null,"right":null,"top":null,"visibility":null,"width":null}},"edf6623bcd2d4b3cbee58d8b8db38271":{"model_module":"@jupyter-widgets/controls","model_name":"DescriptionStyleModel","model_module_version":"1.5.0","state":{"_model_module":"@jupyter-widgets/controls","_model_module_version":"1.5.0","_model_name":"DescriptionStyleModel","_view_count":null,"_view_module":"@jupyter-widgets/base","_view_module_version":"1.2.0","_view_name":"StyleView","description_width":""}}}}},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":279,"referenced_widgets":["64a5ce5a5f7d41ca92da6c112ca675a5","a29960a153514e19a7de1eb413012d05","11456cafe70f411181ec1761bc9f0923","07b4903e48cf4b148b0957a8004a4044","dd02315346ef4da99d15d059956aa0f8","861f738e8c93404593ddaafb14048b96","bae57c2ea63a41fc816392227a2548ab","ad19f4d6780044588e4a6171aa7c4d2a","6845f74a8e0c40e3924114146ae8cc3b","96467b66ff944f71bcc186145c2270de","1cfd960a5cc34f40accdd539651951c7","ec56b2f130714423b18ad5721a52ae74","3db7bbb9eeba4f24a10e67ab866456b4","9d20cc9b01be41cf8cd2c90e13c21268","4ac0aa76cf9c4803b28513ccf6d355d7","0e2386e11c074288868b3574806142be","84edf8019b094152ac6e0fd3f03bc65e","900c31d50cb04fbaae78e5d77a92d8ae","8cafb694c5484e529ce50d51bde6a76b","96a12716da8845818b65f20a0a3edb74","be4a073d495e4311b0b9d6790e5088ac","96cf8d3454f54a18b94f2736035fa6cb","1a949c4edb16410f8d6698a964902f62","17df5653418b4db492df9a6750cea42a","7545c43d731e4b81b360b89f2d548faf","4f620d9dc4914b4bbf765525232e511b","0eebb83c3d43496982e4160c8b1bb300","938d3390ccfb4845b25780f254c0dbce","1cde36a9370a4acdb569e972e7d44519","3e5fea43577a4a92a3ce9dc990b29c70","66a90ef44dee4459832a5f0c6c75e21a","d1a256282c5f40a1832a2bd2bc815580","3bd5a0dd220f4b6a99c68ef7af275559","49d5b4c2cb5e4f31925dfc9efc35fad8","abed0371092b4857983ccb9b9c9155a8","63add0f0320f44b2b73f0304a0646c0a","49103807b86b41e5983b5f0567a54e62","6b626ec5eb3544969865bb37123eefb0","72682a8ea8394371a5fb40e041712a91","e9175d44c63d4826a2bc1b22e1783426","cebbe7056e76465f8194dcf41660febd","c621799772174f1d949bc6858bca2abb","2c046cfe937c4750bbf4602b47b2959e","67dcbef9127c4ed8b1733c523d66e7c8","8bab96c4757c41509f1b9
8b7311498a0","f221547373044253a3d91d1e20ff4b39","17ba5874b8bf4df6b2351426a17a2725","42b59cf9890a436ca1e262fb9a77024f","e6021618b18a4b279e46dcbf2eac37fb","00d264a6166446dda3a7e585c8f995b0","6a1f4aef8a9e4dd0b06b568abd370c6f","f1d34e2886524c3bbf1be92f4f337a3e","ddc460f1ce044601a6e1fd7dbdce71cf","0c1023c1e84046c0bfaa2a6b43523790","9cdb4f69486541a6b89647805349a0e3","82d166965030426f90e3c5f2813683fc","e766b89cdb7f47bdb07106ec007f43da","b0a6d34f2e8f446d8ae054b586d59486","7b77a7fd0a1242f188a3395fc83a4836","b15e7170f8694e18bc1cba176497a48c","e9ff7705176f4b3cb8b8d8dec78dc93e","c3e0938fa8614f9aa489520265dda514","72684f4c1cfc4c07b99ef3e89be83ef0","663488499dc244e29f1a7a3324d8d8d0","50b1df97b11b4716b548d269fe608c2c","5734c9ce79644d208223d29f1ca2a386","80a47421025c49288b38728c3103d9d1","ff25de5230074347ba7cf6feb43c480e","41bef6e0a5b249628b5b4ae2ec56d984","6f566b0d1bf74caf8b215e95fcfcb20b","f3dd0988304348009a1e2babb304a594","c714f72305bb4365ab3308766f4deda1","bca9395dbb904df29502faca115cc985","a43107d0d7a84134a717eb41de575555","081250bd35184357a995acd3f73d2088","3b603589159d4440a721cb9e2390fb85","edf6623bcd2d4b3cbee58d8b8db38271"]},"id":"7LAppEc4PAIG","executionInfo":{"status":"ok","timestamp":1754884842632,"user_tz":-420,"elapsed":159157,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"c4183d78-8655-4233-f611-6e41a170d40d"},"outputs":[{"output_type":"display_data","data":{"text/plain":["tokenizer_config.json: 0.00B [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"64a5ce5a5f7d41ca92da6c112ca675a5"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["vocab.json: 0.00B [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"ec56b2f130714423b18ad5721a52ae74"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["merges.txt: 0.00B [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"1a949c4edb16410f8d6698a964902f62"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["tokenizer.json: 0%| | 0.00/11.4M [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"49d5b4c2cb5e4f31925dfc9efc35fad8"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["config.json: 0.00B [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"8bab96c4757c41509f1b98b7311498a0"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["model.safetensors: 0%| | 0.00/5.19G [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"82d166965030426f90e3c5f2813683fc"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["generation_config.json: 0%| | 0.00/239 [00:00<?, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"80a47421025c49288b38728c3103d9d1"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["content: A large language model (LLM) is a type of artificial intelligence model trained on vast amounts of text data from the internet, books, and other sources. It learns to understand and generate human-like text by identifying patterns in language, such as grammar, context, and meaning. These models use deep learning techniques, particularly transformer architectures, to process and produce natural language responses. 
LLMs can answer questions, write stories, code, summarize content, and more—often in a way that mimics human conversation. Because they are trained on enormous datasets, they can generate coherent and contextually relevant responses, though they may sometimes produce inaccurate or biased information. Examples include GPT-3, GPT-4, and Llama.\n"]}],"source":["from transformers import AutoModelForCausalLM, AutoTokenizer\n","\n","model_name = \"Qwen/Qwen3-4B-Instruct-2507-FP8\"\n","\n","# load the tokenizer and the model\n","tokenizer = AutoTokenizer.from_pretrained(model_name)\n","model = AutoModelForCausalLM.from_pretrained(\n"," model_name,\n"," torch_dtype=\"auto\",\n"," device_map=\"auto\"\n",")\n","\n","# prepare the model input\n","prompt = \"Give me a short introduction to large language model.\"\n","messages = [\n"," {\"role\": \"user\", \"content\": prompt}\n","]\n","text = tokenizer.apply_chat_template(\n"," messages,\n"," tokenize=False,\n"," add_generation_prompt=True,\n",")\n","model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n","\n","# conduct text completion\n","generated_ids = model.generate(\n"," **model_inputs,\n"," max_new_tokens=16384\n",")\n","output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()\n","\n","content = tokenizer.decode(output_ids, skip_special_tokens=True)\n","\n","print(\"content:\", content)\n"]},{"cell_type":"code","source":["import time\n","\n","tic = time.time()\n","messages = [\n"," {\"role\": \"system\", \"content\": \"Chỉ đưa ra chữ cái đứng trước câu trả lời đúng (A, B, C, D hoặc E) của câu hỏi trắc nghiệm sau\"},\n"," {\"role\": \"user\", \"content\": \"Question: Một nền kinh tế trong trạng thái toàn dụng nhân công có nghĩa là:\\n\\nA. Không còn lạm phát nhưng có thể còn thất nghiệp\\nB. Không còn thất nghiệp nhưng có thể còn lạm phát\\nC. Không còn thất nghiệp và không còn lạm phát\\nD. 
Vẫn còn một tỷ lệ lạm phát và tỷ lệ thất nghiệp nhất định\", \"reasoning_effort\": \"medium\"},\n","]\n","\n","inputs = tokenizer.apply_chat_template(\n"," messages,\n"," add_generation_prompt=True,\n"," return_tensors=\"pt\",\n"," return_dict=True,\n",").to(model.device)\n","\n","generated = model.generate(**inputs, max_new_tokens=1, temperature=0.1)\n","toc = time.time()\n","print(f\"runtime: {toc-tic:.02f}\")\n","print(tokenizer.decode(generated[0][inputs[\"input_ids\"].shape[-1]:]))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"FRpMRwbqPDeV","executionInfo":{"status":"ok","timestamp":1754885251377,"user_tz":-420,"elapsed":171,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"896d05c1-2d77-4fa7-d411-a10d0f09b03e"},"execution_count":18,"outputs":[{"output_type":"stream","name":"stdout","text":["runtime: 0.17\n","C\n"]}]},{"cell_type":"code","source":["prompt = \"who is andrej karpathy\"\n","messages = [\n"," {\"role\": \"user\", \"content\": prompt}\n","]\n","text = tokenizer.apply_chat_template(\n"," messages,\n"," tokenize=False,\n"," add_generation_prompt=True,\n",")\n","model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n","\n","# conduct text completion\n","generated_ids = model.generate(\n"," **model_inputs,\n"," max_new_tokens=16384\n",")\n","output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()\n","\n","content = tokenizer.decode(output_ids, skip_special_tokens=True)\n","\n","print(\"content:\", content)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"bXEOQySBRXK8","executionInfo":{"status":"ok","timestamp":1754885513347,"user_tz":-420,"elapsed":46352,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"f0b18c3a-af23-4b89-936b-69fdbe278bb5"},"execution_count":19,"outputs":[{"output_type":"stream","name":"stdout","text":["content: Andrej Karpathy is a prominent figure in the field of artificial intelligence and machine learning. He is a computer scientist and software engineer, best known for his work at **Neural Networks and Deep Learning** at **Tesla**, where he serves as the **Head of AI at Tesla** and is a key contributor to the development of Tesla’s AI systems, including autonomous driving and neural network-based technologies.\n","\n","Before joining Tesla, Karpathy was a **Principal Research Scientist at NVIDIA**, where he played a major role in advancing deep learning research and tools. He is also widely recognized for his influential blog posts and open-source contributions, particularly in the area of **deep learning education and practical implementation**.\n","\n","One of his most well-known contributions is the **\"Neural Networks and Deep Learning\"** book and accompanying tutorials, which have become foundational resources for students and developers learning machine learning. His clear, accessible explanations helped democratize understanding of neural networks and deep learning.\n","\n","Karpathy is also known for his work on **Transformer models**, **language models**, and **efficient training techniques**. 
He has contributed to open-source projects like **PyTorch**, the deep learning framework developed by Facebook (Meta), and has been a strong advocate for making AI accessible and practical.\n","\n","In summary:\n","- **Andrej Karpathy** is a leading AI researcher and engineer.\n","- He works at **Tesla** (Head of AI) and previously at **NVIDIA**.\n","- He is known for his educational content, open-source contributions, and deep learning research.\n","- His work has significantly influenced the development and popularization of deep learning.\n","\n","He is often cited as one of the most influential figures in making machine learning more approachable and widely adopted.\n"]}]},{"cell_type":"code","source":["prompt = \"who is vincent warmerdam\"\n","messages = [\n"," {\"role\": \"user\", \"content\": prompt}\n","]\n","text = tokenizer.apply_chat_template(\n"," messages,\n"," tokenize=False,\n"," add_generation_prompt=True,\n",")\n","model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n","\n","# conduct text completion\n","generated_ids = model.generate(\n"," **model_inputs,\n"," max_new_tokens=16384\n",")\n","output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()\n","\n","content = tokenizer.decode(output_ids, skip_special_tokens=True)\n","\n","print(\"content:\", content)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"eCgvjoe3UysU","executionInfo":{"status":"ok","timestamp":1754885839054,"user_tz":-420,"elapsed":35534,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"dcd69ea4-42f4-4404-f047-4a514920f623"},"execution_count":20,"outputs":[{"output_type":"stream","name":"stdout","text":["content: As of now, there is no widely known public figure named Vincent Warmerdam in major international databases, news sources, or academic references. It's possible that the name is misspelled, fictional, or refers to a private individual not publicly recognized.\n","\n","If you meant **Vincent Warmerdam**, a known figure in the field of geospatial science, it's possible you are referring to **Vincent Warmerdam**, a Dutch geospatial scientist and software developer. He is known for his work in open-source geospatial software, particularly in the development and maintenance of **GDAL (Geospatial Data Abstraction Library)**, a key tool used in geospatial data processing.\n","\n","Vincent Warmerdam has contributed significantly to the open-source geospatial community, especially in the areas of raster and vector data handling, image processing, and geospatial interoperability. He has also been involved in various open-source projects and has authored or co-authored technical documentation and tutorials.\n","\n","So, in summary:\n","\n","✅ **Vincent Warmerdam** is a known contributor to open-source geospatial software, particularly GDAL. 
\n","❌ He is not a public figure in entertainment, politics, or general pop culture.\n","\n","If you meant someone else or have a specific context (e.g., a book, company, or academic paper), feel free to clarify!\n"]}]},{"cell_type":"code","source":["prompt = \"who is to lam\"\n","messages = [\n"," {\"role\": \"user\", \"content\": prompt}\n","]\n","text = tokenizer.apply_chat_template(\n"," messages,\n"," tokenize=False,\n"," add_generation_prompt=True,\n",")\n","model_inputs = tokenizer([text], return_tensors=\"pt\").to(model.device)\n","\n","# conduct text completion\n","generated_ids = model.generate(\n"," **model_inputs,\n"," max_new_tokens=16384\n",")\n","output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()\n","\n","content = tokenizer.decode(output_ids, skip_special_tokens=True)\n","\n","print(\"content:\", content)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"P2OtxSUfWE2k","executionInfo":{"status":"ok","timestamp":1754886016390,"user_tz":-420,"elapsed":27781,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"280f1fde-115b-4d57-fc19-fbe1627cb9fb"},"execution_count":21,"outputs":[{"output_type":"stream","name":"stdout","text":["content: As of now, there is no widely known public figure or notable individual named \"To Lam\" in major international or global contexts. It's possible that \"To Lam\" could be a misspelling, a less-known local figure, or a name used in a specific regional or professional context.\n","\n","If you meant **\"Tô Lâm\"**, that is a Vietnamese name. Tô Lâm is a prominent Vietnamese political figure. Specifically:\n","\n","- **Tô Lâm** is the current **Prime Minister of Vietnam** (as of 2024).\n","- He was born in 1965 and has held several high-level government positions, including Minister of Finance and Minister of Industry and Trade.\n","- He became Prime Minister in 2021, succeeding Nguyễn Xuân Phát.\n","\n","So, if you meant **Tô Lâm**, then he is a key political leader in Vietnam.\n","\n","If you meant someone else or have a different context (e.g., a fictional character, a businessperson, or a different region), please provide more details so I can assist you better.\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"fwd5oX5JWyCo"},"execution_count":null,"outputs":[]}]}
runs/Aug15_06-27-56_3ddbd7fd5744/events.out.tfevents.1755239294.3ddbd7fd5744.339.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae4e7d006d9520a9783116f7d7e9f99e5294e004dd86f07c2ce4148563c8aa63
+size 16564
special_tokens_map.json
ADDED
@@ -0,0 +1,33 @@
+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
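The special tokens declared above can be checked once the tokenizer files in this commit are loaded. The snippet below is only an illustrative sketch, not part of the diff; the repo id "laampt/llm" is an assumption about where this commit lives.

```python
# Illustrative sketch (not part of the commit): load the tokenizer saved in this
# repo and print the special tokens defined by special_tokens_map.json.
# The repo id "laampt/llm" is an assumption, not something this diff specifies.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("laampt/llm")
print(tok.special_tokens_map)            # e.g. the bos/eos/pad/unk entries above
print(tok.bos_token, tok.eos_token, tok.pad_token)
```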
token_analysis.ipynb
ADDED
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"machine_shape":"hm","gpuType":"L4","authorship_tag":"ABX9TyO9WIr+dMkZzui0zEfQ5GlL"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"DrPDyW5bjJh2","executionInfo":{"status":"ok","timestamp":1746937541098,"user_tz":-420,"elapsed":207,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"e7fd988c-82b4-4256-e5c7-5b4231d49e1f"},"outputs":[{"output_type":"stream","name":"stdout","text":["Sun May 11 04:25:41 2025 \n","+-----------------------------------------------------------------------------------------+\n","| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n","|-----------------------------------------+------------------------+----------------------+\n","| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n","| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n","| | | MIG M. |\n","|=========================================+========================+======================|\n","| 0 NVIDIA L4 Off | 00000000:00:03.0 Off | 0 |\n","| N/A 64C P8 19W / 72W | 0MiB / 23034MiB | 0% Default |\n","| | | N/A |\n","+-----------------------------------------+------------------------+----------------------+\n"," \n","+-----------------------------------------------------------------------------------------+\n","| Processes: |\n","| GPU GI CI PID Type Process name GPU Memory |\n","| ID ID Usage |\n","|=========================================================================================|\n","| No running processes found |\n","+-----------------------------------------------------------------------------------------+\n"]}],"source":["!nvidia-smi"]},{"cell_type":"code","source":["import torch\n","import torch.nn as nn\n","from transformers import GPT2Model, GPT2Config, GPT2Tokenizer\n","import time\n","import copy\n","\n","# 0. Define the Tanh \"Normalization\" Layer\n","class TanhReplacement(nn.Module):\n"," def __init__(self, original_ln_config=None): # original_ln_config is for API compatibility if needed by some architectures\n"," super().__init__()\n"," # Tanh has no learnable parameters or specific configuration like hidden_size or eps\n"," # If the original LayerNorm had learnable parameters, they are now gone.\n"," # The shape of the output will be the same as the input, just like LayerNorm.\n"," if original_ln_config:\n"," self.normalized_shape = original_ln_config.normalized_shape\n"," self.eps = original_ln_config.eps\n"," # We don't actually use these, but store them for potential inspection\n","\n"," def forward(self, x):\n"," return torch.tanh(x)\n","\n","# 1. Setup: Model and Tokenizer\n","model_name = \"gpt2\" # Smallest GPT-2 for quicker testing\n","config = GPT2Config.from_pretrained(model_name)\n","tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n","\n","# 2. Load Original GPT-2 Model\n","model_orig = GPT2Model.from_pretrained(model_name, config=config)\n","model_orig.eval() # Set to evaluation mode\n","\n","# 3. 
Create Modified Model (DyT-like)\n","model_dyt = copy.deepcopy(model_orig) # Deep copy to modify independently\n","\n","# Replace LayerNorms in the transformer blocks\n","for i, block in enumerate(model_dyt.h):\n"," # LayerNorm before MultiHeadAttention\n"," block.ln_1 = TanhReplacement(original_ln_config=block.ln_1) # Pass original LN config for potential reference\n"," # LayerNorm before MLP\n"," block.ln_2 = TanhReplacement(original_ln_config=block.ln_2)\n"," # print(f\"Replaced LayerNorms in block {i}\")\n","\n","# Replace final LayerNorm (if present in GPT2Model's main structure, GPT2Model has ln_f)\n","if hasattr(model_dyt, 'ln_f') and isinstance(model_dyt.ln_f, nn.LayerNorm):\n"," model_dyt.ln_f = TanhReplacement(original_ln_config=model_dyt.ln_f)\n"," # print(\"Replaced final LayerNorm (ln_f)\")\n","\n","model_dyt.eval()\n","\n","# --- Sanity check: Print model structures (optional) ---\n","# print(\"Original Model Structure (relevant parts):\")\n","# for i, block in enumerate(model_orig.h):\n","# print(f\"Block {i}: ln_1={block.ln_1}, ln_2={block.ln_2}\")\n","# print(f\"Final ln_f={model_orig.ln_f}\")\n","\n","# print(\"\\nModified Model Structure (relevant parts):\")\n","# for i, block in enumerate(model_dyt.h):\n","# print(f\"Block {i}: ln_1={block.ln_1}, ln_2={block.ln_2}\")\n","# print(f\"Final ln_f={model_dyt.ln_f}\")\n","# --- End Sanity check ---\n","\n","\n","# 4. Prepare Input Data\n","text = \"Remember that William Gibson quote The future is already here, it's just not evenly distributed? Surprise - the future is already here, and it is shockingly distributed. Power to the people. Personally, I love it.\"\n","# Using a moderately long sequence for better timing\n","text = \" \".join([\"test\"] * 100)\n","\n","text = \"\"\"\n","Transformative technologies usually follow a top-down diffusion path: originating in government or military contexts, passing through corporations, and eventually reaching individuals - think electricity, cryptography, computers, flight, the internet, or GPS. This progression feels intuitive, new and powerful technologies are usually scarce, capital-intensive, and their use requires specialized technical expertise in the early stages.\n","\n","So it strikes me as quite unique and remarkable that LLMs display a dramatic reversal of this pattern - they generate disproportionate benefit for regular people, while their impact is a lot more muted and lagging in corporations and governments. ChatGPT is the fastest growing consumer application in history, with 400 million weekly active users who use it for writing, coding, translation, tutoring, summarization, deep research, brainstorming, etc. This isn't a minor upgrade to what existed before, it is a major multiplier to an individual's power level across a broad range of capabilities. And the barrier to use is incredibly low - the models are cheap (free, even), fast, available to anyone on demand behind a url (or even local machine), and they speak anyone's native language, including tone, slang or emoji. This is insane. As far as I can tell, the average person has never experienced a technological unlock this dramatic, this fast.\n","\n","Why then are the benefits a lot more muted in the corporate and government realms? I think the first reason is that LLMs offer a very specific profile of capability - that of merely quasi-expert knowledge/performance, but simultaneously across a very wide variety of domains. In other words, they are simultaneously versatile but also shallow and fallible. 
Meanwhile, an organization's unique superpower is the ability to concentrate diverse expertise into a single entity by employing engineers, researchers, analysts, lawyers, marketers, etc. While LLMs can certainly make these experts more efficient individually (e.g. drafting initial legal clauses, generating boilerplate code, etc.), the improvement to the organization takes the form of becoming a bit better at the things it could already do. In contrast, an individual will usually only be an expert in at most one thing, so the broad quasi-expertise offered by the LLM fundamentally allows them to do things they couldn't do before. People can now vibe code apps. They can approach legal documents. They can grok esoteric research papers. They can do data analytics. They can generate multimodal content for branding and marketing. They can do all of this at an adequate capability without involving an additional expert.\n","\n","Second, organizations deal with problems of a lot greater complexity and necessary coordination, think: various integrations, legacy systems, corporate brand or style guides, stringent security protocols, privacy considerations, internationalization, regulatory compliance and legal risk. There are a lot more variables, a lot more constraints, a lot more considerations, and a lot lower margin for error. It's not so easy to put all of it into a context window. You can't just vibe code something. You might be one disastrous hallucination away from losing your job. And third, there is the well-documented inertia of a larger organization, featuring culture, historical precedents, political turf wars that escalate in periods of rapid change, communication overhead, re-training challenges of a distributed workforce and good old-fashioned bureaucracy. These are major headwinds when it comes to rapid adoption of a sparkling new, versatile-but-shallow-and-fallible tool. I don't wish to downplay the impacts of LLMs in corporations or governments, but at least for the moment and in aggregate across society, they have been significantly more life altering for individuals than they have been for organizations. Mary, Jim and Joes are experiencing the majority of the benefit, not Google or the government of the United States.\n","\n","Looking forward, the continued diffusion of LLMs of course depends on continued performance improvement and its capability profile. The \"benefit distribution\" overall is particularly interesting to chart, and depends heavily on the dynamic range of the performance as a function of capital expenditure. Today, frontier-grade LLM performance is very accessible and cheap. Beyond this point, you cannot spend a marginal dollar to get better performance, reliability or autonomy. Money can't buy better ChatGPT. Bill Gates talks to GPT 4o just like you do. But can this be expected to last? Train-time scaling (increase parameters, data), test-time scaling (increase time) and model ensembles (increase batch) are forces increasing the dynamic range. On the other hand, model distillation (the ability to train disproportionately powerful small models by training to mimic the big model) has been a force decreasing dynamic range. Certainly, the moment money can buy dramatically better ChatGPT, things change. Large organizations get to concentrate their vast resources to buy more intelligence. And within the category of \"individual\" too, the elite may once again split away from the rest of society. 
Their child will be tutored by GPT-8-pro-max-high, yours by GPT-6 mini.\n","\n","But at least at this moment in time, we find ourselves in a unique and unprecedented situation in the history of technology. If you go back through various sci-fi you'll see that very few would have predicted that the AI revolution would feature this progression. It was supposed to be a top secret government megabrain project wielded by the generals, not ChatGPT appearing basically overnight and for free on a device already in everyone's pocket. Remember that William Gibson quote \"The future is already here, it's just not evenly distributed\"? Surprise - the future is already here, and it is shockingly distributed. Power to the people. Personally, I love it.\n","\"\"\"\n","\n","inputs = tokenizer(text, return_tensors=\"pt\")\n","input_ids = inputs[\"input_ids\"]\n","attention_mask = inputs[\"attention_mask\"]\n","\n","# 5. Move models and data to device (GPU if available, else CPU)\n","device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n","print(f\"Using device: {device}\")\n","\n","model_orig.to(device)\n","model_dyt.to(device)\n","input_ids = input_ids.to(device)\n","attention_mask = attention_mask.to(device)\n","\n","# 6. Inference Time Measurement Function\n","def measure_inference_time(model, input_ids_tensor, attention_mask_tensor, num_runs=100, warmup_runs=10):\n"," # Warmup runs\n"," for _ in range(warmup_runs):\n"," with torch.no_grad():\n"," _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)\n","\n"," if device.type == 'cuda':\n"," torch.cuda.synchronize() # Ensure warmup is complete\n","\n"," total_time = 0\n"," start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None\n"," end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None\n","\n"," for i in range(num_runs):\n"," if device.type == 'cuda':\n"," start_events[i].record()\n"," else:\n"," start_time = time.perf_counter()\n","\n"," with torch.no_grad():\n"," _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)\n","\n"," if device.type == 'cuda':\n"," end_events[i].record()\n"," else:\n"," end_time = time.perf_counter()\n"," total_time += (end_time - start_time)\n","\n"," if device.type == 'cuda':\n"," torch.cuda.synchronize() # Wait for all runs to complete\n"," for i in range(num_runs):\n"," total_time += start_events[i].elapsed_time(end_events[i]) / 1000.0 # elapsed_time is in ms\n","\n"," avg_time = total_time / num_runs\n"," return avg_time\n","\n","# 7. 
Run and Compare\n","print(f\"\\nBenchmarking with sequence length: {input_ids.shape[1]}\")\n","num_runs = 200\n","warmup_runs = 20\n","\n","# --- Test outputs to ensure they are different (as expected) ---\n","# with torch.no_grad():\n","# out_orig = model_orig(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state\n","# out_dyt = model_dyt(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state\n","# print(f\"Output norm difference: {torch.norm(out_orig - out_dyt)}\")\n","# assert not torch.allclose(out_orig, out_dyt), \"Outputs should be different!\"\n","# ---\n","\n","avg_time_orig = measure_inference_time(model_orig, input_ids, attention_mask, num_runs, warmup_runs)\n","print(f\"Original GPT-2 avg inference time: {avg_time_orig*1000:.4f} ms\")\n","\n","avg_time_dyt = measure_inference_time(model_dyt, input_ids, attention_mask, num_runs, warmup_runs)\n","print(f\"DyT-like (Tanh) GPT-2 avg inference time: {avg_time_dyt*1000:.4f} ms\")\n","\n","if avg_time_dyt < avg_time_orig:\n"," speedup_percentage = ((avg_time_orig - avg_time_dyt) / avg_time_orig) * 100\n"," print(f\"Speedup with Tanh replacement: {speedup_percentage:.2f}%\")\n","else:\n"," slowdown_percentage = ((avg_time_dyt - avg_time_orig) / avg_time_orig) * 100\n"," print(f\"Slowdown with Tanh replacement: {slowdown_percentage:.2f}% (This is unexpected if Tanh is truly faster)\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":474},"id":"BqnapHTijcaV","executionInfo":{"status":"error","timestamp":1746937609166,"user_tz":-420,"elapsed":1241,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"7da6806f-e1a2-4dba-bc57-4ecb03a9a201"},"execution_count":4,"outputs":[{"output_type":"stream","name":"stderr","text":["Token indices sequence length is longer than the specified maximum sequence length for this model (1171 > 1024). 
Running this sequence through the model will result in indexing errors\n"]},{"output_type":"stream","name":"stdout","text":["Using device: cuda\n","\n","Benchmarking with sequence length: 1171\n"]},{"output_type":"error","ename":"RuntimeError","evalue":"CUDA error: device-side assert triggered\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)","\u001b[0;32m<ipython-input-4-9e39e3659b16>\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[0;31m# ---\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 144\u001b[0;31m \u001b[0mavg_time_orig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmeasure_inference_time\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_orig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_runs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwarmup_runs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 145\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Original GPT-2 avg inference time: {avg_time_orig*1000:.4f} ms\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 146\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m<ipython-input-4-9e39e3659b16>\u001b[0m in \u001b[0;36mmeasure_inference_time\u001b[0;34m(model, input_ids_tensor, attention_mask_tensor, num_runs, warmup_runs)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwarmup_runs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mno_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 99\u001b[0;31m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minput_ids_tensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask_tensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 100\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'cuda'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
1738\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1739\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1740\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1741\u001b[0m \u001b[0;31m# torchrec tests the code consistency with the following code\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1748\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1749\u001b[0m or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1751\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1752\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/transformers/models/gpt2/modeling_gpt2.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 818\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minputs_embeds\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 819\u001b[0m \u001b[0minputs_embeds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwte\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 820\u001b[0;31m \u001b[0mposition_embeds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwpe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 821\u001b[0m \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minputs_embeds\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mposition_embeds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minputs_embeds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 822\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1738\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1739\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1740\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1741\u001b[0m \u001b[0;31m# torchrec tests the code consistency with the following code\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1748\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1749\u001b[0m or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1751\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1752\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/sparse.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 190\u001b[0;31m return F.embedding(\n\u001b[0m\u001b[1;32m 191\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36membedding\u001b[0;34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[0m\n\u001b[1;32m 2549\u001b[0m \u001b[0;31m# remove once script supports set_grad_enabled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2550\u001b[0m \u001b[0m_no_grad_embedding_renorm_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_norm\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mnorm_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2551\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0membedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadding_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscale_grad_by_freq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msparse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2552\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2553\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mRuntimeError\u001b[0m: CUDA error: device-side assert triggered\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n"]}]},{"cell_type":"code","source":["import torch\n","import torch.nn as nn\n","from transformers import GPT2Model, GPT2Config, GPT2Tokenizer\n","import time\n","import copy\n","\n","# 0. Define the Tanh \"Normalization\" Layer\n","class TanhReplacement(nn.Module):\n"," def __init__(self, original_ln_config=None): # original_ln_config is for API compatibility if needed by some architectures\n"," super().__init__()\n"," # Tanh has no learnable parameters or specific configuration like hidden_size or eps\n"," # If the original LayerNorm had learnable parameters, they are now gone.\n"," # The shape of the output will be the same as the input, just like LayerNorm.\n"," if original_ln_config:\n"," self.normalized_shape = original_ln_config.normalized_shape\n"," self.eps = original_ln_config.eps\n"," # We don't actually use these, but store them for potential inspection\n","\n"," def forward(self, x):\n"," return torch.tanh(x)\n","\n","# 1. Setup: Model and Tokenizer\n","model_name = \"gpt2\" # Smallest GPT-2 for quicker testing\n","config = GPT2Config.from_pretrained(model_name)\n","tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n","\n","# 2. Load Original GPT-2 Model\n","model_orig = GPT2Model.from_pretrained(model_name, config=config)\n","model_orig.eval() # Set to evaluation mode\n","\n","# 3. 
Create Modified Model (DyT-like)\n","model_dyt = copy.deepcopy(model_orig) # Deep copy to modify independently\n","\n","# Replace LayerNorms in the transformer blocks\n","for i, block in enumerate(model_dyt.h):\n"," # LayerNorm before MultiHeadAttention\n"," block.ln_1 = TanhReplacement(original_ln_config=block.ln_1) # Pass original LN config for potential reference\n"," # LayerNorm before MLP\n"," block.ln_2 = TanhReplacement(original_ln_config=block.ln_2)\n"," # print(f\"Replaced LayerNorms in block {i}\")\n","\n","# Replace final LayerNorm (if present in GPT2Model's main structure, GPT2Model has ln_f)\n","if hasattr(model_dyt, 'ln_f') and isinstance(model_dyt.ln_f, nn.LayerNorm):\n"," model_dyt.ln_f = TanhReplacement(original_ln_config=model_dyt.ln_f)\n"," # print(\"Replaced final LayerNorm (ln_f)\")\n","\n","model_dyt.eval()\n","\n","# --- Sanity check: Print model structures (optional) ---\n","# print(\"Original Model Structure (relevant parts):\")\n","# for i, block in enumerate(model_orig.h):\n","# print(f\"Block {i}: ln_1={block.ln_1}, ln_2={block.ln_2}\")\n","# print(f\"Final ln_f={model_orig.ln_f}\")\n","\n","# print(\"\\nModified Model Structure (relevant parts):\")\n","# for i, block in enumerate(model_dyt.h):\n","# print(f\"Block {i}: ln_1={block.ln_1}, ln_2={block.ln_2}\")\n","# print(f\"Final ln_f={model_dyt.ln_f}\")\n","# --- End Sanity check ---\n","\n","\n","# 4. Prepare Input Data\n","text = \"Replace this with your desired input text for benchmarking. Longer sequences might show more difference.\"\n","# Using a moderately long sequence for better timing\n","text = \" \".join([\"test\"] * 100)\n","inputs = tokenizer(text, return_tensors=\"pt\")\n","input_ids = inputs[\"input_ids\"]\n","attention_mask = inputs[\"attention_mask\"]\n","\n","# 5. Move models and data to device (GPU if available, else CPU)\n","device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n","print(f\"Using device: {device}\")\n","\n","model_orig.to(device)\n","model_dyt.to(device)\n","input_ids = input_ids.to(device)\n","attention_mask = attention_mask.to(device)\n","\n","# 6. Inference Time Measurement Function\n","def measure_inference_time(model, input_ids_tensor, attention_mask_tensor, num_runs=100, warmup_runs=10):\n"," # Warmup runs\n"," for _ in range(warmup_runs):\n"," with torch.no_grad():\n"," _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)\n","\n"," if device.type == 'cuda':\n"," torch.cuda.synchronize() # Ensure warmup is complete\n","\n"," total_time = 0\n"," start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None\n"," end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None\n","\n"," for i in range(num_runs):\n"," if device.type == 'cuda':\n"," start_events[i].record()\n"," else:\n"," start_time = time.perf_counter()\n","\n"," with torch.no_grad():\n"," _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)\n","\n"," if device.type == 'cuda':\n"," end_events[i].record()\n"," else:\n"," end_time = time.perf_counter()\n"," total_time += (end_time - start_time)\n","\n"," if device.type == 'cuda':\n"," torch.cuda.synchronize() # Wait for all runs to complete\n"," for i in range(num_runs):\n"," total_time += start_events[i].elapsed_time(end_events[i]) / 1000.0 # elapsed_time is in ms\n","\n"," avg_time = total_time / num_runs\n"," return avg_time\n","\n","# 7. 
Run and Compare\n","print(f\"\\nBenchmarking with sequence length: {input_ids.shape[1]}\")\n","num_runs = 200\n","warmup_runs = 20\n","\n","# --- Test outputs to ensure they are different (as expected) ---\n","# with torch.no_grad():\n","# out_orig = model_orig(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state\n","# out_dyt = model_dyt(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state\n","# print(f\"Output norm difference: {torch.norm(out_orig - out_dyt)}\")\n","# assert not torch.allclose(out_orig, out_dyt), \"Outputs should be different!\"\n","# ---\n","\n","avg_time_orig = measure_inference_time(model_orig, input_ids, attention_mask, num_runs, warmup_runs)\n","print(f\"Original GPT-2 avg inference time: {avg_time_orig*1000:.4f} ms\")\n","\n","avg_time_dyt = measure_inference_time(model_dyt, input_ids, attention_mask, num_runs, warmup_runs)\n","print(f\"DyT-like (Tanh) GPT-2 avg inference time: {avg_time_dyt*1000:.4f} ms\")\n","\n","if avg_time_dyt < avg_time_orig:\n"," speedup_percentage = ((avg_time_orig - avg_time_dyt) / avg_time_orig) * 100\n"," print(f\"Speedup with Tanh replacement: {speedup_percentage:.2f}%\")\n","else:\n"," slowdown_percentage = ((avg_time_dyt - avg_time_orig) / avg_time_orig) * 100\n"," print(f\"Slowdown with Tanh replacement: {slowdown_percentage:.2f}% (This is unexpected if Tanh is truly faster)\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"IiIYSDjYjzhn","executionInfo":{"status":"ok","timestamp":1746937583133,"user_tz":-420,"elapsed":6848,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"04721b36-199e-47d3-b414-080b24b81c4c"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n","\n","Benchmarking with sequence length: 100\n","Original GPT-2 avg inference time: 13.5885 ms\n","DyT-like (Tanh) GPT-2 avg inference time: 12.9903 ms\n","Speedup with Tanh replacement: 4.40%\n"]}]},{"cell_type":"code","source":["import torch\n","import torch.nn as nn\n","from transformers import GPT2Model, GPT2Config, GPT2Tokenizer\n","import time\n","import copy\n","import os\n","\n","# IMPORTANT FOR DEBUGGING CUDA ERRORS:\n","# This makes CUDA kernel launches synchronous. If a kernel errors,\n","# the Python stack trace will now point to the exact line that launched the faulty kernel.\n","os.environ['CUDA_LAUNCH_BLOCKING'] = '1'\n","\n","# 0. Define the Tanh \"Normalization\" Layer\n","class TanhReplacement(nn.Module):\n"," def __init__(self, original_ln_config=None): # original_ln_config is for API compatibility if needed\n"," super().__init__()\n"," # Store original config for potential inspection, though tanh doesn't use it\n"," if original_ln_config:\n"," self.normalized_shape = original_ln_config.normalized_shape\n"," self.eps = original_ln_config.eps\n"," # Tanh has no learnable parameters or specific configuration like hidden_size or eps\n"," # If the original LayerNorm had learnable parameters (gamma, beta), they are now gone.\n","\n"," def forward(self, x):\n"," # --- Debugging: Check for NaNs/Infs in input to this specific Tanh layer ---\n"," # if torch.isnan(x).any() or torch.isinf(x).any():\n"," # print(f\"WARNING: NaN or Inf detected in input to TanhReplacement! 
Min: {x.min().item()}, Max: {x.max().item()}\")\n","\n"," output = torch.tanh(x)\n","\n"," # --- Debugging: Check for NaNs/Infs in output of this Tanh layer ---\n"," # (tanh(nan) -> nan; tanh(inf) -> 1.0; tanh(-inf) -> -1.0)\n"," # if torch.isnan(output).any():\n"," # print(f\"WARNING: NaN detected in TanhReplacement output! Input was likely NaN.\")\n"," # if torch.isinf(output).any(): # Should not happen for tanh if input is not inf\n"," # print(f\"WARNING: Inf detected in TanhReplacement output! This is unexpected for tanh.\")\n"," return output\n","\n","# Function to recursively replace LayerNorm modules\n","def replace_layernorm_with_tanh_recursive(module):\n"," has_replaced = False\n"," for name, child_module in module.named_children():\n"," if isinstance(child_module, nn.LayerNorm):\n"," # print(f\"Replacing LayerNorm '{name}' in {module.__class__.__name__} with TanhReplacement\")\n"," setattr(module, name, TanhReplacement(original_ln_config=child_module))\n"," has_replaced = True\n"," else:\n"," has_replaced = replace_layernorm_with_tanh_recursive(child_module) or has_replaced # Recurse\n"," return has_replaced\n","\n","# 1. Setup: Model and Tokenizer\n","model_name = \"gpt2\" # Smallest GPT-2 for quicker testing\n","config = GPT2Config.from_pretrained(model_name)\n","tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n","if tokenizer.pad_token is None: # GPT-2 often doesn't have a pad token; add if missing for batching\n"," tokenizer.pad_token = tokenizer.eos_token\n","\n","\n","# 2. Load Original GPT-2 Model\n","model_orig = GPT2Model.from_pretrained(model_name, config=config)\n","model_orig.eval() # Set to evaluation mode\n","\n","# 3. Create Modified Model (DyT-like)\n","model_dyt = copy.deepcopy(model_orig) # Deep copy to modify independently\n","replaced_in_dyt = replace_layernorm_with_tanh_recursive(model_dyt)\n","# print(f\"LayerNorms replaced in DyT model: {replaced_in_dyt}\")\n","model_dyt.eval()\n","\n","\n","# --- Sanity check: Print model structures (optional) ---\n","# print(\"Original Model Structure (example LayerNorm):\")\n","# if len(model_orig.h) > 0:\n","# print(f\"Block 0 ln_1: {model_orig.h[0].ln_1}\")\n","# if hasattr(model_orig, 'ln_f'):\n","# print(f\"Final ln_f: {model_orig.ln_f}\")\n","\n","# print(\"\\nModified Model Structure (example TanhReplacement):\")\n","# if len(model_dyt.h) > 0:\n","# print(f\"Block 0 ln_1: {model_dyt.h[0].ln_1}\")\n","# if hasattr(model_dyt, 'ln_f'):\n","# print(f\"Final ln_f: {model_dyt.ln_f}\")\n","# --- End Sanity check ---\n","\n","\n","# 4. Prepare Input Data\n","# text = \"Replace this with your desired input text for benchmarking. Longer sequences might show more difference.\"\n","text = \" \".join([\"test\"] * 50) # Using a shorter sequence for faster debugging if issues persist\n","inputs = tokenizer(text, return_tensors=\"pt\", padding=True, truncation=True, max_length=128)\n","input_ids = inputs[\"input_ids\"]\n","attention_mask = inputs[\"attention_mask\"]\n","\n","# 5. Move models and data to device (GPU if available, else CPU)\n","device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n","print(f\"Using device: {device}\")\n","\n","model_orig.to(device)\n","model_dyt.to(device)\n","input_ids = input_ids.to(device)\n","attention_mask = attention_mask.to(device)\n","\n","# 6. 
Inference Time Measurement Function\n","def measure_inference_time(model, model_desc, input_ids_tensor, attention_mask_tensor, num_runs=100, warmup_runs=10):\n"," # Warmup runs\n"," for _ in range(warmup_runs):\n"," with torch.no_grad():\n"," _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)\n","\n"," if device.type == 'cuda':\n"," torch.cuda.synchronize() # Ensure warmup is complete\n","\n"," total_time = 0\n"," start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None\n"," end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_runs)] if device.type == 'cuda' else None\n","\n"," for i in range(num_runs):\n"," if device.type == 'cuda':\n"," start_events[i].record()\n"," else:\n"," start_time = time.perf_counter()\n","\n"," with torch.no_grad():\n"," _ = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)\n","\n"," if device.type == 'cuda':\n"," end_events[i].record()\n"," else:\n"," end_time = time.perf_counter()\n"," total_time += (end_time - start_time)\n","\n"," if device.type == 'cuda':\n"," torch.cuda.synchronize() # Wait for all runs to complete\n"," for i in range(num_runs):\n"," total_time += start_events[i].elapsed_time(end_events[i]) / 1000.0 # elapsed_time is in ms\n","\n"," avg_time = total_time / num_runs\n"," return avg_time\n","\n","# 7. Run and Compare\n","print(f\"\\nBenchmarking with sequence length: {input_ids.shape[1]}\")\n","num_runs = 50 # Reduced for potentially faster debugging cycles, increase for stable benchmarks\n","warmup_runs = 5\n","\n","avg_time_orig = float('nan')\n","avg_time_dyt = float('nan')\n","\n","try:\n"," avg_time_orig = measure_inference_time(model_orig, \"Original GPT-2\", input_ids, attention_mask, num_runs, warmup_runs)\n"," print(f\"Original GPT-2 avg inference time: {avg_time_orig*1000:.4f} ms\")\n","except Exception as e:\n"," print(f\"Error benchmarking original model: {e}\")\n"," # This would be very unexpected\n","\n","try:\n"," avg_time_dyt = measure_inference_time(model_dyt, \"DyT-like (Tanh) GPT-2\", input_ids, attention_mask, num_runs, warmup_runs)\n"," print(f\"DyT-like (Tanh) GPT-2 avg inference time: {avg_time_dyt*1000:.4f} ms\")\n","except RuntimeError as e:\n"," print(f\"\\nERROR during benchmarking of DyT-like model: {e}\")\n"," if \"CUDA error: device-side assert triggered\" in str(e):\n"," print(\"\\n\" + \"=\"*50)\n"," print(\"A 'device-side assert' was triggered in the DyT-like model.\")\n"," print(\"This LIKELY means that replacing LayerNorm with tanh in the pre-trained GPT-2\")\n"," print(\"caused numerical instability (e.g., NaNs or Infs), which then made a GPU kernel fail.\")\n"," print(\"The Python stack trace (above, with CUDA_LAUNCH_BLOCKING=1) should now pinpoint\")\n"," print(\"the exact operation in the Hugging Face code where the problem occurred.\")\n"," print(\"Common culprits are operations like softmax in attention if inputs become too large/NaN.\")\n"," print(\"This experiment highlights that LayerNorm is critical for the stability of pre-trained models;\")\n"," print(\"simply swapping it out without retraining is generally not viable for correct model output.\")\n"," print(\"The DyT paper's hypothesis is about training models with tanh units from scratch.\")\n"," print(\"=\"*50 + \"\\n\")\n"," # No further comparison if DyT model failed\n"," avg_time_dyt = float('nan') # Ensure it's NaN if it failed\n","\n","if not (torch.isnan(torch.tensor(avg_time_orig)) or 
torch.isnan(torch.tensor(avg_time_dyt))):\n"," if avg_time_dyt < avg_time_orig:\n"," speedup_percentage = ((avg_time_orig - avg_time_dyt) / avg_time_orig) * 100\n"," print(f\"Speedup with Tanh replacement: {speedup_percentage:.2f}%\")\n"," else:\n"," slowdown_percentage = ((avg_time_dyt - avg_time_orig) / avg_time_orig) * 100\n"," print(f\"Slowdown with Tanh replacement: {slowdown_percentage:.2f}% (This is unexpected if Tanh is truly faster and model runs)\")\n","else:\n"," print(\"\\nCould not complete the performance comparison due to errors or incomplete runs.\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ELbp7_sUk6no","executionInfo":{"status":"ok","timestamp":1746937556758,"user_tz":-420,"elapsed":10717,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"2f46402a-3965-45e1-f03d-70f99fa6d1b8"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Using device: cuda\n","\n","Benchmarking with sequence length: 50\n","Original GPT-2 avg inference time: 12.8963 ms\n","DyT-like (Tanh) GPT-2 avg inference time: 12.3657 ms\n","Speedup with Tanh replacement: 4.11%\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"Ngv4Pof_lvmb"},"execution_count":null,"outputs":[]}]}
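The token_analysis.ipynb payload above is stored as a single JSON line, which makes the experiment hard to follow in the diff. The core idea it benchmarks — swapping every LayerNorm in a pre-trained GPT-2 for a plain tanh, a DyT-style probe — is condensed below purely for readability. It mirrors the notebook's own TanhReplacement class and recursive replacement helper and is not an additional change in this commit.

```python
# Condensed from token_analysis.ipynb (readability sketch only, not a new file):
# replace every nn.LayerNorm in a pre-trained GPT-2 with a parameter-free tanh.
# As the notebook itself notes, this breaks output quality without retraining;
# it is only used to compare inference timings against the original model.
import copy
import torch
import torch.nn as nn
from transformers import GPT2Model

class TanhReplacement(nn.Module):
    """Drop-in stand-in for LayerNorm with no learnable parameters."""
    def forward(self, x):
        return torch.tanh(x)

def replace_layernorm_with_tanh(module: nn.Module) -> None:
    # Walk the module tree and swap each LayerNorm child in place.
    for name, child in module.named_children():
        if isinstance(child, nn.LayerNorm):
            setattr(module, name, TanhReplacement())
        else:
            replace_layernorm_with_tanh(child)

model_orig = GPT2Model.from_pretrained("gpt2").eval()
model_dyt = copy.deepcopy(model_orig)
replace_layernorm_with_tanh(model_dyt)  # model_dyt now applies tanh wherever LayerNorm was
```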
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074
tokenizer_config.json
ADDED
The diff for this file is too large to render.
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5d296cf30a9e7cb3384ddd27e8763c4e151efdde9b88e43ce72401e524c693c
+size 5816
vlmu_dialog_v1.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5123680a1f7f8c71249bd18fe8eb0326278c9544f1f168952b12fdba31d25dc4
+size 26528
vlmu_drop_v1.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5a4c8afa5d0aa0a0a0cafe0ae79cbc3843bd9cb59e193c650c2fdb8b235a867
+size 867221
vlmu_mqa_v1.5.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:677e78e4270777c4649b717184d9bca5bee79147462333f5c4e905311abff6ef
+size 869284
vlmu_squad_v1.zip
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:def8851fe51d3c26070105534a073a6fadf6b26050ac4fb99caeaa48e897dd11
+size 1872864
vmlu.ipynb
ADDED
@@ -0,0 +1 @@
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"mount_file_id":"12GgxlLzeKU8ur3RBFou3725TiWqNIu5t","authorship_tag":"ABX9TyOI2W/0hXXZ371SvFKil863"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"TGLkbIr6TCiD","executionInfo":{"status":"ok","timestamp":1753065902565,"user_tz":-420,"elapsed":31447,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"a7633214-9b3c-4b41-f316-fea8a98cf9c7"},"outputs":[{"output_type":"stream","name":"stdout","text":["Hello Li Lei! \n","This is one of humanity’s oldest and most personal questions, so I’ll share a few perspectives and then invite you to decide which resonates with you.\n","\n","1. Biological lens \n"," From an evolutionary standpoint, the “purpose” encoded in every living organism is simply to survive, reproduce, and pass on its genes. Life persists because it is good at persisting.\n","\n","2. Philosophical lens \n"," Thinkers across cultures have argued that meaning is not handed to us by nature; we must create it ourselves. For Sartre, existence precedes essence—we are free to define our own essence through choices. For Confucian traditions, meaning arises from cultivating virtue (仁, rén) and harmonious relationships within family and society.\n","\n","3. Psychological lens \n"," Research in positive psychology suggests that people feel life is meaningful when they experience three things: \n"," - Purpose: having goals that feel worthwhile. \n"," - Coherence: understanding how life fits together. \n"," - Significance: feeling that one’s existence matters to others.\n","\n","4. Personal lens (a suggestion) \n"," Instead of hunting for a single cosmic answer, try treating meaning as an ongoing craft project. Ask: \n"," - What relationships do I want to deepen? \n"," - What skills or values do I want to cultivate? \n"," - What small acts today could make someone else’s life better?\n","\n","If you experiment with those questions regularly, you may discover that meaning is less a hidden treasure and more a garden you keep tending.\n","\n","What kind of activities or relationships make you feel most alive, Li Lei?\n"]}],"source":["from openai import OpenAI\n","from google.colab import userdata\n","\n","\n","client = OpenAI(\n"," api_key=userdata.get('MOONSHOT_API_KEY'), # Replace MOONSHOT_API_KEY with the API Key you obtained from the Kimi Open Platform\n"," base_url=\"https://api.moonshot.ai/v1\",\n",")\n","\n","completion = client.chat.completions.create(\n"," # model = \"moonshot-v1-8k\",\n"," model = \"kimi-k2-0711-preview\",\n"," messages = [\n"," {\"role\": \"system\", \"content\": \"You are Kimi, an AI assistant provided by Moonshot AI. You are proficient in Vietnamese and English conversations. You provide users with safe, helpful, and accurate answers. You will reject any requests involving terrorism, racism, or explicit content. Moonshot AI is a proper noun and should not be translated.\"},\n"," {\"role\": \"user\", \"content\": \"Hello, my name is Li Lei. 
What is the meaning of life?\"}\n"," ],\n"," temperature = 0.3,\n",")\n","\n","# We receive a response from the Kimi large language model via the API (role=assistant)\n","print(completion.choices[0].message.content)"]},{"cell_type":"code","source":["%%writefile vmlu_kimi.py\n","import os\n","import json\n","import tqdm\n","import pandas as pd\n","import time\n","import re\n","from openai import OpenAI\n","import random\n","from typing import Optional, Dict, Any\n","from google.colab import userdata\n","\n","class KimiClient:\n"," def __init__(self, api_key: str, base_url: str = \"https://api.moonshot.ai/v1\"):\n"," self.client = OpenAI(\n"," api_key=api_key,\n"," base_url=base_url,\n"," )\n"," self.model = \"kimi-k2-0711-preview\"\n","\n"," def chat_completion_with_retry(\n"," self,\n"," messages: list,\n"," temperature: float = 0,\n"," max_retries: int = 5,\n"," base_delay: float = 1.0\n"," ) -> Optional[str]:\n"," \"\"\"\n"," Chat completion with exponential backoff retry for rate limits\n"," \"\"\"\n"," for attempt in range(max_retries):\n"," try:\n"," response = self.client.chat.completions.create(\n"," model=self.model,\n"," messages=messages,\n"," temperature=temperature,\n"," )\n"," return response.choices[0].message.content\n","\n"," except Exception as e:\n"," error_str = str(e).lower()\n","\n"," # Rate limit handling\n"," if \"rate_limit\" in error_str or \"rate limit\" in error_str:\n"," # Extract wait time from error message if available\n"," wait_time = self._extract_wait_time(str(e))\n"," if wait_time:\n"," print(f\"Rate limit hit. Waiting {wait_time}s (from error message)\")\n"," time.sleep(wait_time)\n"," else:\n"," # Exponential backoff with jitter\n"," delay = base_delay * (2 ** attempt) + random.uniform(0, 1)\n"," print(f\"Rate limit hit. Retry {attempt + 1}/{max_retries}. Waiting {delay:.2f}s\")\n"," time.sleep(delay)\n"," continue\n","\n"," # Other API errors\n"," elif \"timeout\" in error_str or \"connection\" in error_str:\n"," delay = base_delay * (2 ** attempt)\n"," print(f\"Connection issue. Retry {attempt + 1}/{max_retries}. Waiting {delay:.2f}s\")\n"," time.sleep(delay)\n"," continue\n","\n"," # Unknown errors\n"," else:\n"," print(f\"Unexpected error: {e}\")\n"," if attempt < max_retries - 1:\n"," delay = base_delay * (2 ** attempt)\n"," print(f\"Retry {attempt + 1}/{max_retries}. 
Waiting {delay:.2f}s\")\n"," time.sleep(delay)\n"," continue\n"," else:\n"," print(f\"Max retries reached for unknown error\")\n"," return None\n","\n"," print(f\"Failed after {max_retries} attempts\")\n"," return None\n","\n"," def _extract_wait_time(self, error_message: str) -> Optional[float]:\n"," \"\"\"Extract wait time from rate limit error message\"\"\"\n"," # Common patterns for rate limit messages\n"," patterns = [\n"," r'try again in (\\d+\\.?\\d*) seconds',\n"," r'retry after (\\d+\\.?\\d*) seconds',\n"," r'wait (\\d+\\.?\\d*) seconds'\n"," ]\n","\n"," for pattern in patterns:\n"," match = re.search(pattern, error_message.lower())\n"," if match:\n"," return float(match.group(1))\n"," return None\n","\n","def process_vmlu_with_kimi(api_key: str, data_path: str = 'test.jsonl'):\n"," \"\"\"Process VMLU dataset with Kimi K2 model\"\"\"\n","\n"," # Initialize Kimi client\n"," kimi = KimiClient(api_key)\n","\n"," # Load data\n"," data = []\n"," with open(data_path, 'r', encoding='utf-8') as f:\n"," lines = f.readlines()\n"," for line in lines:\n"," data.append(json.loads(line))\n","\n"," print(f\"Loaded {len(data)} questions\")\n","\n"," # Process questions\n"," all_res = []\n"," failed_ids = []\n","\n"," for idx, doc in enumerate(tqdm.tqdm(data)):\n"," text_choice = '\\n'.join(doc['choices'])\n"," prompt = (\"Chỉ đưa ra chữ cái đứng trước câu trả lời đúng (A, B, C, D hoặc E) \"\n"," \"của câu hỏi trắc nghiệm sau: \\n\"\n"," + doc[\"question\"] + \"\\n\\n\" + text_choice + \"\\n\" + \"Đáp án: \")\n","\n"," messages = [\n"," {\n"," \"role\": \"system\",\n"," \"content\": \"You are Kimi, an AI assistant. Provide only the letter (A, B, C, D, or E) that corresponds to the correct answer for Vietnamese multiple choice questions.\"\n"," },\n"," {\n"," \"role\": \"user\",\n"," \"content\": prompt\n"," }\n"," ]\n","\n"," # Get response with retry logic\n"," response_str = kimi.chat_completion_with_retry(messages, temperature=0)\n","\n"," if response_str is None:\n"," print(f\"Failed to get response for question {doc['id']}\")\n"," failed_ids.append(doc['id'])\n"," response_str = \"\"\n","\n"," all_res.append({\n"," \"id\": doc['id'],\n"," \"prompt\": prompt,\n"," \"question\": doc[\"question\"],\n"," \"answer\": response_str\n"," })\n","\n"," # Save progress every 100 questions\n"," if idx % 100 == 0 and idx > 0:\n"," result_folder = \"all_res/kimi_result\"\n"," os.makedirs(result_folder, exist_ok=True)\n"," pd.DataFrame(all_res).to_csv(\n"," f\"{result_folder}/raw_result_{len(all_res)}.csv\",\n"," index=False,\n"," encoding='utf-8'\n"," )\n"," print(f\"Progress saved: {len(all_res)} questions processed\")\n","\n"," # Rate limiting: small delay between requests\n"," time.sleep(0.1)\n","\n"," # Final processing\n"," df = pd.DataFrame(all_res)\n","\n"," # Clean answers - extract first letter and ensure it's A-E\n"," def clean_answer(answer_str):\n"," if not answer_str:\n"," return \"\"\n","\n"," # Extract first character that's A-E\n"," cleaned = re.sub(r'[^ABCDEabcde]', '', str(answer_str))\n"," if cleaned:\n"," return cleaned[0].upper()\n","\n"," # Fallback: try to find A-E in the original string\n"," for char in str(answer_str).upper():\n"," if char in 'ABCDE':\n"," return char\n","\n"," return \"\"\n","\n"," df['answer'] = df['answer'].apply(clean_answer)\n","\n"," # Save final results\n"," result_folder = \"all_res/kimi_result\"\n"," os.makedirs(result_folder, exist_ok=True)\n","\n"," # Save raw results\n"," df.to_csv(f\"{result_folder}/final_raw_result.csv\", index=False, 
encoding='utf-8')\n","\n"," # Create submission file\n"," submission_df = df[['id', 'answer']].copy()\n"," submission_df.to_csv('kimi_submission_k2.csv', index=False)\n","\n"," # Print statistics\n"," total_questions = len(data)\n"," answered_questions = len(df[df['answer'] != ''])\n"," valid_answers = len(df[df['answer'].isin(['A', 'B', 'C', 'D', 'E'])])\n","\n"," print(f\"\\n=== Results Summary ===\")\n"," print(f\"Total questions: {total_questions}\")\n"," print(f\"Answered questions: {answered_questions}\")\n"," print(f\"Valid answers (A-E): {valid_answers}\")\n"," print(f\"Success rate: {valid_answers/total_questions*100:.2f}%\")\n","\n"," if failed_ids:\n"," print(f\"Failed question IDs: {failed_ids[:10]}{'...' if len(failed_ids) > 10 else ''}\")\n","\n"," return df\n","\n","if __name__ == \"__main__\":\n"," # Get API key from environment\n"," api_key = \"sk-VZFI2IgonKwgzpKsASVeUDoEUfq0FUwUKDNNjk0JWoF0SOiQ\"\n"," # api_key = userdata.get('MOONSHOT_API_KEY')\n"," if not api_key:\n"," raise ValueError(\"Please set MOONSHOT_API_KEY environment variable\")\n","\n"," # Process VMLU dataset\n"," results_df = process_vmlu_with_kimi(api_key)\n"," print(\"VMLU processing completed!\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Pe9UAV6nTc_D","executionInfo":{"status":"ok","timestamp":1753065906779,"user_tz":-420,"elapsed":15,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"a6c28d6b-1fce-4e05-893c-deae16d6d2bd"},"execution_count":5,"outputs":[{"output_type":"stream","name":"stdout","text":["Writing vmlu_kimi.py\n"]}]},{"cell_type":"code","source":["!cp /content/drive/MyDrive/2025/llm/vlmu_mqa_v1.5.zip ./"],"metadata":{"id":"RxCZRMLRV445","executionInfo":{"status":"ok","timestamp":1753065909241,"user_tz":-420,"elapsed":412,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}}},"execution_count":6,"outputs":[]},{"cell_type":"code","source":["!unzip -q vlmu_mqa_v1.5.zip"],"metadata":{"id":"XYeg5CVLWHZB","executionInfo":{"status":"ok","timestamp":1753065909424,"user_tz":-420,"elapsed":105,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}}},"execution_count":7,"outputs":[]},{"cell_type":"code","source":["!python vmlu_kimi.py"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"4_0fCASbWJC_","executionInfo":{"status":"ok","timestamp":1753078194019,"user_tz":-420,"elapsed":4471511,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"59500cf8-49fc-4782-9f7a-1838c8824693"},"execution_count":8,"outputs":[{"output_type":"stream","name":"stdout","text":["Loaded 9833 questions\n"," 1% 100/9833 [02:33<3:09:40, 1.17s/it]Progress saved: 101 questions processed\n"," 2% 200/9833 [04:54<3:19:56, 1.25s/it]Progress saved: 201 questions processed\n"," 3% 300/9833 [07:05<3:20:14, 1.26s/it]Progress saved: 301 questions processed\n"," 4% 400/9833 [09:30<3:09:14, 1.20s/it]Progress saved: 401 questions processed\n"," 5% 500/9833 [11:51<5:04:13, 1.96s/it]Progress saved: 501 questions processed\n"," 6% 600/9833 [14:26<4:43:51, 1.84s/it]Progress saved: 601 questions processed\n"," 7% 700/9833 [16:43<3:31:54, 1.39s/it]Progress saved: 701 questions processed\n"," 8% 800/9833 [19:05<3:00:43, 1.20s/it]Progress saved: 801 questions processed\n"," 9% 900/9833 [21:08<2:56:15, 1.18s/it]Progress saved: 901 questions processed\n"," 10% 1000/9833 [23:19<2:57:40, 1.21s/it]Progress saved: 1001 questions processed\n"," 11% 1100/9833 [25:47<3:11:10, 1.31s/it]Progress saved: 1101 questions processed\n"," 12% 
1200/9833 [27:54<3:46:54, 1.58s/it]Progress saved: 1201 questions processed\n"," 13% 1300/9833 [29:58<2:52:19, 1.21s/it]Progress saved: 1301 questions processed\n"," 14% 1400/9833 [32:05<2:57:24, 1.26s/it]Progress saved: 1401 questions processed\n"," 15% 1500/9833 [34:21<2:51:51, 1.24s/it]Progress saved: 1501 questions processed\n"," 16% 1600/9833 [36:37<4:23:43, 1.92s/it]Progress saved: 1601 questions processed\n"," 17% 1700/9833 [39:06<3:17:19, 1.46s/it]Progress saved: 1701 questions processed\n"," 18% 1800/9833 [41:14<2:46:59, 1.25s/it]Progress saved: 1801 questions processed\n"," 19% 1900/9833 [43:35<2:39:28, 1.21s/it]Progress saved: 1901 questions processed\n"," 20% 2000/9833 [45:45<2:44:22, 1.26s/it]Progress saved: 2001 questions processed\n"," 21% 2100/9833 [48:08<2:51:33, 1.33s/it]Progress saved: 2101 questions processed\n"," 22% 2200/9833 [50:30<2:49:13, 1.33s/it]Progress saved: 2201 questions processed\n"," 23% 2300/9833 [52:50<3:07:03, 1.49s/it]Progress saved: 2301 questions processed\n"," 24% 2400/9833 [55:12<2:34:53, 1.25s/it]Progress saved: 2401 questions processed\n"," 25% 2500/9833 [57:32<3:00:22, 1.48s/it]Progress saved: 2501 questions processed\n"," 26% 2600/9833 [59:48<3:27:09, 1.72s/it]Progress saved: 2601 questions processed\n"," 27% 2700/9833 [1:01:59<2:45:05, 1.39s/it]Progress saved: 2701 questions processed\n"," 28% 2800/9833 [1:04:09<2:48:18, 1.44s/it]Progress saved: 2801 questions processed\n"," 29% 2900/9833 [1:06:23<2:31:44, 1.31s/it]Progress saved: 2901 questions processed\n"," 31% 3000/9833 [1:08:34<2:25:09, 1.27s/it]Progress saved: 3001 questions processed\n"," 32% 3100/9833 [1:10:44<2:32:37, 1.36s/it]Progress saved: 3101 questions processed\n"," 33% 3200/9833 [1:12:53<2:41:59, 1.47s/it]Progress saved: 3201 questions processed\n"," 34% 3300/9833 [1:15:01<2:10:51, 1.20s/it]Progress saved: 3301 questions processed\n"," 35% 3400/9833 [1:17:09<2:08:32, 1.20s/it]Progress saved: 3401 questions processed\n"," 36% 3500/9833 [1:19:15<2:17:38, 1.30s/it]Progress saved: 3501 questions processed\n"," 37% 3600/9833 [1:21:22<2:01:05, 1.17s/it]Progress saved: 3601 questions processed\n"," 38% 3700/9833 [1:23:22<1:56:58, 1.14s/it]Progress saved: 3701 questions processed\n"," 39% 3800/9833 [1:25:21<2:04:11, 1.24s/it]Progress saved: 3801 questions processed\n"," 40% 3900/9833 [1:27:24<2:01:42, 1.23s/it]Progress saved: 3901 questions processed\n"," 41% 4000/9833 [1:29:27<2:01:59, 1.25s/it]Progress saved: 4001 questions processed\n"," 42% 4100/9833 [1:31:34<1:52:50, 1.18s/it]Progress saved: 4101 questions processed\n"," 43% 4200/9833 [1:33:33<1:53:58, 1.21s/it]Progress saved: 4201 questions processed\n"," 44% 4300/9833 [1:35:33<1:46:58, 1.16s/it]Progress saved: 4301 questions processed\n"," 45% 4400/9833 [1:37:31<1:49:11, 1.21s/it]Progress saved: 4401 questions processed\n"," 46% 4500/9833 [1:39:28<1:41:19, 1.14s/it]Progress saved: 4501 questions processed\n"," 47% 4600/9833 [1:41:26<1:44:24, 1.20s/it]Progress saved: 4601 questions processed\n"," 48% 4700/9833 [1:43:23<1:47:47, 1.26s/it]Progress saved: 4701 questions processed\n"," 49% 4800/9833 [1:45:22<1:35:46, 1.14s/it]Progress saved: 4801 questions processed\n"," 50% 4900/9833 [1:47:22<1:47:53, 1.31s/it]Progress saved: 4901 questions processed\n"," 51% 5000/9833 [1:49:23<1:36:55, 1.20s/it]Progress saved: 5001 questions processed\n"," 52% 5100/9833 [1:51:23<1:37:03, 1.23s/it]Progress saved: 5101 questions processed\n"," 53% 5200/9833 [1:53:19<1:34:56, 1.23s/it]Progress saved: 5201 questions processed\n"," 54% 5300/9833 
[1:55:15<1:26:26, 1.14s/it]Progress saved: 5301 questions processed\n"," 55% 5400/9833 [1:57:09<1:21:15, 1.10s/it]Progress saved: 5401 questions processed\n"," 56% 5500/9833 [1:59:04<1:19:48, 1.11s/it]Progress saved: 5501 questions processed\n"," 57% 5600/9833 [2:00:57<1:19:08, 1.12s/it]Progress saved: 5601 questions processed\n"," 58% 5700/9833 [2:02:50<1:14:58, 1.09s/it]Progress saved: 5701 questions processed\n"," 59% 5800/9833 [2:04:41<1:12:58, 1.09s/it]Progress saved: 5801 questions processed\n"," 60% 5900/9833 [2:06:30<1:08:50, 1.05s/it]Progress saved: 5901 questions processed\n"," 61% 6000/9833 [2:08:21<1:09:27, 1.09s/it]Progress saved: 6001 questions processed\n"," 62% 6100/9833 [2:10:14<1:08:45, 1.11s/it]Progress saved: 6101 questions processed\n"," 63% 6200/9833 [2:12:04<1:04:11, 1.06s/it]Progress saved: 6201 questions processed\n"," 64% 6300/9833 [2:13:56<1:07:21, 1.14s/it]Progress saved: 6301 questions processed\n"," 65% 6400/9833 [2:15:54<1:06:56, 1.17s/it]Progress saved: 6401 questions processed\n"," 66% 6500/9833 [2:17:50<1:06:23, 1.20s/it]Progress saved: 6501 questions processed\n"," 67% 6600/9833 [2:19:44<1:01:03, 1.13s/it]Progress saved: 6601 questions processed\n"," 68% 6700/9833 [2:21:38<1:01:32, 1.18s/it]Progress saved: 6701 questions processed\n"," 69% 6800/9833 [2:23:38<1:01:37, 1.22s/it]Progress saved: 6801 questions processed\n"," 70% 6900/9833 [2:25:37<57:01, 1.17s/it]Progress saved: 6901 questions processed\n"," 71% 7000/9833 [2:27:34<56:24, 1.19s/it]Progress saved: 7001 questions processed\n"," 72% 7100/9833 [2:29:33<51:23, 1.13s/it]Progress saved: 7101 questions processed\n"," 73% 7200/9833 [2:31:29<49:43, 1.13s/it]Progress saved: 7201 questions processed\n"," 74% 7300/9833 [2:33:23<47:21, 1.12s/it]Progress saved: 7301 questions processed\n"," 75% 7400/9833 [2:35:19<48:48, 1.20s/it]Progress saved: 7401 questions processed\n"," 76% 7500/9833 [2:37:12<44:00, 1.13s/it]Progress saved: 7501 questions processed\n"," 77% 7600/9833 [2:39:07<45:15, 1.22s/it]Progress saved: 7601 questions processed\n"," 78% 7700/9833 [2:41:08<43:07, 1.21s/it]Progress saved: 7701 questions processed\n"," 79% 7800/9833 [2:43:15<44:07, 1.30s/it]Progress saved: 7801 questions processed\n"," 80% 7900/9833 [2:45:16<39:18, 1.22s/it]Progress saved: 7901 questions processed\n"," 81% 8000/9833 [2:47:20<37:48, 1.24s/it]Progress saved: 8001 questions processed\n"," 82% 8100/9833 [2:49:22<34:03, 1.18s/it]Progress saved: 8101 questions processed\n"," 83% 8200/9833 [2:51:22<32:22, 1.19s/it]Progress saved: 8201 questions processed\n"," 84% 8300/9833 [2:53:22<30:42, 1.20s/it]Progress saved: 8301 questions processed\n"," 85% 8400/9833 [2:55:23<27:57, 1.17s/it]Progress saved: 8401 questions processed\n"," 86% 8500/9833 [2:57:23<26:21, 1.19s/it]Progress saved: 8501 questions processed\n"," 87% 8600/9833 [2:59:23<24:04, 1.17s/it]Progress saved: 8601 questions processed\n"," 88% 8700/9833 [3:01:24<23:50, 1.26s/it]Progress saved: 8701 questions processed\n"," 89% 8800/9833 [3:03:25<20:10, 1.17s/it]Progress saved: 8801 questions processed\n"," 91% 8900/9833 [3:05:26<18:41, 1.20s/it]Progress saved: 8901 questions processed\n"," 92% 9000/9833 [3:07:24<15:50, 1.14s/it]Progress saved: 9001 questions processed\n"," 93% 9100/9833 [3:09:28<17:15, 1.41s/it]Progress saved: 9101 questions processed\n"," 94% 9200/9833 [3:11:29<12:07, 1.15s/it]Progress saved: 9201 questions processed\n"," 95% 9300/9833 [3:13:34<10:55, 1.23s/it]Progress saved: 9301 questions processed\n"," 96% 9400/9833 [3:15:37<08:47, 1.22s/it]Progress 
saved: 9401 questions processed\n"," 97% 9500/9833 [3:17:41<06:48, 1.23s/it]Progress saved: 9501 questions processed\n"," 98% 9600/9833 [3:19:45<04:35, 1.18s/it]Progress saved: 9601 questions processed\n"," 99% 9700/9833 [3:21:50<02:33, 1.15s/it]Progress saved: 9701 questions processed\n","100% 9800/9833 [3:23:57<00:44, 1.36s/it]Progress saved: 9801 questions processed\n","100% 9833/9833 [3:24:38<00:00, 1.25s/it]\n","\n","=== Results Summary ===\n","Total questions: 9833\n","Answered questions: 9833\n","Valid answers (A-E): 9833\n","Success rate: 100.00%\n","VMLU processing completed!\n"]}]},{"cell_type":"code","source":["!wc -l kimi_submission_k2.csv"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_RW1kt79WZoR","executionInfo":{"status":"ok","timestamp":1753081803528,"user_tz":-420,"elapsed":112,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"e5b1f5c2-a85d-4354-d76f-f56a489ee89b"},"execution_count":9,"outputs":[{"output_type":"stream","name":"stdout","text":["9834 kimi_submission_k2.csv\n"]}]},{"cell_type":"code","source":["!head -n 5 kimi_submission_k2.csv"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GxeVS4nwPsOt","executionInfo":{"status":"ok","timestamp":1753081813594,"user_tz":-420,"elapsed":41,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"8735d02a-6b94-4921-a21c-e1b5828db59d"},"execution_count":10,"outputs":[{"output_type":"stream","name":"stdout","text":["id,answer\n","28-0021,B\n","28-0022,A\n","28-0023,D\n","28-0024,A\n"]}]},{"cell_type":"code","source":["!cp kimi_submission_k2.csv /content/drive/MyDrive/2025/llm/kimi_submission_k2.csv"],"metadata":{"id":"ThwlRT9hx7Ed","executionInfo":{"status":"ok","timestamp":1753081816784,"user_tz":-420,"elapsed":113,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}}},"execution_count":11,"outputs":[]},{"cell_type":"code","source":["!wc -l /content/drive/MyDrive/2025/llm/kimi_submission_k2.csv"],"metadata":{"id":"BDoLAt0sFBnc","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1753081818107,"user_tz":-420,"elapsed":106,"user":{"displayName":"Laam Pham","userId":"04566654796696849937"}},"outputId":"16d8b577-7183-4fa0-ed4f-9a134f96f52a"},"execution_count":12,"outputs":[{"output_type":"stream","name":"stdout","text":["9834 /content/drive/MyDrive/2025/llm/kimi_submission_k2.csv\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"ZP_UuVVj3Vi5"},"execution_count":null,"outputs":[]}]}
|