Update README.md
README.md CHANGED
@@ -45,8 +45,6 @@ Once set up, you can proceed to run the model by running the snippet below:
 from mlx_lm import load, generate
 from transformers import AutoTokenizer
 
-model, tokenizer = load("HalleyAI/gpt-oss-20b-6bit-gs32")
-
 model, tokenizer = load("HalleyAI/gpt-oss-20b-6bit-gs32")
 print(generate(
     model, tokenizer,
@@ -57,7 +55,7 @@ print(generate(
 
 ## Performance (Apple Silicon, real-world)
 
-LM Studio and CLI (MLX,
+LM Studio and CLI (MLX, Q6 gs32): ~63–72 tok/s, TTFB ~0.3–0.4 s (2k-token responses)
 - tested on M1 Max 32 GB (short runs show lower t/s due to startup overhead)
 
 Throughput varies with Mac model, context, and sampler settings.
@@ -70,7 +68,7 @@ We report perplexity (PPL) on a small internal text corpus using the same tokeni
 </thead>
 <tbody>
 <tr><td>MLX Q8 (reference)</td><td>2.4986</td></tr>
-<tr><td>MLX
+<tr><td>MLX Q6 (gs=32)</td><td>2.4858 (-0.51% vs Q8)</td></tr>
 </tbody>
 </table>
 Note: This is a small, domain-specific eval for quick sanity; not a benchmark suite.
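For context on the snippet this commit cleans up, below is a self-contained version of the usage example, assuming the standard mlx_lm `load`/`generate` API. The prompt, `max_tokens`, and the chat-template step are illustrative assumptions, not part of the repository's README; note that the tokenizer returned by `load` already wraps the Hugging Face tokenizer, so the separate `AutoTokenizer` import in the snippet is usually not required.

```python
# Illustrative sketch (not taken from the commit): load the 6-bit gs=32 model and generate.
from mlx_lm import load, generate

model, tokenizer = load("HalleyAI/gpt-oss-20b-6bit-gs32")

# gpt-oss is a chat model, so applying the chat template is the usual route.
messages = [{"role": "user", "content": "Explain group-size 32 quantization in two sentences."}]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

text = generate(model, tokenizer, prompt=prompt, max_tokens=256, verbose=True)
print(text)
```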
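The throughput figures in the Performance section can be sanity-checked with a simple wall-clock measurement. A minimal sketch, assuming the same `load`/`generate` API as above; the prompt and `max_tokens` are placeholders, and the tokenizer-based token count is only approximate:

```python
import time
from mlx_lm import load, generate

model, tokenizer = load("HalleyAI/gpt-oss-20b-6bit-gs32")

messages = [{"role": "user", "content": "Write a detailed overview of MLX quantization."}]
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

start = time.perf_counter()
text = generate(model, tokenizer, prompt=prompt, max_tokens=2048)
elapsed = time.perf_counter() - start

# Rough count of generated tokens; good enough for a tok/s estimate.
n_tokens = len(tokenizer.encode(text))
print(f"~{n_tokens / elapsed:.1f} tok/s over {elapsed:.1f} s")
```

Recent mlx_lm versions also report prompt and generation token rates when `verbose=True` is passed to `generate`, which may be the simpler route if available in your version.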
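The commit does not include the evaluation script behind the perplexity table. As a rough illustration of how a next-token perplexity figure can be computed with mlx_lm and MLX's cross-entropy loss (the corpus, chunking, and context length used for the table are not shown; this sketch scores a single short string in one forward pass):

```python
import math
import mlx.core as mx
import mlx.nn as nn
from mlx_lm import load

model, tokenizer = load("HalleyAI/gpt-oss-20b-6bit-gs32")

def perplexity(text: str) -> float:
    """Mean next-token perplexity of `text` under the model (single pass, no chunking)."""
    tokens = mx.array(tokenizer.encode(text))[None]   # shape (1, seq_len)
    logits = model(tokens[:, :-1])                     # predict token t+1 from tokens up to t
    nll = nn.losses.cross_entropy(
        logits.reshape(-1, logits.shape[-1]).astype(mx.float32),
        tokens[:, 1:].reshape(-1),
        reduction="mean",
    )
    return math.exp(nll.item())

print(perplexity("The quick brown fox jumps over the lazy dog."))
```

A real corpus would be split into fixed-length windows and the per-token losses averaged across all windows before exponentiating.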