Build (fp8)
This view is limited to 50 files because it contains too many changes.

- build/torch25-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +2 -2
- build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx11-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx11-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py +10 -0
- build/{torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx98-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py +10 -0
- build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
- build/{torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so} +2 -2
- build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
- build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
- build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so +0 -3
- build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
- build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so +3 -0
- build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx98-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py +24 -0
- build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py +10 -0
- build/torch26-cxx98-cu126-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so} +1 -1
- build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py +3 -3
- build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py +24 -0
 
    	
build/torch25-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:49dc6c1d936b3dc6c483a4ef5d581c5d2f08f50f6ea2ffcdbfecdf0b719c3410
+size 87056328
    	
build/torch25-cxx11-cu118-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
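
The regenerated _ops.py shim simply rebinds the Python side to the renamed extension and rebuilds fully qualified op names. As a minimal illustration of what the helper returns (the op name "my_kernel" below is hypothetical, used only for this example and not shipped by the package):

def add_op_namespace_prefix(op_name: str) -> str:
    """Prefix op by namespace (same helper as in the generated _ops.py)."""
    return f"_moe_2218ad7::{op_name}"

# "my_kernel" is a made-up op name, purely for illustration.
assert add_op_namespace_prefix("my_kernel") == "_moe_2218ad7::my_kernel"
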
    	
build/torch25-cxx11-cu118-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
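
The _fp8_quantize helper added above dispatches between per-tensor quantization (scaled_fp8_quant) and group-wise quantization (per_token_group_quant_fp8), and asserts that the blocked path produces one scale per block_k-sized group along the last dimension. The sketch below is not the kernel's implementation; it is a self-contained reference illustration of that shape contract, and the function name per_token_group_quant_fp8_reference, the tensor shapes, and the scaling scheme are assumptions made only for the example.

import torch

def cdiv(a: int, b: int) -> int:
    """Ceiling division, as in the diff above."""
    return -(a // -b)

def per_token_group_quant_fp8_reference(A: torch.Tensor, block_k: int):
    """Illustrative per-token-group fp8 quantization (not the shipped kernel)."""
    K = A.shape[-1]
    pad = cdiv(K, block_k) * block_k - K
    A_padded = torch.nn.functional.pad(A, (0, pad))  # pad K up to a multiple of block_k
    grouped = A_padded.reshape(*A.shape[:-1], cdiv(K, block_k), block_k)
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    # One scale per (token, K-block): max-abs of the group mapped to the fp8 range.
    scale = grouped.abs().amax(dim=-1).clamp(min=1e-12) / fp8_max
    q = (grouped / scale.unsqueeze(-1)).to(torch.float8_e4m3fn)
    q = q.reshape(*A.shape[:-1], -1)[..., :K]  # drop the padding again
    return q, scale

A = torch.randn(4, 7168, dtype=torch.bfloat16)
A_q, A_scale = per_token_group_quant_fp8_reference(A, block_k=128)
# Same shape relationship that _fp8_quantize asserts for the blocked path.
assert cdiv(A.shape[-1], 128) == A_scale.shape[-1]
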
    	
build/torch25-cxx11-cu118-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
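
In layers.py the fp8 path is switched on purely by inspecting the experts module: if it exposes a gate_up_proj_scale attribute, the matching weight scales are forwarded to fused_moe as w1_scale/w2_scale and use_fp8_w8a8 is set; otherwise the call is unchanged. A minimal sketch of that dispatch pattern follows; the _DummyExperts class, its buffers, and build_moe_kwargs are assumptions for illustration only, not part of the package.

import torch
import torch.nn as nn

class _DummyExperts(nn.Module):
    """Stand-in for the Llama4 experts module, with or without fp8 scales."""
    def __init__(self, with_fp8_scales: bool):
        super().__init__()
        self.gate_up_proj = nn.Parameter(torch.empty(0))
        self.down_proj = nn.Parameter(torch.empty(0))
        if with_fp8_scales:
            self.register_buffer("gate_up_proj_scale", torch.ones(1))
            self.register_buffer("down_proj_scale", torch.ones(1))

def build_moe_kwargs(experts: nn.Module) -> dict:
    """Mirror of the hasattr-based dispatch added in the diff (sketch only)."""
    extra_kwargs = {}
    use_fp8_w8a8 = False
    if hasattr(experts, "gate_up_proj_scale"):
        use_fp8_w8a8 = True
        extra_kwargs["w1_scale"] = experts.gate_up_proj_scale
        extra_kwargs["w2_scale"] = experts.down_proj_scale
    return {"use_fp8_w8a8": use_fp8_w8a8, **extra_kwargs}

print(build_moe_kwargs(_DummyExperts(with_fp8_scales=False)))  # bf16 path: fp8 disabled
print(build_moe_kwargs(_DummyExperts(with_fp8_scales=True)))   # fp8 path with both scales
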
    	
build/torch25-cxx11-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:12bb26a0a9a47039bbcbf2c5fda7c068211cb711827b0e0e0d98b2fe99ed3b54
 size 87254968
    	
build/torch25-cxx11-cu121-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
    	
build/torch25-cxx11-cu121-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
    	
build/torch25-cxx11-cu121-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
    	
build/torch25-cxx11-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ca9a24c28dab4109a13549ee7ce379b36d950930b8bd106669188262863f3795
 size 86965608
    	
build/torch25-cxx11-cu124-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
    	
build/torch25-cxx11-cu124-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
    	
build/torch25-cxx11-cu124-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
    	
build/{torch26-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d65d3a08c44b65a44d2c58566aa7e26e85d0d949be71096e09f7ad73d0b5e040
 size 87048408
    	
build/torch25-cxx98-cu118-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
    	
build/torch25-cxx98-cu118-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
    	
build/torch25-cxx98-cu118-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
    	
build/torch25-cxx98-cu121-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d2d4157287a3e7979780f23a709eba01e787186bc32a5e56c0620b5429e9cfd3
 size 87243240
    	
build/torch25-cxx98-cu121-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
    	
build/torch25-cxx98-cu121-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
    	
build/torch25-cxx98-cu121-x86_64-linux/moe/layers.py
CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
    	
build/torch25-cxx98-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:820b62662956741ae78d7c51fb9fc978ff2e86c7dc1efa1335b0701e0e28749a
-size 86957976
    	
build/{torch25-cxx98-cu118-x86_64-linux/moe/_moe_21a4db0.abi3.so → torch25-cxx98-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:122544181246b179a772eb07c9e01c8df6b3025c20b333c566d0e84bfd7bea2d
+size 86953880
    	
build/torch25-cxx98-cu124-x86_64-linux/moe/_ops.py
CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import
-ops = torch.ops.
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"
+    return f"_moe_2218ad7::{op_name}"
    	
build/torch25-cxx98-cu124-x86_64-linux/moe/fused_moe.py
CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
    	
build/torch25-cxx98-cu124-x86_64-linux/moe/layers.py  CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
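The layers.py change keys fp8 dispatch off the presence of scale tensors on the experts module: an fp8 checkpoint exposes gate_up_proj_scale and down_proj_scale, so the hasattr probe flips use_fp8_w8a8 and forwards the scales as w1_scale/w2_scale. A minimal sketch of that detection pattern follows, with a stand-in experts namespace (the attribute names match the diff; the SimpleNamespace and tensor sizes are purely illustrative).

from types import SimpleNamespace
import torch

# Stand-in for self.experts; a real fp8 checkpoint would store the weights in
# float8 and keep the scale tensors alongside them.
experts = SimpleNamespace(
    gate_up_proj=torch.empty(4, 64, 256),
    gate_up_proj_scale=torch.ones(4, 1, 1),
    down_proj=torch.empty(4, 128, 64),
    down_proj_scale=torch.ones(4, 1, 1),
)

extra_kwargs = {}
use_fp8_w8a8 = False
if hasattr(experts, "gate_up_proj_scale"):
    use_fp8_w8a8 = True
    extra_kwargs["w1_scale"] = experts.gate_up_proj_scale
    extra_kwargs["w2_scale"] = experts.down_proj_scale

# The scales then ride along into the fused kernel, mirroring the diff:
# fused_moe(hidden_states, w1=experts.gate_up_proj, ...,
#           use_fp8_w8a8=use_fp8_w8a8, **extra_kwargs)

A bf16 checkpoint carries no *_scale attributes, so the same forward path runs unchanged there with use_fp8_w8a8 left False.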
    	
build/torch26-cxx11-cu118-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}  RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:49e17eb28438bddf98e314893cf262b807d64ee03850b46abe4d0bf6151f62b6
 size 87060352
build/torch26-cxx11-cu118-x86_64-linux/moe/_ops.py  CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
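The only change in _ops.py is the build-hash namespace: _moe_21a4db0 becomes _moe_2218ad7, matching the renamed .abi3.so it loads, so the Python shim and the compiled extension stay in sync. A small illustration of what add_op_namespace_prefix produces (redefined inline here for self-containment; "topk_softmax" is a hypothetical op name, not necessarily one exported by this extension):

def add_op_namespace_prefix(op_name: str) -> str:
    # Same body as the updated _ops.py.
    return f"_moe_2218ad7::{op_name}"

assert add_op_namespace_prefix("topk_softmax") == "_moe_2218ad7::topk_softmax"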
    	
build/torch26-cxx11-cu118-x86_64-linux/moe/fused_moe.py  CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu118-x86_64-linux/moe/layers.py  CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_21a4db0.abi3.so  DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9a9acc9198a56410e1d6bddec3a4529fb14b12843f6589b4477bc4ee795f7278
-size 86961568
build/torch26-cxx11-cu124-x86_64-linux/moe/_moe_2218ad7.abi3.so  ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f804164f561c9b46f3b997a6d13552ca4d704c43484b5cd8d14682b4450ed472
+size 86965664
build/torch26-cxx11-cu124-x86_64-linux/moe/_ops.py  CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu124-x86_64-linux/moe/fused_moe.py  CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu124-x86_64-linux/moe/layers.py  CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_21a4db0.abi3.so  DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:eb26fad3cfe2db1cc88637e020d6d8ddbc54df3e7e8edd64ba9370cd96177587
-size 87428864
build/torch26-cxx11-cu126-x86_64-linux/moe/_moe_2218ad7.abi3.so  ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1102bf615b2d2f7c320ac73eed63b982e969683ac72c958080dddb87166fa595
+size 87432960
build/torch26-cxx11-cu126-x86_64-linux/moe/_ops.py  CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx11-cu126-x86_64-linux/moe/fused_moe.py  CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx11-cu126-x86_64-linux/moe/layers.py  CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu118-x86_64-linux/moe/_moe_2218ad7.abi3.so  ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e739bb546d3d1730fa7696fbd767fd588286dec369f1b7551edd1ec481df96f
+size 87044288
build/torch26-cxx98-cu118-x86_64-linux/moe/_ops.py  CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu118-x86_64-linux/moe/fused_moe.py  CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx98-cu118-x86_64-linux/moe/layers.py  CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu124-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}  RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:bcb950d2e7196ad22cad926749b7e0e06e5454f0a732755b72f0b8dd456529c6
 size 86953856
build/torch26-cxx98-cu124-x86_64-linux/moe/_ops.py  CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu124-x86_64-linux/moe/fused_moe.py  CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
build/torch26-cxx98-cu124-x86_64-linux/moe/layers.py  CHANGED
@@ -36,6 +36,14 @@ class Llama4TextMoe(nn.Module):
         _fix_llama4_experts(hidden_states, self.experts)
 
         router_logits = self.router(hidden_states)
+
+        extra_kwargs = {}
+        use_fp8_w8a8 = False
+        if hasattr(self.experts, "gate_up_proj_scale"):
+            use_fp8_w8a8 = True
+            extra_kwargs["w1_scale"] = self.experts.gate_up_proj_scale
+            extra_kwargs["w2_scale"] = self.experts.down_proj_scale
+
         out = fused_moe(
             hidden_states,
             w1=self.experts.gate_up_proj,
@@ -45,6 +53,8 @@ class Llama4TextMoe(nn.Module):
             renormalize=False,
             custom_routing_function=_llama4_topk,
             apply_router_weight_on_input=True,
+            use_fp8_w8a8=use_fp8_w8a8,
+            **extra_kwargs
         )
 
         out += self.shared_expert(hidden_states)
build/torch26-cxx98-cu126-x86_64-linux/moe/{_moe_21a4db0.abi3.so → _moe_2218ad7.abi3.so}  RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:fe5c605f1da902aebc1d7ce0355b649fcfcc44aed0023fdc87974f3d56273897
 size 87417064
build/torch26-cxx98-cu126-x86_64-linux/moe/_ops.py  CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _moe_21a4db0
-ops = torch.ops._moe_21a4db0
+from . import _moe_2218ad7
+ops = torch.ops._moe_2218ad7
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_moe_21a4db0::{op_name}"
+    return f"_moe_2218ad7::{op_name}"
build/torch26-cxx98-cu126-x86_64-linux/moe/fused_moe.py  CHANGED
@@ -27,6 +27,30 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = bool(
 )
 
 
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def _fp8_quantize(
+    A: torch.Tensor,
+    A_scale: Optional[torch.Tensor],
+    block_shape: Optional[List[int]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Perform fp8 quantization on the inputs.  If a block_shape
+    is provided, the output will be blocked.
+    """
+    if block_shape is None:
+        A, A_scale = scaled_fp8_quant(A, A_scale)
+    else:
+        assert len(block_shape) == 2
+        _, block_k = block_shape[0], block_shape[1]
+        A, A_scale = per_token_group_quant_fp8(A, block_k)
+        assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1]
+    return A, A_scale
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,