Commit 4bbb37b

Author: Gourav Kumar

Profile sparse MLP and improve kernel visibility

1 parent fccf888 · commit 4bbb37b

17 files changed
Lines changed: 1025 additions & 99 deletions

experiments/old_structure/python/sparseflow/nn.py

Lines changed: 2 additions & 2 deletions
@@ -57,7 +57,7 @@ def from_dense(
             diff_report: Optional accuracy report (if return_diff=True)
         """
         # Get weight
-        weight = dense_linear.weight.data
+        weight = dense_linear.weight.detach()

         # Prune to 2:4 pattern
         weight_sparse = sf.prune_2_4(weight, method=method)
@@ -74,7 +74,7 @@ def from_dense(
         sparse_linear = SparseLinear(
             weight_compressed,
             metadata=None,
-            bias=dense_linear.bias.data if dense_linear.bias is not None else None
+            bias=dense_linear.bias.detach() if dense_linear.bias is not None else None
         )

         # Measure accuracy impact if requested
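The switch from `.data` to `.detach()` is about autograd safety rather than copying: both give a tensor that shares storage with the parameter, but the detached view keeps version tracking intact. A standalone sketch (plain PyTorch, not the repo's `SparseLinear.from_dense`) of the difference:

import torch
import torch.nn as nn

# Both .data and .detach() share storage with the parameter (no copy either way),
# but .detach() keeps autograd's version counter intact, so later in-place edits
# are caught instead of silently corrupting gradients.
dense = nn.Linear(8, 4)

w_data = dense.weight.data          # old style: unsafe escape hatch
w_detached = dense.weight.detach()  # new style: same storage, tracked view

assert w_data.data_ptr() == w_detached.data_ptr()  # same underlying buffer
print(w_detached.requires_grad)  # False: safe to prune/compress outside the graph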

sparseflow/__init__.py

Lines changed: 26 additions & 16 deletions
@@ -1,20 +1,30 @@
-"""SparseFlow: Hardware-aware sparse inference for A100"""
+"""
+SparseFlow package.
 
-from sparseflow.nn.sparseflow_linear import SparseFlowLinear, make_sparseflow_linear, prune_24_dense_weight
-from sparseflow.nn.policy import SparseFlowPolicy
-from sparseflow.compiled_model import compile_sparseflow_model, CompiledSparseFlowModel
+IMPORTANT:
+- Keep this __init__ lightweight.
+- Do NOT hard-import submodules that may depend on CUDA builds / optional ops.
+- Re-export symbols *best-effort* so `import sparseflow` doesn't crash.
+"""
 
-__version__ = "2.2.0.post1"
+from importlib import import_module
 
-__all__ = [
-    'SparseFlowLinear',
-    'make_sparseflow_linear',
-    'SparseFlowPolicy',
-    'prune_24_dense_weight',
-    'compile_sparseflow_model',
-    'CompiledSparseFlowModel',
-]
+__all__ = []
 
-# SparseFlow MLP module
-from sparseflow.nn.sparseflow_mlp import SparseFlowMLP, make_sparseflow_mlp
-__all__.extend(['SparseFlowMLP', 'make_sparseflow_mlp'])
+def _safe_export(mod: str, names: list[str]) -> None:
+    try:
+        m = import_module(mod)
+        g = globals()
+        for n in names:
+            if hasattr(m, n):
+                g[n] = getattr(m, n)
+                __all__.append(n)
+    except Exception:
+        # swallow import errors so package import stays healthy
+        pass
+
+# Best-effort re-exports
+_safe_export("sparseflow.nn.policy", ["SparseFlowPolicy"])
+_safe_export("sparseflow.nn.sparseflow_linear", ["SparseFlowLinear", "make_sparseflow_linear", "prune_24_dense_weight"])
+_safe_export("sparseflow.nn.sparseflow_mlp", ["SparseFlowMLP", "make_sparseflow_mlp"])
+_safe_export("sparseflow.nn.surgery", ["replace_llama_mlp_module"])

sparseflow/kernels/__init__.py

Whitespace-only changes.

sparseflow/kernels/fused_silu_mul.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
import torch
import triton
import triton.language as tl


@triton.jit
def _fused_silu_mul_2d_strided(
    g_ptr, u_ptr, o_ptr,
    n_rows: tl.constexpr, n_cols: tl.constexpr,
    g_s0: tl.constexpr, g_s1: tl.constexpr,
    u_s0: tl.constexpr, u_s1: tl.constexpr,
    o_s0: tl.constexpr, o_s1: tl.constexpr,
    BLOCK: tl.constexpr,
):
    pid0 = tl.program_id(0)  # row
    pid1 = tl.program_id(1)  # col-block
    row = pid0
    col0 = pid1 * BLOCK
    cols = col0 + tl.arange(0, BLOCK)
    mask = cols < n_cols

    g_offs = row * g_s0 + cols * g_s1
    u_offs = row * u_s0 + cols * u_s1
    o_offs = row * o_s0 + cols * o_s1

    g = tl.load(g_ptr + g_offs, mask=mask, other=0.0).to(tl.float32)
    u = tl.load(u_ptr + u_offs, mask=mask, other=0.0).to(tl.float32)

    s = 1.0 / (1.0 + tl.exp(-g))  # sigmoid(g)
    y = (g * s) * u               # silu(g) * u

    tl.store(o_ptr + o_offs, y.to(tl.float16), mask=mask)


def fused_silu_mul(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
    """
    Fast path: expects 2D tensors (e.g., [I, BT]) but DOES NOT require contiguity.
    Uses explicit strides so we avoid .contiguous() and the clone/copy_ tax.
    """
    assert gate.is_cuda and up.is_cuda
    assert gate.dtype == up.dtype
    assert gate.ndim == 2 and up.ndim == 2, "fused_silu_mul expects 2D tensors"

    I, BT = gate.shape
    assert up.shape == (I, BT)

    out = torch.empty((I, BT), device=gate.device, dtype=gate.dtype)

    grid = (I, triton.cdiv(BT, 1024))
    _fused_silu_mul_2d_strided[grid](
        gate, up, out,
        n_rows=I, n_cols=BT,
        g_s0=gate.stride(0), g_s1=gate.stride(1),
        u_s0=up.stride(0), u_s1=up.stride(1),
        o_s0=out.stride(0), o_s1=out.stride(1),
        BLOCK=1024,
        num_warps=4,
    )
    return out
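A quick way to sanity-check the strided kernel is to compare it against the eager `F.silu(gate) * up` on a non-contiguous (transposed) input, which exercises exactly the no-`.contiguous()` path the docstring advertises. The sketch below assumes a CUDA GPU with Triton and that the module is importable as `sparseflow.kernels.fused_silu_mul` (the path used by `sparseflow_mlp.py`); shapes are arbitrary:

import torch
import torch.nn.functional as F
from sparseflow.kernels.fused_silu_mul import fused_silu_mul

I, BT = 4096, 1000  # deliberately not a multiple of the 1024 block
gate = torch.randn(BT, I, device="cuda", dtype=torch.float16).t()  # [I, BT], non-contiguous
up = torch.randn(I, BT, device="cuda", dtype=torch.float16)

out = fused_silu_mul(gate, up)           # strided fast path, no .contiguous()
ref = F.silu(gate.float()) * up.float()  # fp32 eager reference

# fp16 store in the kernel -> loose tolerances
torch.testing.assert_close(out.float(), ref, rtol=1e-2, atol=1e-2)
print("fused_silu_mul matches eager silu*mul")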

sparseflow/nn/__init__.py

Lines changed: 26 additions & 3 deletions
@@ -1,4 +1,27 @@
-from .policy import SparseFlowPolicy
-from .sparseflow_linear import SparseFlowLinear, make_sparseflow_linear, prune_24_dense_weight
+"""
+sparseflow.nn
 
-__all__ = ["SparseFlowPolicy", "SparseFlowLinear", "make_sparseflow_linear", "prune_24_dense_weight"]
+Keep this module lightweight. Do NOT hard-import optional CUDA-dependent modules
+or large submodules on import, otherwise `from sparseflow.nn.policy import ...`
+will fail if any other file has issues.
+"""
+
+from importlib import import_module
+
+__all__ = []
+
+def _safe_export(mod: str, names: list[str]) -> None:
+    try:
+        m = import_module(mod)
+        g = globals()
+        for n in names:
+            if hasattr(m, n):
+                g[n] = getattr(m, n)
+                __all__.append(n)
+    except Exception:
+        pass
+
+_safe_export("sparseflow.nn.policy", ["SparseFlowPolicy"])
+_safe_export("sparseflow.nn.surgery", ["replace_llama_mlp_module"])
+_safe_export("sparseflow.nn.sparseflow_linear", ["SparseFlowLinear", "make_sparseflow_linear", "prune_24_dense_weight"])
+_safe_export("sparseflow.nn.sparseflow_mlp", ["SparseFlowMLP", "make_sparseflow_mlp"])

sparseflow/nn/llama_surgery_mlp.py

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
"""
Shim module so tooling can import:
    from sparseflow.nn.llama_surgery_mlp import replace_llama_mlp_module

Actual implementation lives in tools/llama_surgery_mlp.py
"""
import os, sys

# Ensure repo root is on path so `tools.*` is importable when sparseflow is imported as a package.
_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if _REPO_ROOT not in sys.path:
    sys.path.insert(0, _REPO_ROOT)

from tools.llama_surgery_mlp import replace_llama_mlp_module  # re-export

__all__ = ["replace_llama_mlp_module"]
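Usage of the shim would look roughly like the following; the actual signature of `replace_llama_mlp_module` lives in `tools/llama_surgery_mlp.py`, which this commit does not show, so the call and the model id below are placeholders, not the repo's documented API:

import torch
from transformers import AutoModelForCausalLM

from sparseflow.nn.llama_surgery_mlp import replace_llama_mlp_module

# Placeholder model id; any Llama-style checkpoint with gate/up/down MLPs.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,
    device_map="cuda",
)

# Assumed contract: walk the model and swap each LlamaMLP for a SparseFlowMLP in place.
replace_llama_mlp_module(model)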

sparseflow/nn/sparseflow_linear.py

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         Ws = self.W_sparse
         x2d_T = x2d.transpose(0, 1)  # NO contiguous
         y2d_T = torch.ops.aten._sparse_semi_structured_mm(Ws.packed, Ws.meta, x2d_T)
-        y2d = y2d_T.transpose(0, 1)  # NO contiguous
+        y2d = y2d_T.transpose(0, 1).contiguous()  # Force contiguous!
         if self.bias is not None:
             y2d = y2d + self.bias
         else:
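The commit comment only says "Force contiguous!", so the sketch below just illustrates what the change does to the output layout: the transposed result of the sparse mm is a column-major strided view, and the added `.contiguous()` pays for one copy so that downstream ops (bias add, reshape back to the leading dims, the next dense projection) see a row-major tensor. Shapes here are made up:

import torch

y2d_T = torch.randn(4096, 128)                 # [out_features, tokens], as the sparse mm returns it
y_view = y2d_T.transpose(0, 1)                 # [tokens, out_features], still a strided view
y_contig = y2d_T.transpose(0, 1).contiguous()  # the committed fix: one explicit copy

print(y_view.is_contiguous(), y_view.stride())      # False (1, 128): column-major walk per row
print(y_contig.is_contiguous(), y_contig.stride())  # True  (4096, 1): row-major for downstream ops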

sparseflow/nn/sparseflow_mlp.py

Lines changed: 93 additions & 77 deletions
@@ -1,13 +1,20 @@
 """SparseFlowMLP: Optimized MLP replacement"""
+
+from __future__ import annotations
+
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from typing import Optional
+
 from sparseflow.nn.policy import SparseFlowPolicy
 from sparseflow.nn.sparseflow_linear import prune_24_dense_weight
+from sparseflow.kernels.fused_silu_mul import fused_silu_mul
+
 
 class SparseFlowMLP(nn.Module):
     """Drop-in replacement for LlamaMLP using sparse tensor cores."""
 
     def __init__(
         self,
         hidden_size: int,
@@ -23,97 +30,106 @@ def __init__(
         dtype: torch.dtype = torch.float16,
     ):
         super().__init__()
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
+
+        # Some HF/profiling codepaths run under torch.inference_mode().
+        # Registering buffers from inference tensors can later trip:
+        #   "Cannot set version_counter for inference tensor".
+        # Force normal tensors here.
+        with torch.inference_mode(False):
+            gate_weight = gate_weight.detach().clone()
+            up_weight = up_weight.detach().clone()
+            down_weight = down_weight.detach().clone()
+
+            if gate_bias is not None:
+                gate_bias = gate_bias.detach().clone()
+            if up_bias is not None:
+                up_bias = up_bias.detach().clone()
+            if down_bias is not None:
+                down_bias = down_bias.detach().clone()
+
+        # Keep original dense down_weight for the final matmul (for now)
+        self.register_buffer("down_weight", down_weight, persistent=False)
+
+        self.hidden_size = int(hidden_size)
+        self.intermediate_size = int(intermediate_size)
         self.policy = policy
-
-        # Convert to sparse - store as SparseSemiStructuredTensor
+
+        # Convert to sparse semi-structured tensors (2:4)
         gate_pruned = prune_24_dense_weight(gate_weight.contiguous())
-        self.register_buffer("gate_sparse",
-                             torch.sparse.to_sparse_semi_structured(gate_pruned),
-                             persistent=False)
-        self.register_buffer("gate_bias", gate_bias.contiguous() if gate_bias is not None else None)
-
-        up_pruned = prune_24_dense_weight(up_weight.contiguous())
-        self.register_buffer("up_sparse",
-                             torch.sparse.to_sparse_semi_structured(up_pruned),
-                             persistent=False)
-        self.register_buffer("up_bias", up_bias.contiguous() if up_bias is not None else None)
-
+        up_pruned = prune_24_dense_weight(up_weight.contiguous())
         down_pruned = prune_24_dense_weight(down_weight.contiguous())
-        self.register_buffer("down_sparse",
-                             torch.sparse.to_sparse_semi_structured(down_pruned),
-                             persistent=False)
-        self.register_buffer("down_bias", down_bias.contiguous() if down_bias is not None else None)
+
+        self.register_buffer(
+            "gate_sparse",
+            torch.sparse.to_sparse_semi_structured(gate_pruned),
+            persistent=False,
+        )
+        self.register_buffer(
+            "up_sparse",
+            torch.sparse.to_sparse_semi_structured(up_pruned),
+            persistent=False,
+        )
+        self.register_buffer(
+            "down_sparse",
+            torch.sparse.to_sparse_semi_structured(down_pruned),
+            persistent=False,
+        )
+
+        # Biases (may be None)
+        self.register_buffer("gate_bias", gate_bias.contiguous() if gate_bias is not None else None, persistent=False)
+        self.register_buffer("up_bias", up_bias.contiguous() if up_bias is not None else None, persistent=False)
+        self.register_buffer("down_bias", down_bias.contiguous() if down_bias is not None else None, persistent=False)
 
     def _spmm_pick(self, Ws, xT):
-        """Auto-pick packed/meta orientation + handle alignment (K must be multiple of 16)"""
-        # Pad xT's last dimension (N/tokens) to multiple of 16 for CUTLASS alignment
+        """
+        Auto-pick packed/meta orientation + handle alignment.
+        xT: [K, Ntokens] (transposed view)
+        """
         orig_n = xT.shape[1]
         pad_val = (16 - (orig_n % 16)) % 16
-
-        if pad_val > 0:
-            xT = torch.nn.functional.pad(xT, (0, pad_val))
-
-        # Pick orientation based on shape
+        if pad_val:
+            xT = F.pad(xT, (0, pad_val))
+
+        # Ws is SparseSemiStructuredTensor
         if Ws.data.shape[1] == xT.shape[0]:
             res = torch.ops.aten._sparse_semi_structured_mm(Ws.packed, Ws.meta, xT)
         elif Ws.data.shape[0] == xT.shape[0]:
            res = torch.ops.aten._sparse_semi_structured_mm(Ws.packed_t, Ws.meta_t, xT)
         else:
-            raise RuntimeError(
-                f"Shape mismatch: Ws.data={tuple(Ws.data.shape)} xT={tuple(xT.shape)}"
-            )
-
-        # Slice back to original size
-        if pad_val > 0:
+            raise RuntimeError(f"Shape mismatch: Ws.data={tuple(Ws.data.shape)} xT={tuple(xT.shape)}")
+
+        if pad_val:
             res = res[:, :orig_n]
-
         return res
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Forward using low-level sparse ops"""
-        # Reshape to 2D
-        leading = x.shape[:-1]
-        T = 1
-        for d in leading:
-            T *= int(d)
-
-        x_2d = x.reshape(T, self.hidden_size).contiguous()  # [T, H]
-
-        # Gate projection - use transpose approach that we know works
-        xT = x_2d.transpose(0, 1)  # [H, T].contiguous()
-        gateT = self._spmm_pick(self.gate_sparse, xT)  # [I, T]
-        gate = gateT.transpose(0, 1)  # [T, I]
+    def forward(self, x):
+        # x: [B, T, H]
+        B, T, H = x.shape
+        BT = B * T
+
+        x2d = x.view(BT, H)
+        xT = x2d.transpose(0, 1)  # [H, BT]
+
+        gateT = self._spmm_pick(self.gate_sparse, xT)  # [I, BT]
+        upT = self._spmm_pick(self.up_sparse, xT)      # [I, BT]
+
         if self.gate_bias is not None:
-            gate = gate + self.gate_bias
-
-        # Up projection
-        upT = self._spmm_pick(self.up_sparse, xT)  # [I, T]
-        up = upT.transpose(0, 1)  # [T, I]
+            gateT = gateT + self.gate_bias.view(-1, 1)
         if self.up_bias is not None:
-            up = up + self.up_bias
-
-        # Activation
-        hidden = torch.nn.functional.silu(gate) * up  # [T, I]
-
-        # Down projection
-        hT = hidden.transpose(0, 1)  # [I, T]
-        outT = self._spmm_pick(self.down_sparse, hT)  # [H, T]
-        out = outT.transpose(0, 1)  # [T, H]
+            upT = upT + self.up_bias.view(-1, 1)
+
+        hiddenT = fused_silu_mul(gateT, upT)  # [I, BT]
+
+        # Down projection (dense for now): [H, I] @ [I, BT] -> [H, BT]
+        outT = torch.matmul(self.down_weight, hiddenT)
+
        if self.down_bias is not None:
-            out = out + self.down_bias
-
-        return out.reshape(*leading, self.hidden_size)
-
-
-def make_sparseflow_mlp(mlp_module, policy: SparseFlowPolicy = SparseFlowPolicy()):
-    """Convert LlamaMLP to SparseFlowMLP"""
-    return SparseFlowMLP(
-        hidden_size=mlp_module.gate_proj.in_features,
-        intermediate_size=mlp_module.gate_proj.out_features,
-        gate_weight=mlp_module.gate_proj.weight.data,
-        up_weight=mlp_module.up_proj.weight.data,
-        down_weight=mlp_module.down_proj.weight.data,
-        policy=policy,
-    )
+            outT = outT + self.down_bias.view(-1, 1)
+
+        out2d = outT.transpose(0, 1)  # [BT, H]
+        return out2d.view(B, T, H)
+
+
+def make_sparseflow_mlp(*args, **kwargs):
+    """Backward-compat factory for older tooling."""
+    return SparseFlowMLP(*args, **kwargs)
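Since the new `forward` stays in the transposed `[I, BT]` layout between the two sparse matmuls (with biases reshaped to column vectors) instead of flipping back to `[BT, I]` per projection, it is worth checking that the algebra matches the old per-token layout. A CPU sketch with dense stand-ins for the sparse mms and the Triton kernel:

import torch
import torch.nn.functional as F

B, T, H, I = 2, 3, 8, 16
x = torch.randn(B, T, H)
Wg, Wu, Wd = torch.randn(I, H), torch.randn(I, H), torch.randn(H, I)
bg, bu = torch.randn(I), torch.randn(I)

# Old layout: work in [BT, I]; biases broadcast along rows
x2d = x.reshape(B * T, H)
old = (F.silu(x2d @ Wg.t() + bg) * (x2d @ Wu.t() + bu)) @ Wd.t()

# New layout: stay in [I, BT]; biases reshaped to [I, 1] broadcast along columns
xT = x2d.transpose(0, 1)           # [H, BT]
gateT = Wg @ xT + bg.view(-1, 1)   # [I, BT]
upT = Wu @ xT + bu.view(-1, 1)     # [I, BT]
outT = Wd @ (F.silu(gateT) * upT)  # [H, BT]
new = outT.transpose(0, 1).reshape(B, T, H)

torch.testing.assert_close(new, old.reshape(B, T, H), rtol=1e-4, atol=1e-4)
print("transposed data flow matches the per-token layout")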
