Skip to content

Commit db13507

Browse files
committed
Fix critical TTQ beta mismatch bug
Bug: softplus was applied twice — once inside ttq_quantize and again when computing beta. As a result, the quantized weights used scale X while dequantization used scale softplus(X), completely breaking learning. Fix: ttq_quantize now returns a (quantized, wp_pos, wn_pos) tuple so the layers can reuse exactly the same scales when computing beta. Result: the model was previously stuck at 10% accuracy (random guessing); it should now learn.
1 parent 0a265ce commit db13507

3 files changed

Lines changed: 8 additions & 7 deletions

File tree

bitnet/nn/ttq_conv2d.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,10 @@ def forward(self, x: Tensor) -> Tensor:
3434
x_quant, gamma = quantize_activations(x, self.num_bits)
3535

3636
# TTQ weight quantization with learned scales
37-
w_quant = ttq_quantize(self.weight, self.wp, self.wn, self.delta)
37+
w_quant, wp_pos, wn_pos = ttq_quantize(self.weight, self.wp, self.wn, self.delta)
3838

3939
# Use average of positive scales as beta for dequantization
40-
beta = (f.softplus(self.wp) + f.softplus(self.wn)) / 2
40+
beta = (wp_pos + wn_pos) / 2
4141

4242
out = f.conv2d(x_quant, w_quant, self.bias, self.stride, self.padding, self.dilation, self.groups)
4343
return dequantize(out, gamma, beta, self.num_bits)

bitnet/nn/ttq_linear.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,10 @@ def forward(self, x: Tensor) -> Tensor:
4040
x_quant, gamma = quantize_activations(x, self.num_bits)
4141

4242
# TTQ weight quantization with learned scales
43-
w_quant = ttq_quantize(self.weight, self.wp, self.wn, self.delta)
43+
w_quant, wp_pos, wn_pos = ttq_quantize(self.weight, self.wp, self.wn, self.delta)
4444

4545
# Use average of positive scales as beta for dequantization
46-
beta = (f.softplus(self.wp) + f.softplus(self.wn)) / 2
46+
beta = (wp_pos + wn_pos) / 2
4747

4848
out = f.linear(x_quant, w_quant, self.bias)
4949
return dequantize(out, gamma, beta, self.num_bits)

bitnet/nn/ttq_quantization.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from torch import Tensor
44

55

6-
def ttq_quantize(weight: Tensor, wp: Tensor, wn: Tensor, delta: Tensor) -> Tensor:
6+
def ttq_quantize(weight: Tensor, wp: Tensor, wn: Tensor, delta: Tensor) -> tuple[Tensor, Tensor, Tensor]:
77
"""Quantize weights to {-wn, 0, +wp} using Trained Ternary Quantization.
88
99
TTQ (Zhu et al., ICLR 2017) learns per-layer positive/negative scales
@@ -19,7 +19,7 @@ def ttq_quantize(weight: Tensor, wp: Tensor, wn: Tensor, delta: Tensor) -> Tenso
1919
delta: Learnable threshold
2020
2121
Returns:
22-
Quantized tensor in {-wn, 0, +wp}
22+
Tuple of (quantized weights, wp_positive, wn_positive)
2323
"""
2424
# Ensure scales and threshold are positive with softplus (maintains gradients)
2525
wp_pos = f.softplus(wp)
@@ -35,4 +35,5 @@ def ttq_quantize(weight: Tensor, wp: Tensor, wn: Tensor, delta: Tensor) -> Tenso
3535
quantized[neg_mask] = -wn_pos
3636

3737
# Straight-through estimator for gradients
38-
return quantized + (weight - weight.detach())
38+
quantized_ste = quantized + (weight - weight.detach())
39+
return quantized_ste, wp_pos, wn_pos

0 commit comments

Comments
 (0)