bitlinear/functional.py · krisaujla/BitLinear at main

Upload folder using huggingface_hub

fd8c8b9 verified about 1 month ago

7.62 kB

	"""
	Functional API for BitLinear operations.

	This module provides the core functional implementations that will be called
	by the nn.Module wrappers. These functions implement the mathematical operations
	described in the BitNet and ternary neural network papers.
	"""

	import torch
	import torch.nn.functional as F
	from typing import Optional, Tuple


	def bitlinear_python(
	    x: torch.Tensor,
	    W: torch.Tensor,
	    gamma: torch.Tensor,
	    bias: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	    """
	    Pure PyTorch reference implementation of the BitLinear forward pass.

	    Computes:
	        output = (x @ W^T) * gamma + bias

	    where W is expected to be a ternary weight matrix ({-1, 0, +1}) and gamma
	    is a per-output-channel scaling factor. Nothing in this function enforces
	    that W is ternary; it is used as an ordinary dense matrix.

	    Args:
	        x: Input tensor of shape [..., in_features]. A plain 1-D vector of
	            shape [in_features] is supported and yields a 1-D result.
	        W: Weight matrix of shape [out_features, in_features].
	        gamma: Scaling factors of shape [out_features] or [1, out_features].
	        bias: Optional bias tensor of shape [out_features].

	    Returns:
	        Output tensor of shape [..., out_features], i.e. the leading
	        dimensions of ``x`` are preserved exactly.
	    """
	    # W is stored as [out_features, in_features]; transpose so the matmul
	    # contracts over in_features.
	    output = torch.matmul(x, W.t())  # [..., out_features]

	    # A leading singleton axis on gamma carries no information. Dropping it
	    # keeps the product at the rank of `output`; otherwise a 1-D input would
	    # come back as [1, out_features] instead of [out_features].
	    if gamma.dim() == 2 and gamma.size(0) == 1:
	        gamma = gamma.squeeze(0)

	    # A 1-D gamma broadcasts against the trailing (out_features) axis without
	    # any reshaping, so no unsqueeze is needed.
	    output = output * gamma

	    if bias is not None:
	        output = output + bias

	    return output


	def greedy_ternary_decomposition(
	    W: torch.Tensor,
	    k: int,
	) -> Tuple[torch.Tensor, torch.Tensor]:
	    """
	    Greedily approximate a dense weight matrix by a sum of scaled components.

	    The target form is:
	        W ≈ sum_{i=1}^k gamma_i * W_i

	    Each round quantizes the current residual with ``weight_to_ternary``
	    (per-channel mode), records the returned matrix and scale, and subtracts
	    the scaled matrix from the residual before the next round.

	    Args:
	        W: Dense weight matrix of shape [out_features, in_features]. It is
	            cloned first, so the caller's tensor is not modified.
	        k: Number of components to extract.

	    Returns:
	        A tuple ``(W_ternary, gammas)``: the k component matrices stacked
	        along a new leading axis, and the k scale tensors stacked likewise.
	        Assuming ``weight_to_ternary`` returns a matrix shaped like its input
	        and a scale of shape [out_features] (TODO confirm against
	        ``quantization.py``), these are [k, out_features, in_features] and
	        [k, out_features].

	    Notes:
	        - Used by the multi-component ternary linear forward pass.
	        - ``torch.stack`` rejects an empty list, so ``k`` must be at least 1.

	    References:
	        - BitNet paper: "BitNet: Scaling 1-bit Transformers for Large Language Models"
	        - JMLR paper: https://jmlr.org/papers/volume26/24-2050/24-2050.pdf
	    """
	    from .quantization import weight_to_ternary

	    components = []
	    scales = []

	    # Work on a copy; the remainder shrinks as components are peeled off.
	    remainder = W.clone()

	    for _ in range(k):
	        ternary, scale = weight_to_ternary(remainder, per_channel=True)
	        components.append(ternary)
	        scales.append(scale)

	        # scale is indexed per output row; add a trailing axis so it
	        # broadcasts across the in_features columns.
	        remainder = remainder - scale.unsqueeze(1) * ternary

	    return torch.stack(components, dim=0), torch.stack(scales, dim=0)



	def multi_ternary_linear_python(
	    x: torch.Tensor,
	    W_ternary: torch.Tensor,
	    gammas: torch.Tensor,
	    bias: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	    """
	    Forward pass for a multi-component ternary linear layer.

	    Sums k scaled linear transformations and adds the bias once:
	        output = sum_{i=1}^k (x @ W_i^T * gamma_i) + bias

	    Each term is computed by ``bitlinear_python`` with ``bias=None``.

	    Args:
	        x: Input tensor of shape [..., in_features]
	        W_ternary: Stacked weights of shape [k, out_features, in_features]
	        gammas: Scaling factors of shape [k, out_features]
	        bias: Optional bias tensor of shape [out_features]

	    Returns:
	        Output tensor of shape [..., out_features]
	    """
	    # The accumulator shape is derived from the operand shapes alone: the
	    # leading dims of x followed by out_features.
	    out_features = W_ternary.size(1)
	    total = torch.zeros(
	        (*x.shape[:-1], out_features), dtype=x.dtype, device=x.device
	    )

	    # One scaled matmul per component, accumulated out of place.
	    for idx in range(W_ternary.size(0)):
	        total = total + bitlinear_python(
	            x, W_ternary[idx], gammas[idx], bias=None
	        )

	    # The bias belongs to the layer as a whole, not to each component.
	    if bias is None:
	        return total
	    return total + bias


	def activation_quant(x: torch.Tensor, bits: int = 8) -> torch.Tensor:
	    """
	    Simulate symmetric absmax quantization of activations.

	    For every slice along the last dimension, the values are scaled by that
	    slice's maximum absolute value, rounded onto the integer grid
	    [-Q_max, Q_max] with Q_max = 2**(bits - 1) - 1, and scaled back. The
	    result is a float tensor holding the dequantized values; no integer
	    tensor is produced.

	    Args:
	        x: Input activations of shape [..., features].
	        bits: Number of bits for quantization (default: 8). Must be at
	            least 2 so that Q_max is positive.

	    Returns:
	        Tensor of the same shape as ``x`` containing the quantize-dequantize
	        result.

	    Raises:
	        ValueError: If ``bits`` is smaller than 2. With ``bits == 1`` the
	            grid collapses to Q_max == 0 and the dequantization step would
	            divide zero by zero, returning NaN.

	    Notes:
	        - NOTE(review): no straight-through estimator is applied here, and
	          the rounding step itself passes no gradient. Confirm that callers
	          which train through this function add one.
	    """
	    if bits < 2:
	        raise ValueError(
	            f"bits must be >= 2 for symmetric quantization, got {bits}"
	        )

	    # Symmetric integer range, e.g. [-127, 127] for 8 bits.
	    Q_max = 2 ** (bits - 1) - 1
	    Q_min = -Q_max

	    # Absmax over the last dimension, kept as a size-1 axis for broadcasting.
	    scale = torch.max(torch.abs(x), dim=-1, keepdim=True)[0]

	    # An all-zero slice would otherwise divide by zero.
	    scale = torch.clamp(scale, min=1e-5)

	    # Map into [-1, 1], then onto the integer grid.
	    x_normalized = x / scale
	    x_quant_int = torch.clamp(
	        torch.round(x_normalized * Q_max),
	        min=Q_min,
	        max=Q_max,
	    )

	    # Undo both scalings to return to the original value range.
	    x_quant = (x_quant_int / Q_max) * scale

	    return x_quant