diff --git a/.gitignore b/.gitignore
index 97e6a6f..ffd6da3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 *.jl.*.cov
 *.jl.mem
 /Manifest.toml
+.vscode
diff --git a/src/AlgebraicMultigrid.jl b/src/AlgebraicMultigrid.jl
index 1f91621..ea62549 100644
--- a/src/AlgebraicMultigrid.jl
+++ b/src/AlgebraicMultigrid.jl
@@ -9,7 +9,7 @@ using Printf
 @reexport import CommonSolve: solve, solve!, init
 using Reexport
 
-using LinearAlgebra: rmul!
+using LinearAlgebra: rmul!, qr
 
 include("utils.jl")
 export approximate_spectral_radius
diff --git a/src/aggregation.jl b/src/aggregation.jl
index 4258cdc..c5e4e65 100644
--- a/src/aggregation.jl
+++ b/src/aggregation.jl
@@ -1,5 +1,6 @@
-function smoothed_aggregation(A::TA, 
+function smoothed_aggregation(A::TA,
                         ::Type{Val{bs}}=Val{1};
+                        B = nothing,
                         symmetry = HermitianSymmetry(),
                         strength = SymmetricStrength(),
                         aggregate = StandardAggregation(),
@@ -12,10 +13,9 @@ function smoothed_aggregation(A::TA,
                         diagonal_dominance = false,
                         keep = false,
                         coarse_solver = Pinv, kwargs...) where {T,V,bs,TA<:SparseMatrixCSC{T,V}}
-
     n = size(A, 1)
-    # B = kron(ones(n, 1), eye(1))
-    B = ones(T,n)
+    B = isnothing(B) ? ones(T,n) : copy(B)
+    @assert size(A, 1) == size(B, 1)
 
     #=max_levels, max_coarse, strength =
         levelize_strength_or_aggregation(max_levels, max_coarse, strength)
@@ -70,7 +70,7 @@ function extend_hierarchy!(levels, strength, aggregate, smooth,
     # b = zeros(eltype(A), size(A, 1))
 
     # Improve candidates
-    b = zeros(size(A,1))
+    b = zeros(size(A,1),size(B,2))
     improve_candidates(A, B, b)
     T, B = fit_candidates(AggOp, B)
 
@@ -88,59 +88,39 @@ function extend_hierarchy!(levels, strength, aggregate, smooth,
 end
 construct_R(::HermitianSymmetry, P) = P'
 
-function fit_candidates(AggOp, B, tol = 1e-10)
-
+function fit_candidates(AggOp, B; tol=1e-10)
     A = adjoint(AggOp)
-    n_fine, n_coarse = size(A)
-    n_col = n_coarse
-
-    R = zeros(eltype(B), n_coarse)
-    Qx = zeros(eltype(B), nnz(A))
-    # copy!(Qx, B)
-    for i = 1:size(Qx, 1)
-        Qx[i] = B[i]
-    end
-    # copy!(A.nzval, B)
-    for i = 1:n_col
-        for j in nzrange(A,i)
-            row = A.rowval[j]
-            A.nzval[j] = B[row]
-        end
-    end
-    k = 1
-    for i = 1:n_col
-        norm_i = norm_col(A, Qx, i)
-        threshold_i = tol * norm_i
-        if norm_i > threshold_i
-            scale = 1 / norm_i
-            R[i] = norm_i
-        else
-            scale = 0
-            R[i] = 0
+    n_fine, m = ndims(B) == 1 ? (length(B), 1) : size(B)
+    n_fine2, n_agg = size(A)
+    @assert n_fine2 == n_fine
+    n_coarse = m * n_agg
+    T = eltype(B)
+    Qs = spzeros(T, n_fine, n_coarse)
+    R = zeros(T, n_coarse, m)
+
+    for agg in 1:n_agg
+        rows = A.rowval[A.colptr[agg]:A.colptr[agg+1]-1]
+        M = @view B[rows, :]     # size(rows) × m
+
+
+        F = qr(M)
+        r = min(length(rows), m)
+        Qfull = Matrix(F.Q)
+        Qj = Qfull[:, 1:r]
+        Rj = F.R
+
+        offset = (agg - 1) * m
+
+        for local_i in 1:length(rows), local_j in 1:r
+            val = Qj[local_i, local_j]
+            if abs(val) >= tol
+                Qs[rows[local_i], offset+local_j] = val
+            end
         end
-        for j in nzrange(A, i)
-            row = A.rowval[j]
-            # Qx[row] *= scale
-            #@show k
-            # Qx[k] *= scale
-            # k += 1
-            A.nzval[j] *= scale
-        end
-    end
+        dropzeros!(Qs)
 
-    # SparseMatrixCSC(size(A)..., A.colptr, A.rowval, Qx), R
-    A, R
-end
-function norm_col(A, Qx, i)
-    s = zero(eltype(A))
-    for j in nzrange(A, i)
-        if A.rowval[j] > length(Qx)
-            val = 1
-        else
-            val = Qx[A.rowval[j]]
-        end
-        # val = A.nzval[A.rowval[j]]
-        s += val*val
+        R[offset+1:offset+r, :] .= Rj[1:r, :]
     end
-    sqrt(s)
+
+    return Qs, R
 end
diff --git a/src/classical.jl b/src/classical.jl
index 8925471..3b4abc2 100644
--- a/src/classical.jl
+++ b/src/classical.jl
@@ -9,6 +9,10 @@ function ruge_stuben(_A::Union{TA, Symmetric{Ti, TA}, Hermitian{Ti, TA}},
                 max_coarse = 10,
                 coarse_solver = Pinv, kwargs...) where {Ti,Tv,bs,TA<:SparseMatrixCSC{Ti,Tv}}
 
+    
+    # fails if near null space `B` is provided
+    haskey(kwargs, :B) && kwargs[:B] !== nothing && error("near null space `B` is only supported for smoothed aggregation AMG, not Ruge-Stüben AMG.")
+                
     if _A isa Symmetric && Ti <: Real || _A isa Hermitian
         A = _A.data
         symmetric = true
diff --git a/src/multilevel.jl b/src/multilevel.jl
index 40b7b15..18146e0 100644
--- a/src/multilevel.jl
+++ b/src/multilevel.jl
@@ -187,7 +187,13 @@ Keyword Arguments
 * maxiter::Int64 - maximum number of iterations to execute
 * verbose::Bool - display residual at each iteration
 * log::Bool - return vector of residuals along with solution
+* B::AbstractArray - the **near null space** in SA-AMG, which represents the low energy that cannot be attenuated by relaxtion, and thus needs to be perserved across the coarse grid. 
 
+!!! note
+    `B` can be:
+        - a `Vector` (e.g., for scalar PDEs),
+        - a `Matrix` (e.g., for vector PDEs or systems with multiple equations),
+    If `B` is not provided, it defaults to a vector of ones.
 """
 function _solve(ml::MultiLevel, b::AbstractArray, args...; kwargs...)
     n = length(ml) == 1 ? size(ml.final_A, 1) : size(ml.levels[1].A, 1)
@@ -296,9 +302,9 @@ end
 function init(::RugeStubenAMG, A, b, args...; kwargs...)
     AMGSolver(ruge_stuben(A; kwargs...), b)
 end
-function init(::SmoothedAggregationAMG, A, b; kwargs...)
+function init(sa::SmoothedAggregationAMG, A, b; kwargs...)
     AMGSolver(smoothed_aggregation(A; kwargs...), b)
 end
 function solve!(solt::AMGSolver, args...; kwargs...) 
     _solve(solt.ml, solt.b, args...; kwargs...)   
-end
\ No newline at end of file
+end
diff --git a/src/smoother.jl b/src/smoother.jl
index fdd6657..829c6ec 100644
--- a/src/smoother.jl
+++ b/src/smoother.jl
@@ -29,6 +29,7 @@ end
 function gs!(A, b, x, start, step, stop)
     n = size(A, 1)
     z = zero(eltype(A))
+    @assert size(x,2) == size(b, 2) "x and b must have the same number of columns"
     @inbounds for col in 1:size(x, 2)
         for i in start:step:stop
             rsum = z
diff --git a/test/lin_elastic_2d.jld2 b/test/lin_elastic_2d.jld2
new file mode 100644
index 0000000..02e07b9
Binary files /dev/null and b/test/lin_elastic_2d.jld2 differ
diff --git a/test/nns_test.jl b/test/nns_test.jl
new file mode 100644
index 0000000..3e86d2d
--- /dev/null
+++ b/test/nns_test.jl
@@ -0,0 +1,252 @@
+## Test `B` as an argument for `SmoothedAggregationAMG`
+@testset "Different `B` as an argument for `SmoothedAggregationAMG` " begin
+    A = poisson(100); 
+    b = rand(100);
+    n = size(A,1)
+    T = eltype(A)
+
+    #1. pass `B`` as nothing (Default case)
+    x_nothing = solve(A, b, SmoothedAggregationAMG(), maxiter = 1, abstol = 1e-6)
+
+    #2. pass `B` as vector
+    B = ones(T,n)
+    x_vec = solve(A, b, SmoothedAggregationAMG(), maxiter = 1, abstol = 1e-6;B=B)
+    @test x_vec ≈ x_nothing
+
+    #3. pass `B` as matrix
+    B = ones(T,n,1)
+    x_mat = solve(A, b, SmoothedAggregationAMG(), maxiter = 1, abstol = 1e-6;B=B)
+    @test x_mat ≈ x_nothing
+end
+
+
+## Test QR factorization
+@testset "fit_candidates unit test cases" begin
+    cases = Vector{Tuple{SparseMatrixCSC{Float64,Int},Matrix{Float64}}}()
+
+    # 1. Aggregates include all dofs, one candidate
+    push!(cases, (
+        sparse([1, 2, 3, 4, 5], [1, 1, 1, 2, 2], ones(5), 5, 2),
+        ones(5, 1)
+    ))
+    push!(cases, (
+        sparse([1, 2, 3, 4, 5], [2, 2, 1, 1, 1], ones(5), 5, 2),
+        ones(5, 1)
+    ))
+    push!(cases, (
+        sparse([1, 2, 3, 4, 5, 6, 7, 8, 9],
+            repeat([1, 2, 3], inner=3),
+            ones(9), 9, 3),
+        ones(9, 1)
+    ))
+    push!(cases, (
+        sparse([1, 2, 3, 4, 5, 6, 7, 8, 9], [3, 2, 1, 1, 2, 3, 2, 1, 3],
+            ones(9), 9, 3),
+        reshape(Float64.(0:8), 9, 1)
+    ))
+
+    # 2. Aggregates include all dofs, two candidates
+    push!(cases, (
+        sparse([1, 2, 3, 4], [1, 1, 2, 2], ones(4), 4, 2),
+        hcat(ones(4), collect(0:3))
+    ))
+    push!(cases, (
+        sparse([1, 2, 3, 4, 5, 6, 7, 8, 9], repeat([1, 2, 3], inner=3), ones(9), 9, 3),
+        hcat(ones(9), collect(0:8))
+    ))
+    push!(cases, (
+        sparse([1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 1, 2, 2, 3, 3, 4, 4, 4], ones(9), 9, 4),
+        hcat(ones(9), collect(0:8))
+    ))
+
+    # 3. Small norms
+    push!(cases, (
+        sparse([1, 2, 3, 4], [1, 1, 2, 2], ones(4), 4, 2),
+        hcat(ones(4), 1e-20 .* collect(0:3))
+    ))
+    push!(cases, (
+        sparse([1, 2, 3, 4], [1, 1, 2, 2], ones(4), 4, 2),
+        1e-20 .* hcat(ones(4), collect(0:3))
+    ))
+
+    # Run tests
+    for (AggOp, fine) in cases
+        # mask dofs not in aggregation
+        for i in 1:size(AggOp, 1)
+            if nnz(AggOp[i, :]) == 0
+                fine[i, :] .= 0.0
+            end
+        end
+        Q, R = fit_candidates(AggOp |> adjoint, fine)
+        # fit exactly and via projection
+        @test fine ≈ Q * R
+        @test fine ≈ Q * (Q' * fine)
+    end
+end
+
+## Helper functions for cantilever frame beam ##
+
+# Element stiffness matrix
+function frame_element_stiffness(EA, EI, le)
+    l2 = le^2
+    l3 = le^3
+    Ke = zeros(6, 6)
+
+    # Axial (u-u)
+    Ke[1, 1] = EA / le
+    Ke[1, 4] = -EA / le
+    Ke[4, 1] = -EA / le
+    Ke[4, 4] = EA / le
+
+    # Bending (w, theta)
+    Kb = EI / l3 * [
+        12.0 6*le -12.0 6*le;
+        6*le 4*l2 -6*le 2*l2;
+        -12.0 -6*le 12.0 -6*le;
+        6*le 2*l2 -6*le 4*l2
+    ]
+    idx = [2, 3, 5, 6]
+    for i in 1:4, j in 1:4
+        Ke[idx[i], idx[j]] = Kb[i, j]
+    end
+    return Ke
+end
+
+
+function create_nns_frame(x_coords::Vector{Float64}, dofmap::Vector{Int})
+    # NNS matrix (3 columns)
+    # i = 1 (u) : 1 0 -y
+    # i = 2 (v) : 0 1 x
+    # i = 3 (θ) : 0 0 1
+    N = length(dofmap)
+    B = zeros(N, 3)  # 3 rigid body modes: x-translation, y-translation, rotation
+
+    for (i, dof) in enumerate(dofmap)
+        node = div(dof - 1, 3) + 1
+        offset = mod(dof - 1, 3)
+        x = x_coords[node]
+        y = 0.0  # 1D beam along x
+
+        if offset == 0      # u (x-translation)
+            B[i, 1] = 1.0              # translation in x
+            B[i, 3] = -y               # rotation effect on u (−y)
+        elseif offset == 1  # v (y-translation)
+            B[i, 2] = 1.0              # translation in y
+            B[i, 3] = x                # rotation effect on v (+x)
+        elseif offset == 2  # θ (rotation DOF)
+            B[i, 3] = 1.0              # direct contribution to rigid rotation
+        end
+    end
+
+    return B
+end
+
+function cantilever_beam(P, E, A, I, L, n_elem)
+    le = L / n_elem
+    n_nodes = n_elem + 1
+    dofs_per_node = 3  # u, w, theta
+    n_dofs = n_nodes * dofs_per_node
+
+    Ke = frame_element_stiffness(E * A, E * I, le)
+
+    # Assemble global stiffness matrix
+    row = Int[]
+    col = Int[]
+    val = Float64[]
+    for e in 1:n_elem
+        n1 = e
+        n2 = e + 1
+        dofmap = [
+            3 * n1 - 2,  # u₁
+            3 * n1 - 1,  # w₁
+            3 * n1,      # θ₁
+            3 * n2 - 2,  # u₂
+            3 * n2 - 1,  # w₂
+            3 * n2       # θ₂
+        ]
+        for i in 1:6, j in 1:6
+            push!(row, dofmap[i])
+            push!(col, dofmap[j])
+            push!(val, Ke[i, j])
+        end
+    end
+    A = sparse(row, col, val, n_dofs, n_dofs)
+    # rhs
+    b = zeros(n_dofs)
+    force_dof = 3 * (n_nodes - 1) + 2  # w at last node
+    b[force_dof] = P # Apply downward force at the last node
+    # Boundary conditions: clamp left end
+    fixed_dofs = [1, 2, 3]  # u₁, w₁, θ₁
+    free_dofs = setdiff(1:n_dofs, fixed_dofs)
+    A_free = A[free_dofs, free_dofs]
+    b_free = b[free_dofs]
+
+    # x-coordinates of nodes
+    x_coords = [le * (i - 1) for i in 1:n_nodes]
+    B = create_nns_frame(x_coords, free_dofs)
+
+    return A_free, b_free, B
+end
+
+@testset "Mechanics test cases" begin
+    @testset "Linear elasticity 2D" begin
+        # load linear elasticity test data
+        @load "lin_elastic_2d.jld2" A b B
+        A = SparseMatrixCSC(A.m, A.n, A.colptr, A.rowval, A.nzval)
+
+        x_nns, residuals_nns = solve(A, b, SmoothedAggregationAMG(), log=true, reltol=1e-10;B=B)
+        x_wonns, residuals_wonns = solve(A, b, SmoothedAggregationAMG(), log=true, reltol=1e-10)
+
+        println("No NNS: final residual at iteration ", length(residuals_wonns), ": ", residuals_wonns[end])
+        println("With NNS: final residual at iteration ", length(residuals_nns), ": ", residuals_nns[end])
+
+
+        #test QR factorization on linear elasticity
+        aggregate = StandardAggregation()
+        AggOp = aggregate(A)
+        Q, R = fit_candidates(AggOp, B)
+        # fit exactly and via projection
+        @test B ≈ Q * R
+        @test B ≈ Q * (Q' * B)
+
+        # Check convergence
+        @test !(A * x_wonns ≈ b)
+        @test A * x_nns ≈ b
+
+    end
+
+    @testset "fit_candidates on cantilever frame beam" begin
+        # Beam parameters
+        P = -1000.0    # Applied force at the end of the beam
+        n_elem = 10
+        E = 210e9      # Young's modulus
+        A = 1e-4       # Cross-section area (for axial)
+        I = 1e-6       # Moment of inertia (for bending)
+        L = 1.0        # Total length
+        A, b, B = cantilever_beam(P, E, A, I, L, n_elem)
+        # test solution
+        # Analaytical solution for cantilever beam
+        u = A \ b
+        @test u[end-1] ≈ (P * L^3) / (3 * E * I) # vertical disp. at the end of the beam
+
+
+        x_nns, residuals_nns = solve(A, b, SmoothedAggregationAMG(), log=true, reltol=1e-10,B=B)
+        x_wonns, residuals_wonns = solve(A, b, SmoothedAggregationAMG(), log=true, reltol=1e-10)
+
+        println("No NNS: final residual at iteration ", length(residuals_wonns), ": ", residuals_wonns[end])
+        println("With NNS: final residual at iteration ", length(residuals_nns), ": ", residuals_nns[end])
+
+
+        # test QR factorization on bending beam
+        # Aggregation
+        aggregate = StandardAggregation()
+        AggOp = aggregate(A)
+        Q, R = fit_candidates(AggOp, B)
+        @test B ≈ Q * R
+        @test B ≈ Q * (Q' * B)
+
+        # Check convergence
+        @test !(A * x_wonns ≈ b)
+        @test A * x_nns ≈ b
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 0adca40..543084c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,6 +9,7 @@ using Random: seed!
 
 include("sa_tests.jl")
 include("cycle_tests.jl")
+include("nns_test.jl")
 
 @testset "AlgebraicMultigrid Tests" begin
 
@@ -347,7 +348,6 @@ for sz in [ (10,10), (20,20), (50,50)]
     strategy = KrylovJL_CG(precs = RugeStubenPreconBuilder())
     @test solve(prob, strategy, atol=1.0e-14) ≈ u0 rtol = 1.0e-8
 
-
     strategy = KrylovJL_CG(precs = SmoothedAggregationPreconBuilder())
     @test solve(prob, strategy, atol=1.0e-14) ≈ u0 rtol = 1.0e-8