Examples
This page provides detailed examples of using HPCSparseArrays.jl for various distributed sparse matrix operations.
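Each example below is a self-contained MPI program. To run one on several ranks, use any MPI launcher, for example MPI.jl's mpiexecjl wrapper: mpiexecjl -n 4 julia --project example.jl (the file name here is just a placeholder). The println calls write to io0() so that output is not repeated on every rank.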
Matrix Multiplication
Square Matrices
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
# Create a tridiagonal matrix (same on all ranks)
n = 100
I = [1:n; 1:n-1; 2:n]
J = [1:n; 2:n; 1:n-1]
V = [2.0*ones(n); -0.5*ones(n-1); -0.5*ones(n-1)]
A = sparse(I, J, V, n, n)
# Create another tridiagonal matrix
V2 = [1.5*ones(n); 0.25*ones(n-1); 0.25*ones(n-1)]
B = sparse(I, J, V2, n, n)
# Distribute matrices
Adist = HPCSparseMatrix(A, backend)
Bdist = HPCSparseMatrix(B, backend)
# Multiply
Cdist = Adist * Bdist
# Verify against reference
C_ref = A * B
C_ref_dist = HPCSparseMatrix(C_ref, backend)
err = norm(Cdist - C_ref_dist, Inf)
println(io0(), "Multiplication error: $err")
Non-Square Matrices
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
# A is 6x8, B is 8x10, result is 6x10
m, k, n = 6, 8, 10
I_A = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4]
J_A = [1, 2, 3, 4, 5, 6, 7, 8, 1, 2]
V_A = Float64.(1:length(I_A))
A = sparse(I_A, J_A, V_A, m, k)
I_B = [1, 2, 3, 4, 5, 6, 7, 8, 1, 3]
J_B = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
V_B = Float64.(1:length(I_B))
B = sparse(I_B, J_B, V_B, k, n)
Adist = HPCSparseMatrix(A, backend)
Bdist = HPCSparseMatrix(B, backend)
Cdist = Adist * Bdist
@assert size(Cdist) == (m, n)
println(io0(), "Result size: $(size(Cdist))")
Complex Matrices
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
n = 8
I = [1:n; 1:n-1; 2:n]
J = [1:n; 2:n; 1:n-1]
# Complex values
V_A = ComplexF64.([2.0*ones(n); -0.5*ones(n-1); -0.5*ones(n-1)]) .+
im .* ComplexF64.([0.1*ones(n); 0.2*ones(n-1); -0.2*ones(n-1)])
A = sparse(I, J, V_A, n, n)
V_B = ComplexF64.([1.5*ones(n); 0.25*ones(n-1); 0.25*ones(n-1)]) .+
im .* ComplexF64.([-0.1*ones(n); 0.1*ones(n-1); 0.1*ones(n-1)])
B = sparse(I, J, V_B, n, n)
Adist = HPCSparseMatrix(A, backend)
Bdist = HPCSparseMatrix(B, backend)
# Multiplication
Cdist = Adist * Bdist
# Conjugate
Aconj = conj(Adist)
# Adjoint (conjugate transpose) - returns a lazy wrapper
Aadj = Adist'
# Using adjoint in multiplication (materializes automatically)
result = Aadj * Bdist
println(io0(), "Complex matrix operations completed")
Addition and Subtraction
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
n = 8
# Matrices with different sparsity patterns
# Matrix A: upper triangular entries
I_A = [1, 1, 2, 3, 4, 5, 6, 7, 8]
J_A = [1, 2, 2, 3, 4, 5, 6, 7, 8]
V_A = Float64.(1:9)
A = sparse(I_A, J_A, V_A, n, n)
# Matrix B: lower triangular entries
I_B = [1, 2, 2, 3, 4, 5, 6, 7, 8]
J_B = [1, 1, 2, 3, 4, 5, 6, 7, 8]
V_B = Float64.(9:-1:1)
B = sparse(I_B, J_B, V_B, n, n)
Adist = HPCSparseMatrix(A, backend)
Bdist = HPCSparseMatrix(B, backend)
# Addition - handles different sparsity patterns
Cdist = Adist + Bdist
# Subtraction
Ddist = Adist - Bdist
# Verify
C_ref_dist = HPCSparseMatrix(A + B, backend)
err = norm(Cdist - C_ref_dist, Inf)
println(io0(), "Addition error: $err")
Transpose Operations
Lazy Transpose
The transpose function creates a lazy wrapper without transposing the data. This is efficient because the actual transpose is only computed when needed:
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
m, n = 8, 6
I_C = [1, 2, 3, 4, 5, 6, 7, 8, 1, 3]
J_C = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4]
V_C = Float64.(1:length(I_C))
C = sparse(I_C, J_C, V_C, m, n)
I_D = [1, 2, 3, 4, 5, 6, 1, 2]
J_D = [1, 2, 3, 4, 5, 6, 7, 8]
V_D = Float64.(1:length(I_D))
D = sparse(I_D, J_D, V_D, n, m)
Cdist = HPCSparseMatrix(C, backend)
Ddist = HPCSparseMatrix(D, backend)
# transpose(C) * transpose(D) = transpose(D * C)
# This is computed efficiently without explicitly transposing
result = transpose(Cdist) * transpose(Ddist)
println(io0(), "Lazy transpose multiplication completed")
Transpose in Multiplication
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
# A is 8x6, so A' is 6x8
# B is 8x10, so A' * B is 6x10
m, n, p = 8, 6, 10
I_A = [1, 2, 3, 4, 5, 6, 7, 8, 1, 3, 5, 7]
J_A = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]
V_A = Float64.(1:length(I_A))
A = sparse(I_A, J_A, V_A, m, n)
I_B = [1, 2, 3, 4, 5, 6, 7, 8, 1, 3, 5, 7]
J_B = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2]
V_B = Float64.(1:length(I_B))
B = sparse(I_B, J_B, V_B, m, p)
Adist = HPCSparseMatrix(A, backend)
Bdist = HPCSparseMatrix(B, backend)
# transpose(A) * B - the lazy transpose is materialized automatically during the multiplication
result_dist = transpose(Adist) * Bdist
# Verify
ref = sparse(A') * B
ref_dist = HPCSparseMatrix(ref, backend)
err = norm(result_dist - ref_dist, Inf)
println(io0(), "transpose(A) * B error: $err")
Scalar Multiplication
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
m, n = 6, 8
I = [1, 2, 3, 4, 5, 6, 1, 3]
J = [1, 2, 3, 4, 5, 6, 7, 8]
V = Float64.(1:length(I))
A = sparse(I, J, V, m, n)
Adist = HPCSparseMatrix(A, backend)
# Scalar times matrix
a = 2.5
result1 = a * Adist
result2 = Adist * a # Equivalent
# Scalar times lazy transpose
At = transpose(Adist)
result3 = a * At # Returns transpose(a * A)
# Verify
ref_dist = HPCSparseMatrix(a * A, backend)
err1 = norm(result1 - ref_dist, Inf)
err2 = norm(result2 - ref_dist, Inf)
println(io0(), "Scalar multiplication errors: $err1, $err2")
Computing Norms
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
m, n = 6, 8
I = [1, 2, 3, 4, 5, 6, 1, 3, 2, 4]
J = [1, 2, 3, 4, 5, 6, 7, 8, 1, 3]
V = Float64.(1:length(I))
A = sparse(I, J, V, m, n)
Adist = HPCSparseMatrix(A, backend)
# Element-wise norms (treating matrix as vector)
frob_norm = norm(Adist) # Frobenius (2-norm)
one_norm = norm(Adist, 1) # Sum of absolute values
inf_norm = norm(Adist, Inf) # Max absolute value
p_norm = norm(Adist, 3) # General p-norm
# Operator norms
op_1 = opnorm(Adist, 1) # Max absolute column sum
op_inf = opnorm(Adist, Inf) # Max absolute row sum
println(io0(), "Frobenius norm: $frob_norm")
println(io0(), "1-norm: $one_norm")
println(io0(), "Inf-norm: $inf_norm")
println(io0(), "3-norm: $p_norm")
println(io0(), "Operator 1-norm: $op_1")
println(io0(), "Operator Inf-norm: $op_inf")
Iterative Methods Example
Power iteration for the dominant eigenvalue relies on repeated matrix-vector products, while HPCSparseArrays currently focuses on matrix-matrix products. This example therefore uses A*A to illustrate a related computation:
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
# Create a symmetric positive definite matrix
n = 100
I = [1:n; 1:n-1; 2:n]
J = [1:n; 2:n; 1:n-1]
V = [4.0*ones(n); -ones(n-1); -ones(n-1)]
A = sparse(I, J, V, n, n)
Adist = HPCSparseMatrix(A, backend)
# For power iteration, we need matrix-vector products
# Currently HPCSparseArrays focuses on matrix-matrix products
# This example shows how to use A*A for related computations
# Compute A^2
A2dist = Adist * Adist
# Compute the Frobenius norm of A^2
norm_A2 = norm(A2dist)
println(io0(), "||A^2||_F = $norm_A2")
# For symmetric A, ||A^2||_F = sqrt(sum_i λ_i^4), so this norm is dominated by the largest eigenvalues
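Even restricted to matrix-matrix products, the dominant eigenvalue of a symmetric matrix can be bracketed by repeated squaring: ||A^m||_F^(1/m) lies between λ_max and n^(1/(2m)) * λ_max, and the bound tightens as m grows. A minimal sketch continuing the variables above (with m = 16 the overestimate is at most a factor of 100^(1/32) ≈ 1.15):
# Continue squaring: A^4, A^8, A^16 need only three more products
A4dist = A2dist * A2dist
A8dist = A4dist * A4dist
A16dist = A8dist * A8dist
# λ_max ≤ ||A^16||_F^(1/16) ≤ n^(1/32) * λ_max for symmetric A
lambda_upper = norm(A16dist)^(1/16)
println(io0(), "Dominant eigenvalue bound: $lambda_upper")
# Exact value for this tridiagonal matrix: 4 + 2*cos(pi/(n+1)) ≈ 5.999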
Solving Linear Systems
HPCSparseArrays provides distributed sparse direct solvers using the multifrontal method.
LDLT Factorization (Symmetric Matrices)
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
# Create a symmetric positive definite tridiagonal matrix
n = 100
I = [1:n; 1:n-1; 2:n]
J = [1:n; 2:n; 1:n-1]
V = [4.0*ones(n); -ones(n-1); -ones(n-1)]
A = sparse(I, J, V, n, n)
# Distribute the matrix
Adist = HPCSparseMatrix(A, backend)
# Compute LDLT factorization
F = ldlt(Adist)
# Create right-hand side
b = HPCVector(ones(n), backend)
# Solve Ax = b
x = solve(F, b)
# Or use backslash syntax
x = F \ b
# Verify solution
x_full = Vector(x)
residual = norm(A * x_full - ones(n), Inf)
println(io0(), "LDLT solve residual: $residual")
LU Factorization (General Matrices)
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
# Create a general (non-symmetric) tridiagonal matrix
n = 100
I = [1:n; 1:n-1; 2:n]
J = [1:n; 2:n; 1:n-1]
V = [2.0*ones(n); -0.5*ones(n-1); -0.8*ones(n-1)] # Non-symmetric
A = sparse(I, J, V, n, n)
# Distribute and factorize
Adist = HPCSparseMatrix(A, backend)
F = lu(Adist)
# Solve
b = HPCVector(ones(n), backend)
x = solve(F, b)
# Verify
x_full = Vector(x)
residual = norm(A * x_full - ones(n), Inf)
println(io0(), "LU solve residual: $residual")
Symmetric Indefinite Matrices
LDLT uses Bunch-Kaufman pivoting to handle symmetric indefinite matrices:
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
# Symmetric indefinite matrix (alternating signs on diagonal)
n = 50
I = [1:n; 1:n-1; 2:n]
J = [1:n; 2:n; 1:n-1]
diag_vals = [(-1.0)^i * 2.0 for i in 1:n] # Alternating signs
V = [diag_vals; -ones(n-1); -ones(n-1)]
A = sparse(I, J, V, n, n)
Adist = HPCSparseMatrix(A, backend)
F = ldlt(Adist)
b = HPCVector(collect(1.0:n), backend)
x = solve(F, b)
x_full = Vector(x)
residual = norm(A * x_full - collect(1.0:n), Inf)
println(io0(), "Indefinite LDLT residual: $residual")
Reusing Symbolic Factorization
For sequences of matrices with the same sparsity pattern, the symbolic factorization is cached and reused:
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
n = 100
I = [1:n; 1:n-1; 2:n]
J = [1:n; 2:n; 1:n-1]
# First matrix
V1 = [4.0*ones(n); -ones(n-1); -ones(n-1)]
A1 = sparse(I, J, V1, n, n)
A1dist = HPCSparseMatrix(A1, backend)
# First factorization - computes symbolic phase
F1 = ldlt(A1dist; reuse_symbolic=true)
# Second matrix - same structure, different values
V2 = [8.0*ones(n); -2.0*ones(n-1); -2.0*ones(n-1)]
A2 = sparse(I, J, V2, n, n)
A2dist = HPCSparseMatrix(A2, backend)
# Second factorization - reuses cached symbolic phase (faster)
F2 = ldlt(A2dist; reuse_symbolic=true)
# Both factorizations work
b = HPCVector(ones(n), backend)
x1 = solve(F1, b)
x2 = solve(F2, b)
x1_full = Vector(x1)
x2_full = Vector(x2)
println(io0(), "F1 residual: ", norm(A1 * x1_full - ones(n), Inf))
println(io0(), "F2 residual: ", norm(A2 * x2_full - ones(n), Inf))
Plan Caching and Management
using MPI
MPI.Init()
using HPCSparseArrays
using SparseArrays
backend = BACKEND_CPU_MPI
n = 100
A = spdiagm(0 => 2.0*ones(n), 1 => -ones(n-1), -1 => -ones(n-1))
B = spdiagm(0 => 1.5*ones(n), 1 => 0.5*ones(n-1), -1 => 0.5*ones(n-1))
Adist = HPCSparseMatrix(A, backend)
Bdist = HPCSparseMatrix(B, backend)
# First multiplication - creates and caches the plan
C1 = Adist * Bdist
# Second multiplication - reuses cached plan (faster)
C2 = Adist * Bdist
# Third multiplication - still uses cached plan
C3 = Adist * Bdist
# Clear caches when done to free memory
clear_plan_cache!() # Clears all caches including factorization
# Or clear specific caches:
# clear_symbolic_cache!() # Symbolic factorizations only
# clear_factorization_plan_cache!() # Factorization plans only
println(io0(), "Cached multiplication completed")
Dense Matrix Operations with mapslices
The mapslices function applies a function to each row or column of a distributed dense matrix. This is useful for computing row-wise or column-wise statistics.
Row-wise Operations (dims=2)
Row-wise operations are local - no MPI communication is needed since rows are already distributed:
using MPI
MPI.Init()
using HPCSparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
# Create a deterministic dense matrix (same on all ranks)
m, n = 100, 10
A_global = Float64.([i + 0.1*j for i in 1:m, j in 1:n])
# Distribute
Adist = HPCMatrix(A_global, backend)
# Compute row statistics: for each row, compute [norm, max, sum]
# This transforms 100×10 matrix to 100×3 matrix
row_stats = mapslices(x -> [norm(x), maximum(x), sum(x)], Adist; dims=2)
println(io0(), "Row statistics shape: $(size(row_stats))") # (100, 3)
Column-wise Operations (dims=1)
Column-wise operations require MPI communication to gather each full column:
using MPI
MPI.Init()
using HPCSparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
# Create a deterministic dense matrix
m, n = 100, 10
A_global = Float64.([i + 0.1*j for i in 1:m, j in 1:n])
Adist = HPCMatrix(A_global, backend)
# Compute column statistics: for each column, compute [norm, max]
# This transforms 100×10 matrix to 2×10 matrix
col_stats = mapslices(x -> [norm(x), maximum(x)], Adist; dims=1)
println(io0(), "Column statistics shape: $(size(col_stats))") # (2, 10)
Use Case: Replacing vcat(f.(eachrow(A))...)
The standard Julia pattern vcat(f.(eachrow(A))...) doesn't work with distributed matrices because the type information is lost after broadcasting. Use mapslices instead:
using MPI
MPI.Init()
using HPCSparseArrays
using LinearAlgebra
backend = BACKEND_CPU_MPI
# Standard Julia pattern (for comparison):
# A = randn(5, 2)
# f(x) = transpose([norm(x), maximum(x)])
# B = vcat(f.(eachrow(A))...)
# MPI-compatible equivalent:
A_global = Float64.([i + 0.1*j for i in 1:100, j in 1:10])
Adist = HPCMatrix(A_global, backend)
# Use mapslices with dims=2 to apply function to each row
# The function returns a vector, which becomes a row in the result
g(x) = [norm(x), maximum(x)]
Bdist = mapslices(g, Adist; dims=2)
println(io0(), "Result: $(size(Bdist))") # (100, 2)