
Commit e082fed

Merge pull request #34 from NVIDIA/release/v0.5.0
nvmath-python-0.5.0 release
2 parents (80a63cd + 64ccc98), commit e082fed


165 files changed: 12,122 additions and 873 deletions.


.gitignore

Lines changed: 5 additions & 0 deletions
@@ -13,6 +13,9 @@ dist
 build
 wheelhouse
 
+#Mac system files
+.DS_Store
+
 #Test
 .pytest_cache
 *.pickle
@@ -23,3 +26,5 @@ wheelhouse
 Pipfile*
 .venv*
 .pipenv-cache
+
+.mypy_cache

builder/utils.py

Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,7 @@ def check_path(header):
 
 def decide_lib_name(ext_name):
     # TODO: move the record of the supported lib list elsewhere?
-    for lib in ("cublas", "cusolver", "cufftMp", "cufft", "cusparse", "curand", "nvpl", "nvshmem", "mathdx"):
+    for lib in ("cublas", "cusolver", "cufftMp", "cufft", "cusparse", "curand", "nvpl", "nvshmem", "mathdx", "cudss"):
         if lib in ext_name:
             return lib
     else:
@@ -105,7 +105,7 @@ def _prep_includes_libs_rpaths(self, lib_name):
             ldflag += f",-rpath,$ORIGIN/../../../../{lib_name}/lib:$ORIGIN/../../../../../../"
         case "cufftMp":
             ldflag += ",-rpath,$ORIGIN/../../../nvidia/cufftmp/cu12/lib"
-        case "mathdx":
+        case "mathdx" | "cudss":
             ldflag += ",-rpath,$ORIGIN/../../../nvidia/cu12/lib"
         case _:
             ldflag += f",-rpath,$ORIGIN/../../../nvidia/{lib_name}/lib"

docs/sphinx/_static/switcher.json

Lines changed: 4 additions & 0 deletions
@@ -3,6 +3,10 @@
         "version": "latest",
         "url": "https://docs.nvidia.com/cuda/nvmath-python/latest"
     },
+    {
+        "version": "0.5.0",
+        "url": "https://docs.nvidia.com/cuda/nvmath-python/0.5.0"
+    },
     {
         "version": "0.4.0",
         "url": "https://docs.nvidia.com/cuda/nvmath-python/0.4.0"

docs/sphinx/bindings/cudss.rst

Lines changed: 68 additions & 0 deletions
New file; full contents:

.. module:: nvmath.bindings.cudss

cuDSS (:mod:`nvmath.bindings.cudss`)
==========================================

For detailed documentation on the original C APIs, refer to the `cuDSS documentation
<https://docs.nvidia.com/cuda/cudss/>`_.

Enums and constants
*******************

.. autosummary::
   :toctree: generated/

   AlgType
   ConfigParam
   cuDSSError
   DataParam
   IndexBase
   Layout
   MatrixFormat
   MatrixType
   MatrixViewType
   OpType
   Phase
   PivotType
   Status

Functions
*********

.. autosummary::
   :toctree: generated/

   check_status
   config_create
   config_destroy
   config_get
   config_set
   create
   data_create
   data_destroy
   data_get
   data_set
   destroy
   execute
   get_config_param_dtype
   get_data_param_dtype
   get_device_mem_handler
   get_property
   matrix_create_batch_csr
   matrix_create_batch_dn
   matrix_create_csr
   matrix_create_dn
   matrix_destroy
   matrix_get_batch_csr
   matrix_get_batch_dn
   matrix_get_csr
   matrix_get_dn
   matrix_get_format
   matrix_set_batch_csr_pointers
   matrix_set_batch_values
   matrix_set_csr_pointers
   matrix_set_values
   set_comm_layer
   set_device_mem_handler
   set_stream
   set_threading_layer
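As a rough orientation only (not taken from the generated reference), these bindings are expected to mirror the cuDSS C handle lifecycle; the call signatures below are assumptions inferred from the C API names:

    from nvmath.bindings import cudss  # assumes nvmath-python >= 0.5.0

    # Hypothetical handle lifecycle; check the generated reference for the
    # actual signatures (these mirror cudssCreate/cudssConfigCreate/... in C).
    handle = cudss.create()
    config = cudss.config_create()
    data = cudss.data_create(handle)
    # ... create matrices with matrix_create_csr()/matrix_create_dn() and run
    # the analysis, factorization, and solve phases with execute() ...
    cudss.data_destroy(handle, data)
    cudss.config_destroy(config)
    cudss.destroy(handle)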

docs/sphinx/bindings/index.rst

Lines changed: 3 additions & 0 deletions
@@ -31,6 +31,8 @@ follows:
     - :mod:`nvmath.bindings.cublas`
   * - cuBLASLt
     - :mod:`nvmath.bindings.cublasLt`
+  * - cuDSS
+    - :mod:`nvmath.bindings.cudss`
   * - cuFFT
     - :mod:`nvmath.bindings.cufft`
   * - cuRAND
@@ -186,6 +188,7 @@ This reference describes all nvmath-python's math primitives.
 
    cublas
    cublasLt
+   cudss
    cufft
    cusolver
    cusolverDn

docs/sphinx/conf.py

Lines changed: 43 additions & 7 deletions
@@ -163,7 +163,7 @@
         "json_url": "/cuda/nvmath-python/latest/_static/switcher.json",
         "version_match": version,
     },
-    "navbar_start": ["navbar-logo", "version-switcher"],
+    "navbar_start": ["navbar-logo"],
 }
 html_show_sphinx = False
 
@@ -182,7 +182,7 @@
 
 # TODO: remove this once examples are published.
 linkcheck_ignore = [
-    "https://github.com/NVIDIA/nvmath-python/tree/main/examples/distributed/.*",
+    "https://github.com/NVIDIA/nvmath-python/tree/main/examples/sparse/.*",
 ]
 
 
@@ -193,6 +193,21 @@ def autodoc_process_docstring(app, what, name, obj, options, lines):
     # there's no way we can touch the docstrings of np.dtype objects, so we
     # need to do post-processing here
     if isinstance(obj, np.dtype):
+        docs = {}
+        from nvmath.sparse._internal.cudss_data_ifc import memory_estimates_dtype
+
+        # TODO: find better way to declare docs in the source code.
+        if obj == memory_estimates_dtype:
+            docs = {
+                "permanent_device_memory": "permanent device memory",
+                "peak_device_memory": "peak device memory",
+                "permanent_host_memory": "permanent host memory",
+                "peak_host_memory": "peak host memory",
+                "hybrid_min_device_memory": "(if in hybrid memory mode) minimum device memory for the hybrid memory mode",
+                "hybrid_max_device_memory": "(if in hybrid memory mode) maximum host memory for the hybrid memory mode",
+                "reserved": "reserved for future use",
+            }
+
         _, *mod, struct = name.split(".")
         mod = ".".join(mod)
         if mod == "bindings":
@@ -207,7 +222,8 @@ def autodoc_process_docstring(app, what, name, obj, options, lines):
             lines.append(line)
             lines.append("\n")
             for k in obj.fields:
-                lines.append(f":param {k}:\n")
+                docs_value = docs.get(k, "")
+                lines.append(f":param {k}: {docs_value}\n")
         else:
             match_numba_dtype = re.search(r"nvmath.device.float(\d+)x(\d+)_type", name)
             if match_numba_dtype:
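With this mapping in place, the post-processing appends a description for each field of the structured dtype. For memory_estimates_dtype the emitted docstring lines would look roughly like this (a sketch of the generated reST, not captured build output):

    :param permanent_device_memory: permanent device memory
    :param peak_device_memory: peak device memory
    :param permanent_host_memory: permanent host memory
    :param peak_host_memory: peak host memory
    :param hybrid_min_device_memory: (if in hybrid memory mode) minimum device memory for the hybrid memory mode
    :param hybrid_max_device_memory: (if in hybrid memory mode) maximum host memory for the hybrid memory mode
    :param reserved: reserved for future use

Fields of dtypes without an entry in the docs mapping keep an empty description, as before.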
@@ -302,6 +318,7 @@ def remove_notebook_copyright(self, app, docname, content):
 
 
 def setup(app):
+    fixup_internal_alias()
     app.add_css_file("nvmath_override.css")
     app.connect("autodoc-process-docstring", autodoc_process_docstring)
     app.connect("source-read", lambda *args, **kwargs: notebook_handler.remove_notebook_copyright(*args, **kwargs))
@@ -310,6 +327,16 @@ def setup(app):
     app.add_post_transform(UnqualifiedTitlesTransform)
 
 
+def fixup_internal_alias():
+    from nvmath.sparse._internal import cudss_config_ifc, cudss_data_ifc
+
+    cudss_config_ifc.PlanConfig.__name__ = "DirectSolverPlanConfig"
+    cudss_data_ifc.PlanInfo.__name__ = "DirectSolverPlanInfo"
+    cudss_config_ifc.FactorizationConfig.__name__ = "DirectSolverFactorizationConfig"
+    cudss_config_ifc.SolutionConfig.__name__ = "DirectSolverSolutionConfig"
+    cudss_data_ifc.FactorizationInfo.__name__ = "DirectSolverFactorizationInfo"
+
+
 # -- Other options -------------------------------------------------
 
 autosummary_generate = True
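The fixup_internal_alias() hook is a small rename trick: it points the internal classes' __name__ at the public DirectSolver* aliases, presumably so that tools reading __name__ (such as the doc generator) label the objects with the public names; that purpose is an inference from the diff, not stated in the commit. A toy illustration of the pattern (the class here is made up, not the nvmath internals):

    class PlanConfig:               # internal implementation name
        """Plan-stage configuration."""

    PlanConfig.__name__ = "DirectSolverPlanConfig"   # name seen by tools that read __name__
    print(PlanConfig.__name__)      # -> DirectSolverPlanConfig
    print(PlanConfig.__qualname__)  # unchanged unless also reassigned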
@@ -323,12 +350,21 @@ def setup(app):
 }
 
 intersphinx_mapping = {
-    "python": ("https://docs.python.org/3/", None),
-    "numpy": ("https://numpy.org/doc/stable/", None),
+    "cublas": ("https://docs.nvidia.com/cuda/cublas/", None),
+    "cuda-bindings": ("https://nvidia.github.io/cuda-python/cuda-bindings/", None),
+    "cuda-core": ("https://nvidia.github.io/cuda-python/cuda-core/", None),
+    "cudss": ("https://docs.nvidia.com/cuda/cudss/", None),
+    "cufft": ("https://docs.nvidia.com/cuda/cufft/", None),
     "cupy": ("https://docs.cupy.dev/en/stable/", None),
-    "torch": ("https://pytorch.org/docs/stable/", None),
+    # curand is not using sphinx yet - June, 2025
+    # "curand": ("https://docs.nvidia.com/cuda/curand/", None),
+    "cusolver": ("https://docs.nvidia.com/cuda/cusolver/", None),
+    "cusparse": ("https://docs.nvidia.com/cuda/cusparse/", None),
     "numba": ("https://numba.readthedocs.io/en/stable/", None),
-    "cufft": ("https://docs.nvidia.com/cuda/cufft/", None),
+    "numpy": ("https://numpy.org/doc/stable/", None),
+    "python": ("https://docs.python.org/3/", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy/", None),
+    "torch": ("https://docs.pytorch.org/docs/stable/", None),
 }
 
 napoleon_google_docstring = True

docs/sphinx/host-apis/index.rst

Lines changed: 1 addition & 0 deletions
@@ -16,5 +16,6 @@ Contents
    :maxdepth: 2
 
    Linear Algebra <linalg/index.rst>
+   Sparse Linear Algebra <sparse/index.rst>
   Fast Fourier Transform <fft/index.rst>
   Host API Utilities <utils.rst>
docs/sphinx/host-apis/sparse/index.rst

Lines changed: 52 additions & 0 deletions

New file; full contents:

*********************
Sparse Linear Algebra
*********************

.. _sparse-overview:

Overview
========

The sparse linear algebra module :mod:`nvmath.sparse` in nvmath-python leverages various
NVIDIA math libraries to support sparse linear algebra computations. As of the current Beta
release, we offer the specialized sparse direct solver API based on the `cuDSS library
<https://docs.nvidia.com/cuda/cudss/>`_.

.. _sparse-api-reference:

API Reference
=============

.. module:: nvmath.sparse

Generic Linear Algebra APIs (:mod:`nvmath.sparse`)
--------------------------------------------------

Generic APIs will be available in a later release.

.. module:: nvmath.sparse.advanced

Specialized Linear Algebra APIs (:mod:`nvmath.sparse.advanced`)
---------------------------------------------------------------

.. autosummary::
   :toctree: generated/

   direct_solver
   DirectSolver
   DirectSolverFactorizationConfig
   DirectSolverFactorizationInfo
   DirectSolverPlanConfig
   DirectSolverPlanInfo
   DirectSolverSolutionConfig
   memory_estimates_dtype

   :template: dataclass.rst

   DirectSolverAlgType
   DirectSolverMatrixType
   DirectSolverMatrixViewType
   DirectSolverOptions
   ExecutionCUDA
   ExecutionHybrid
   HybridMemoryModeOptions
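For orientation, a minimal usage sketch of the new specialized API follows; the call pattern (CSR input, dense right-hand side, positional `(a, b)` arguments) is an assumption based on the names listed above rather than on published examples, and running it requires a CUDA-capable GPU:

    import numpy as np
    import scipy.sparse as sp

    import nvmath

    # Build a small sparse system A x = b on the CPU in CSR format.
    n = 8
    a = sp.identity(n, format="csr", dtype=np.float64) * 4.0
    b = np.ones(n, dtype=np.float64)

    # Hypothetical call pattern for the cuDSS-backed sparse direct solver.
    x = nvmath.sparse.advanced.direct_solver(a, b)
    print(np.allclose(a @ x, b))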

docs/sphinx/installation.rst

Lines changed: 14 additions & 20 deletions
@@ -289,19 +289,19 @@ dependency is *required* unless stated otherwise.
      - 525.60.13+ (Linux) with CUDA 12.x
      - 525.60.13+ (Linux) with CUDA 12.x
    * - Python
-     - 3.10-3.12
-     - 3.10-3.12
-     - 3.10-3.12
-     - 3.10-3.12
-     - 3.10-3.12
+     - 3.10-3.13
+     - 3.10-3.13
+     - 3.10-3.13
+     - 3.10-3.13
+     - 3.10-3.13
    * - pip
      - 22.3.1+
      -
      -
      -
      -
    * - setuptools
-     - >=70.0.0
+     - >=77.0.3
      -
      -
      -
@@ -313,7 +313,7 @@ dependency is *required* unless stated otherwise.
      -
      -
    * - Cython
-     - >=0.29.22,<3
+     - >=3.0.4,<3.1
      -
      -
      -
@@ -330,10 +330,10 @@ dependency is *required* unless stated otherwise.
      - CUDA 12.x
    * - NumPy
      -
-     - >=1.24
-     - >=1.24
-     - >=1.24
-     - >=1.24
+     - >=1.25
+     - >=1.25
+     - >=1.25
+     - >=1.25
    * - | CuPy
        | (see `CuPy installation guide <https://docs.cupy.dev/en/stable/install.html>`_)
      -
@@ -351,7 +351,7 @@ dependency is *required* unless stated otherwise.
    * - libmathdx (cuBLASDx, cuFFTDx, ...)
      -
      -
-     - 0.2.*
+     - >=0.2.1,<0.3
      -
      -
    * - numba-cuda
@@ -360,12 +360,6 @@ dependency is *required* unless stated otherwise.
      - >=0.11.0
      - >=0.11.0
      -
-   * - Numba
-     -
-     -
-     - >=0.59.1
-     - >=0.59.1
-     -
    * - pynvjitlink
      -
      -
@@ -404,7 +398,7 @@ nvmath-python is tested in the following environments:
    * - GPU model
      - H100, B200, RTX 4090, CG1 (Grace-Hopper)
    * - Python
-     - 3.10, 3.11, 3.12
+     - 3.10, 3.11, 3.12, 3.13
    * - CPU architecture
      - x86_64, aarch64
    * - Operating system
@@ -626,7 +620,7 @@ For more information with regard to the new CUDA 12+ package layout on conda-for
 .. [2] nvmath-python relies on `CUDA minor version compatibility
    <https://docs.nvidia.com/deploy/cuda-compatibility/index.html
    #minor-version-compatibility>`_.
-.. [4] As of beta 4.0 (v0.4.0), CuPy is a required run-time dependency except for CPU-only
+.. [4] As of beta 5.0 (v0.5.0), CuPy is a required run-time dependency except for CPU-only
    execution. In a future release it will be turned into an optional run-time dependency.
 .. [5] For example, Hopper GPUs are supported starting CUDA 11.8, so they would not work
    with libraries from CUDA 11.7 or below.
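For completeness, installing the release itself follows the usual pattern; assuming the cu12 extra documented in earlier releases of this guide is unchanged for 0.5.0, a typical CUDA 12 install is:

    pip install -U nvmath-python[cu12]

(Quote the package spec, e.g. 'nvmath-python[cu12]', if your shell expands square brackets.)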

docs/sphinx/overview.rst

Lines changed: 1 addition & 1 deletion
@@ -220,7 +220,7 @@ An example illustrating the use of the global Python logger is shown below:
         level=logging.DEBUG,
         format='%(asctime)s %(levelname)-8s %(message)s',
         datefmt='%m-%d %H:%M:%S'
-        )
+    )
 
     # Call nvmath-python Pythonic APIs
     out = nvmath.linalg.advanced.matmul(...)

(Whitespace-only change: the closing parenthesis of the logging example is re-indented.)
