diff --git a/.drone.jsonnet b/.drone.jsonnet
index d459d3ec..4aaae745 100644
--- a/.drone.jsonnet
+++ b/.drone.jsonnet
@@ -416,4 +416,10 @@ local windows_pipeline(name, image, environment, arch = "amd64") =
         "cppalliance/dronevs2022:1",
         { TOOLSET: 'msvc-14.3', CXXSTD: '14,17,20,latest', ADDRMD: '32,64' },
     ),
+
+    windows_pipeline(
+        "Windows VS2026 msvc-14.5",
+        "cppalliance/dronevs2026:1",
+        { TOOLSET: 'msvc-14.5', CXXSTD: '14,17,20,latest', ADDRMD: '32,64' },
+    ),
 ]
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..23dab29e
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,15 @@
+# Copyright 2025 Matt Borland
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at http://boost.org/LICENSE_1_0.txt)
+
+version: 2
+updates:
+  - package-ecosystem: "npm"
+    directory: "/doc"
+    schedule:
+      interval: "weekly"
+    groups:
+      all-dependencies:
+        # Groups all updates into a single PR
+        patterns:
+          - "*"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2a945c9a..3418f60e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -298,12 +298,12 @@ jobs:
             source_keys:
               - "https://apt.llvm.org/llvm-snapshot.gpg.key"
 
-          - toolset: clang
-            cxxstd: "03,11,14,17,20,2b"
-            os: macos-14
           - toolset: clang
             cxxstd: "03,11,14,17,20,2b"
             os: macos-15
+          - toolset: clang
+            cxxstd: "03,11,14,17,20,2b,2c"
+            os: macos-26
 
     timeout-minutes: 180
     runs-on: ${{matrix.os}}
@@ -620,8 +620,8 @@ jobs:
         include:
           - os: ubuntu-22.04
           - os: ubuntu-24.04
-          - os: macos-14
           - os: macos-15
+          - os: macos-26
 
     runs-on: ${{matrix.os}}
 
@@ -666,8 +666,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - os: macos-14
           - os: macos-15
+          - os: macos-26
 
     runs-on: ${{matrix.os}}
 
@@ -724,8 +724,8 @@ jobs:
         include:
           - os: ubuntu-22.04
           - os: ubuntu-24.04
-          - os: macos-14
           - os: macos-15
+          - os: macos-26
 
     runs-on: ${{matrix.os}}
 
@@ -780,8 +780,8 @@ jobs:
         include:
           - os: ubuntu-22.04
           - os: ubuntu-24.04
-          - os: macos-14
           - os: macos-15
+          - os: macos-26
 
     runs-on: ${{matrix.os}}
 
@@ -952,8 +952,8 @@ jobs:
       matrix:
         include:
           - os: ubuntu-24.04
-          - os: macos-14
           - os: macos-15
+          - os: macos-26
 
     runs-on: ${{matrix.os}}
 
@@ -1146,3 +1146,60 @@ jobs:
           cd ~/pkgconfig_test
           g++ main.cpp $(pkg-config --cflags --libs boost_int128) -o test_pkgconfig
           ./test_pkgconfig
+
+  cuda-cmake-test:  # Configures, builds, and runs the CMake test suite with NVCC for C++17, 20, and 23
+    strategy:
+      fail-fast: false
+
+    runs-on: gpu-runner-1  # NOTE(review): presumably a self-hosted runner with an NVIDIA GPU - confirm
+
+    steps:
+      - uses: Jimver/cuda-toolkit@v0.2.30
+        id: cuda-toolkit
+        with:
+          cuda: '12.8.0'
+          method: 'network'
+          sub-packages: '["nvcc"]'
+
+      - name: Output CUDA information
+        run: |
+          echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          nvcc -V
+      - uses: actions/checkout@v5
+
+      - name: Install Packages
+        run: |
+          sudo apt-get install -y cmake make
+      - name: Setup Boost
+        run: |
+          echo GITHUB_REPOSITORY: $GITHUB_REPOSITORY
+          LIBRARY=${GITHUB_REPOSITORY#*/}  # strip the "owner/" prefix to get the library name
+          echo LIBRARY: $LIBRARY
+          echo "LIBRARY=$LIBRARY" >> $GITHUB_ENV
+          echo GITHUB_BASE_REF: $GITHUB_BASE_REF
+          echo GITHUB_REF: $GITHUB_REF
+          REF=${GITHUB_BASE_REF:-$GITHUB_REF}
+          REF=${REF#refs/heads/}
+          echo REF: $REF
+          BOOST_BRANCH=develop && [ "$REF" == "master" ] && BOOST_BRANCH=master || true  # develop unless the target ref is master
+          echo BOOST_BRANCH: $BOOST_BRANCH
+          cd ..
+          git clone -b $BOOST_BRANCH --depth 1 https://github.com/boostorg/boost.git boost-root
+          cd boost-root
+          mkdir -p libs/$LIBRARY
+          cp -r $GITHUB_WORKSPACE/* libs/$LIBRARY  # place this checkout inside the superproject as libs/$LIBRARY
+          git submodule update --init tools/boostdep
+          python3 tools/boostdep/depinst/depinst.py --git_args "--jobs 3" $LIBRARY
+      - name: Test C++17/20/23
+        run: |
+          for std in 17 20 23; do
+            echo "======== Testing C++${std} ========"
+            cd ../boost-root  # relative to $GITHUB_WORKSPACE, which is restored at the end of each iteration
+            rm -rf __build__  # start every standard from a clean build tree
+            mkdir __build__ && cd __build__
+            cmake -DBOOST_INCLUDE_LIBRARIES=$LIBRARY -DBUILD_TESTING=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DBOOST_INT128_ENABLE_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="75;86" -DCMAKE_CUDA_STANDARD=${std} ..
+            cmake --build . --target tests -j $(nproc)
+            ctest --output-on-failure --no-tests=error
+            cd $GITHUB_WORKSPACE
+          done
diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
index bd2f7c8d..c1486f38 100644
--- a/.github/workflows/codecov.yml
+++ b/.github/workflows/codecov.yml
@@ -80,13 +80,13 @@ jobs:
           fi
           git config --global pack.threads 0
 
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           # For coverage builds fetch the whole history, else only 1 commit using a 'fake ternary'
           fetch-depth: ${{ matrix.coverage && '0' || '1' }}
 
       - name: Cache ccache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         if: env.B2_USE_CCACHE
         with:
           path: ~/.ccache
@@ -94,7 +94,7 @@ jobs:
           restore-keys: ${{matrix.os}}-${{matrix.container}}-${{matrix.compiler}}-
 
       - name: Fetch Boost.CI
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: boostorg/boost-ci
           ref: master
diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
index 1cca9a72..c0dc100c 100644
--- a/.github/workflows/fuzz.yml
+++ b/.github/workflows/fuzz.yml
@@ -41,7 +41,22 @@ jobs:
           echo 'compiling test/fuzzing/test_fuzzing_div_versus_wide_int.cpp'
           ${{ matrix.compiler }} -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I./include -I${{runner.workspace}}/wide-integer test/fuzzing/test_fuzzing_div_versus_wide_int.cpp -o test_fuzzing_div_versus_wide_int
           echo "run test_fuzzing_div_versus_wide_int"
-          ./test_fuzzing_div_versus_wide_int -max_total_time=900 -max_len=32 -verbosity=0 -close_fd_mask=3
+          ./test_fuzzing_div_versus_wide_int -max_total_time=600 -max_len=32 -verbosity=0 -close_fd_mask=3
+
+          echo 'compiling test/fuzzing/test_fuzzing_add_versus_wide_int.cpp'
+          ${{ matrix.compiler }} -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I./include -I${{runner.workspace}}/wide-integer test/fuzzing/test_fuzzing_add_versus_wide_int.cpp -o test_fuzzing_add_versus_wide_int
+          echo "run test_fuzzing_add_versus_wide_int"
+          ./test_fuzzing_add_versus_wide_int -max_total_time=300 -max_len=32 -verbosity=0 -close_fd_mask=3
+          
+          echo 'compiling test/fuzzing/test_fuzzing_sub_versus_wide_int.cpp'
+          ${{ matrix.compiler }} -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I./include -I${{runner.workspace}}/wide-integer test/fuzzing/test_fuzzing_sub_versus_wide_int.cpp -o test_fuzzing_sub_versus_wide_int
+          echo "run test_fuzzing_sub_versus_wide_int"
+          ./test_fuzzing_sub_versus_wide_int -max_total_time=300 -max_len=32 -verbosity=0 -close_fd_mask=3
+          
+          echo 'compiling test/fuzzing/test_fuzzing_mul_versus_wide_int.cpp'
+          ${{ matrix.compiler }} -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I./include -I${{runner.workspace}}/wide-integer test/fuzzing/test_fuzzing_mul_versus_wide_int.cpp -o test_fuzzing_mul_versus_wide_int
+          echo "run test_fuzzing_mul_versus_wide_int"
+          ./test_fuzzing_mul_versus_wide_int -max_total_time=300 -max_len=32 -verbosity=0 -close_fd_mask=3
 
   clang-fuzzing-versus-wide-integer-no-builtin:
     runs-on: ubuntu-latest
@@ -68,4 +83,19 @@ jobs:
           echo 'compiling test/fuzzing/test_fuzzing_div_versus_wide_int.cpp'
           ${{ matrix.compiler }} -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I./include -I${{runner.workspace}}/wide-integer -DBOOST_INT128_NO_BUILTIN_INT128=1 test/fuzzing/test_fuzzing_div_versus_wide_int.cpp -o test_fuzzing_div_versus_wide_int
           echo "run test_fuzzing_div_versus_wide_int"
-          ./test_fuzzing_div_versus_wide_int -max_total_time=900 -max_len=32 -verbosity=0 -close_fd_mask=3
+          ./test_fuzzing_div_versus_wide_int -max_total_time=600 -max_len=32 -verbosity=0 -close_fd_mask=3
+          
+          echo 'compiling test/fuzzing/test_fuzzing_add_versus_wide_int.cpp'
+          ${{ matrix.compiler }} -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I./include -I${{runner.workspace}}/wide-integer -DBOOST_INT128_NO_BUILTIN_INT128=1 test/fuzzing/test_fuzzing_add_versus_wide_int.cpp -o test_fuzzing_add_versus_wide_int
+          echo "run test_fuzzing_add_versus_wide_int"
+          ./test_fuzzing_add_versus_wide_int -max_total_time=300 -max_len=32 -verbosity=0 -close_fd_mask=3
+          
+          echo 'compiling test/fuzzing/test_fuzzing_sub_versus_wide_int.cpp'
+          ${{ matrix.compiler }} -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I./include -I${{runner.workspace}}/wide-integer -DBOOST_INT128_NO_BUILTIN_INT128=1 test/fuzzing/test_fuzzing_sub_versus_wide_int.cpp -o test_fuzzing_sub_versus_wide_int
+          echo "run test_fuzzing_sub_versus_wide_int"
+          ./test_fuzzing_sub_versus_wide_int -max_total_time=300 -max_len=32 -verbosity=0 -close_fd_mask=3
+          
+          echo 'compiling test/fuzzing/test_fuzzing_mul_versus_wide_int.cpp'
+          ${{ matrix.compiler }} -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I./include -I${{runner.workspace}}/wide-integer -DBOOST_INT128_NO_BUILTIN_INT128=1 test/fuzzing/test_fuzzing_mul_versus_wide_int.cpp -o test_fuzzing_mul_versus_wide_int
+          echo "run test_fuzzing_mul_versus_wide_int"
+          ./test_fuzzing_mul_versus_wide_int -max_total_time=300 -max_len=32 -verbosity=0 -close_fd_mask=3
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1ff815bb..a965c929 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 3.10...3.20)
 
 # Set version explicitly if not part of Boost superproject
 if(NOT BOOST_SUPERPROJECT_VERSION)
-    set(BOOST_INT128_VERSION 1.6.0)
+    set(BOOST_INT128_VERSION 1.6.1)
 else()
     set(BOOST_INT128_VERSION ${BOOST_SUPERPROJECT_VERSION})
 endif()
diff --git a/README.md b/README.md
index fbc62835..c57a2c5f 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ int128 is under active development and is not an official boost library.
 
 This library is header only. It contains no other dependencies.
 Simply `#include` it and use it.
+With C++20 or later you can instead `import boost.int128`.
 
 ## CMake
 
@@ -64,13 +65,13 @@ struct int128_t;
 
 These types operate like built-in integer types.
 They have their own implementations of the Standard-Library functions
-(e.g. like those found in `<limits>`, `<iostream>`, `<bit>` etc.).
+(e.g., those found in `<limits>`, `<iostream>`, `<bit>`, etc.).
 
 The entire library can be conveniently included with `#include <boost/int128.hpp>`
 
 # Full Documentation
 
-The complete documentation can be found at: https://master.int128.cpp.al
+The complete documentation can be found at: https://develop.int128.cpp.al
 
 ## References
 
diff --git a/doc/antora.yml b/doc/antora.yml
index 11c33ddb..68478dfe 100644
--- a/doc/antora.yml
+++ b/doc/antora.yml
@@ -1,6 +1,9 @@
 name: ROOT
 version: ~
 title: Boost.int128
+asciidoc:
+  attributes:
+    page-pagination: ''
 nav:
   - modules/ROOT/nav.adoc
 start_page: overview.adoc
diff --git a/doc/int128-playbook.yml b/doc/int128-playbook.yml
index 6379b801..50418f66 100644
--- a/doc/int128-playbook.yml
+++ b/doc/int128-playbook.yml
@@ -5,7 +5,7 @@ content:
   sources:
     - url: ..
       start_path: doc
-      branches: HEAD
+      branches: develop
 output:
   dir: html
 ui:
diff --git a/doc/modules/ROOT/nav.adoc b/doc/modules/ROOT/nav.adoc
index 7b6280e2..f1b1df9b 100644
--- a/doc/modules/ROOT/nav.adoc
+++ b/doc/modules/ROOT/nav.adoc
@@ -6,12 +6,13 @@
 ** xref:examples.adoc#examples_rollover[Rollover Behavior]
 ** xref:examples.adoc#examples_bit[`<bit>` support]
 ** xref:examples.adoc#examples_numeric[`<numeric>` support (Saturating Arithmetic)]
-** xref:examples.adoc#examples_numeric[`<numeric>` support (Numeric Logarithms)]
+** xref:examples.adoc#examples_numeric_algorithms[`<numeric>` support (Numeric Algorithms)]
 ** xref:examples.adoc#examples_mixed_sign[Mixed Signedness Arithmetic]
 ** xref:examples.adoc#examples_to_string[String Conversion (to_string)]
 ** xref:examples.adoc#examples_boost_math_random[Boost Math and Random Integration]
 ** xref:examples.adoc#examples_boost_charconv[Boost.Charconv Integration]
 ** xref:examples.adoc#examples_cstdlib[`<cstdlib>` support (Combined div and mod)]
+** xref:examples.adoc#examples_cuda[Use of the library in a CUDA kernel]
 * xref:api_reference.adoc[]
 ** xref:api_reference.adoc#api_namespaces[Namespaces]
 ** xref:api_reference.adoc#api_types[Types]
@@ -29,6 +30,7 @@
 ** xref:api_reference.adoc#api_macros[Macros]
 *** xref:api_reference.adoc#api_macro_literals[Literals]
 *** xref:api_reference.adoc#api_macro_configuration[Configuration]
+** xref:api_reference.adoc#api_headers[Headers]
 * xref:uint128_t.adoc[]
 ** xref:uint128_t.adoc#u128_alignment[Alignment]
 ** xref:uint128_t.adoc#u128_operator_behavior[Operator Behavior]
diff --git a/doc/modules/ROOT/pages/api_reference.adoc b/doc/modules/ROOT/pages/api_reference.adoc
index a52e1a3a..af870252 100644
--- a/doc/modules/ROOT/pages/api_reference.adoc
+++ b/doc/modules/ROOT/pages/api_reference.adoc
@@ -20,6 +20,9 @@ https://www.boost.org/LICENSE_1_0.txt
 
 | xref:literals.adoc[`boost::int128::literals`]
 | User-defined literals for 128-bit integers
+
+| xref:charconv.adoc[`boost::charconv`]
+| `to_chars` and `from_chars` overloads for 128-bit integers (requires Boost.Charconv)
 |===
 
 [#api_types]
@@ -146,7 +149,7 @@ Listed by analogous STL header.
 |===
 | Function | Description
 
-| `abs`
+| xref:int128_t.adoc#i128_math_operators[`abs`]
 | Absolute value
 |===
 
@@ -307,6 +310,9 @@ Listed by analogous STL header.
 
 | xref:config.adoc#disable_exceptions[`BOOST_INT128_DISABLE_EXCEPTIONS`]
 | Disables exception throwing
+
+| xref:config.adoc#enable_cuda[`BOOST_INT128_ENABLE_CUDA`]
+| Enables CUDA support, allowing the library types and functions to be run on both host and device
 |===
 
 ==== Automatic Configuration
@@ -323,4 +329,58 @@ Listed by analogous STL header.
 
 | xref:config.adoc#automatic_config[`BOOST_INT128_ENDIAN_BIG_BYTE`]
 | Defined on big-endian systems
-|===
\ No newline at end of file
+
+| xref:config.adoc#host_device[`BOOST_INT128_HOST_DEVICE`]
+| Expands to `pass:[__host__ __device__]` under NVCC for CUDA support
+|===
+
+[#api_headers]
+== Headers
+
+[cols="1,2", options="header"]
+|===
+| Header | Contents
+
+| `<boost/int128.hpp>`
+| Convenience header including the entire library
+
+| xref:bit.adoc[`<boost/int128/bit.hpp>`]
+| Bit manipulation functions
+
+| xref:charconv.adoc[`<boost/int128/charconv.hpp>`]
+| Character conversion functions
+
+| `<boost/int128/climits.hpp>`
+| Min and max macros
+
+| xref:cstdlib.adoc[`<boost/int128/cstdlib.hpp>`]
+| Combined division and modulo function
+
+| xref:format.adoc#fmt_format[`<boost/int128/fmt_format.hpp>`]
+| Formatting integration for pass:[{fmt}]
+
+| xref:format.adoc#std_format[`<boost/int128/format.hpp>`]
+| Formatting integration for pass:[C++20] `<format>`
+
+| `<boost/int128/int128.hpp>`
+| The xref:uint128_t.adoc[`uint128_t`] and xref:int128_t.adoc[`int128_t`] types
+
+| xref:stream.adoc[`<boost/int128/iostream.hpp>`]
+| Iostream overloads for `int128_t` and `uint128_t`
+
+| `<boost/int128/limits.hpp>`
+| Overloads for `std::numeric_limits` for `int128_t` and `uint128_t`
+
+| xref:literals.adoc[`<boost/int128/literals.hpp>`]
+| User-defined literals for `int128_t` and `uint128_t`
+
+| xref:numeric.adoc[`<boost/int128/numeric.hpp>`]
+| Numeric algorithms (gcd, lcm, midpoint)
+
+| xref:string.adoc[`<boost/int128/string.hpp>`]
+| `to_string` overloads
+
+| `<boost/int128/random.hpp>`
+| Required for usage of Boost.Random
+
+|===
diff --git a/doc/modules/ROOT/pages/bit.adoc b/doc/modules/ROOT/pages/bit.adoc
index f9bfe4eb..a25f94e9 100644
--- a/doc/modules/ROOT/pages/bit.adoc
+++ b/doc/modules/ROOT/pages/bit.adoc
@@ -28,7 +28,7 @@ Returns `true` if `x` is a power of two; otherwise `false`
 namespace boost {
 namespace int128 {
 
-constexpr bool has_single_bit(uint128_t x) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool has_single_bit(uint128_t x) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -44,7 +44,7 @@ Returns the number of consecutive `0` bits in the value `x`, starting from the m
 namespace boost {
 namespace int128 {
 
-constexpr int countl_zero(uint128_t x) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int countl_zero(uint128_t x) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -60,7 +60,7 @@ Returns the number of consecutive `1` bits in the value `x`, starting from the m
 namespace boost {
 namespace int128 {
 
-constexpr int countl_one(uint128_t x) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int countl_one(uint128_t x) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -77,7 +77,7 @@ If `x` is zero, returns 0
 namespace boost {
 namespace int128 {
 
-constexpr int bit_width(uint128_t x) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int bit_width(uint128_t x) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -93,7 +93,7 @@ Returns the smallest integral power of two that is not smaller than `x`.
 namespace boost {
 namespace int128 {
 
-constexpr uint128_t bit_ceil(uint128_t x) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t bit_ceil(uint128_t x) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -110,7 +110,7 @@ If `x` is 0 then returns 0.
 namespace boost {
 namespace int128 {
 
-constexpr uint128_t bit_floor(uint128_t x) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t bit_floor(uint128_t x) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -126,7 +126,7 @@ Returns the number of consecutive `0` bits in the value `x`, starting from the l
 namespace boost {
 namespace int128 {
 
-constexpr int countr_zero(uint128_t x) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int countr_zero(uint128_t x) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -142,7 +142,7 @@ Returns the number of consecutive `1` bits in the value `x`, starting from the l
 namespace boost {
 namespace int128 {
 
-constexpr int countr_one(uint128_t x) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int countr_one(uint128_t x) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -159,7 +159,7 @@ This operation is also known as a left circular shift.
 namespace boost {
 namespace int128 {
 
-constexpr uint128_t rotl(uint128_t x, int s) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t rotl(uint128_t x, int s) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -176,7 +176,7 @@ This operation is also known as a right circular shift.
 namespace boost {
 namespace int128 {
 
-constexpr uint128_t rotr(uint128_t x, int s) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t rotr(uint128_t x, int s) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -192,7 +192,7 @@ Returns the number of `1` bits in `x`.
 namespace boost {
 namespace int128 {
 
-constexpr int popcount(uint128_t x) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int popcount(uint128_t x) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -208,7 +208,7 @@ Reverses the bytes in the given integer value `x`.
 namespace boost {
 namespace int128 {
 
-constexpr uint128_t byteswap(uint128_t x) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t byteswap(uint128_t x) noexcept;
 
 } // namespace int128
 } // namespace boost
diff --git a/doc/modules/ROOT/pages/config.adoc b/doc/modules/ROOT/pages/config.adoc
index 701e0a46..c9f0b9f8 100644
--- a/doc/modules/ROOT/pages/config.adoc
+++ b/doc/modules/ROOT/pages/config.adoc
@@ -17,6 +17,10 @@ https://www.boost.org/LICENSE_1_0.txt
 
 These macros allow customization of library behavior. User-configurable macros should be defined before including any library headers.
 
+[#enable_cuda]
+- `BOOST_INT128_ENABLE_CUDA`: Defining this macro allows the library types and selected functions to be used on both host and device when compiling with NVCC.
+Allowed functions have `BOOST_INT128_HOST_DEVICE` as part of their function signature in their documentation.
+
 [#no_int128]
 - `BOOST_INT128_NO_BUILTIN_INT128`: The user may define this when they do not want the internal implementations to rely on builtin `pass:[__int128]` or `pass:[unsigned __int128]` types.
 
@@ -42,3 +46,7 @@ This macro will automatically be defined in the presence of `-fno-exceptions` or
 - `BOOST_INT128_ENDIAN_LITTLE_BYTE`: This is defined to `1` when compiling on a little endian architecture, otherwise `0`.
 
 - `BOOST_INT128_ENDIAN_BIG_BYTE`: This is defined to `1` when compiling on a big endian architecture, otherwise `0`.
+
+[#host_device]
+- `BOOST_INT128_HOST_DEVICE`: This is defined to `pass:[__host__ __device__]` when compiling with NVCC (`pass:[__NVCC__]` is defined), and to nothing otherwise.
+Public functions, constructors, operators, and conversion operators in the library are annotated with this macro (except where device code cannot support them, e.g., conversion to `long double`), allowing `int128_t` and `uint128_t` to be used in CUDA device code without modification.
diff --git a/doc/modules/ROOT/pages/cstdlib.adoc b/doc/modules/ROOT/pages/cstdlib.adoc
index de43ed55..b3a6ea8f 100644
--- a/doc/modules/ROOT/pages/cstdlib.adoc
+++ b/doc/modules/ROOT/pages/cstdlib.adoc
@@ -51,9 +51,9 @@ Using the structures defined above, the `div` function computes both quotient an
 namespace boost {
 namespace int128 {
 
-constexpr u128div_t div(uint128_t lhs, uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr u128div_t div(uint128_t lhs, uint128_t rhs) noexcept;
 
-constexpr i128div_t div(int128_t lhs, int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr i128div_t div(int128_t lhs, int128_t rhs) noexcept;
 
 } // namespace int128
 } // namespace boost
diff --git a/doc/modules/ROOT/pages/examples.adoc b/doc/modules/ROOT/pages/examples.adoc
index c6be0982..7a8a4693 100644
--- a/doc/modules/ROOT/pages/examples.adoc
+++ b/doc/modules/ROOT/pages/examples.adoc
@@ -303,6 +303,17 @@ signed_value / 4U = 0
 ----
 include::example$math_and_random.cpp[]
 ----
+
+.Example Output (values vary per run)
+[listing]
+----
+=== uint128_t ===
+    Mean: 22125900135088040520646253247977468
+Variance: 15183108029620265677746188314852225
+  Median: 169775281866460752209725324063124732284
+=== int128_t ===
+Random int128_t: 45422201008201503618595888886744218664
+----
 ====
 
 [#examples_boost_charconv]
@@ -481,3 +492,22 @@ Verification: 142857142857142857 * 7 + 1 = 1000000000000000000
 3 / 10 = 0 remainder 3
 ----
 ====
+
+[#examples_cuda]
+== CUDA Usage
+
+.This https://github.com/cppalliance/int128/blob/develop/examples/cuda.cu[example] demonstrates how to use library types and functions inside a CUDA kernel.
+====
+[source, c++]
+----
+include::example$cuda.cu[]
+----
+
+.Expected Output
+[listing]
+----
+[Vector operation on 50000 elements]
+CUDA kernel launch with 196 blocks of 256 threads
+All CPU and GPU computed elements match!
+----
+====
diff --git a/doc/modules/ROOT/pages/file_structure.adoc b/doc/modules/ROOT/pages/file_structure.adoc
index c2d645a3..84c6765f 100644
--- a/doc/modules/ROOT/pages/file_structure.adoc
+++ b/doc/modules/ROOT/pages/file_structure.adoc
@@ -16,22 +16,25 @@ The entire library can be consumed via `<boost/int128.hpp>`, or by independently
 | Header | Description
 
 | `<boost/int128.hpp>`
-| The complete library (includes all headers below)
+| Convenience header (includes most headers below; does not include `charconv.hpp`, `fmt_format.hpp`, or `random.hpp`)
 
 | xref:bit.adoc[`<boost/int128/bit.hpp>`]
 | Bit manipulation functions
 
 | xref:charconv.adoc[`<boost/int128/charconv.hpp>`]
-| Character conversion (`to_chars`/`from_chars`)
+| Character conversion (`to_chars`/`from_chars`); requires Boost.Charconv headers
 
 | xref:api_reference.adoc#api_macro_literals[`<boost/int128/climits.hpp>`]
 | C-style limit macros (`BOOST_INT128_INT128_MAX`, etc.)
 
+| xref:config.adoc[`<boost/int128/config.hpp>`]
+| Configuration macros
+
 | xref:cstdlib.adoc[`<boost/int128/cstdlib.hpp>`]
 | Division with quotient and remainder (`div`)
 
 | xref:format.adoc[`<boost/int128/fmt_format.hpp>`]
-| `pass:[{fmt}]` library support
+| `pass:[{fmt}]` library support; requires the `pass:[{fmt}]` library
 
 | xref:format.adoc[`<boost/int128/format.hpp>`]
 | C++20 `std::format` support
@@ -50,4 +53,10 @@ The entire library can be consumed via `<boost/int128.hpp>`, or by independently
 
 | xref:numeric.adoc[`<boost/int128/numeric.hpp>`]
 | Numeric functions (`gcd`, `lcm`, saturating arithmetic)
+
+| `<boost/int128/random.hpp>`
+| Traits for usage with Boost.Random
+
+| xref:string.adoc[`<boost/int128/string.hpp>`]
+| `to_string` overloads
 |===
diff --git a/doc/modules/ROOT/pages/format.adoc b/doc/modules/ROOT/pages/format.adoc
index 40304463..c257cf58 100644
--- a/doc/modules/ROOT/pages/format.adoc
+++ b/doc/modules/ROOT/pages/format.adoc
@@ -37,7 +37,7 @@ Examples:
 | `{:*^6d}` | `"**42**"` (centered with asterisks)
 |===
 
-NOTE: When no alignment is specified but a width is given (e.g., `{:6d}`), zero-padding is applied from the left.
+NOTE: When no alignment is specified but a `0` prefix and width are given (e.g., `{:06d}`), zero-padding is applied from the left. Without the `0` prefix (e.g., `{:6d}`), space-padding is applied instead, matching `std::format` behavior.
 
 == Sign
 
diff --git a/doc/modules/ROOT/pages/i128_benchmarks.adoc b/doc/modules/ROOT/pages/i128_benchmarks.adoc
index 1914394d..915e1be5 100644
--- a/doc/modules/ROOT/pages/i128_benchmarks.adoc
+++ b/doc/modules/ROOT/pages/i128_benchmarks.adoc
@@ -12,7 +12,7 @@ https://www.boost.org/LICENSE_1_0.txt
 
 The benchmarks below represent the time in microseconds it takes to perform 20'000'000 operations between two values of random width (e.g. 2x1 words, 1x2 words, etc.).
 On most platforms we use the builtin `\__int128` as the reference benchmark.
-When this is unavailable (such as on 32-bit architectures) we us `boost::multiprecision::int128_t` (abbreviated as `boost::mp::int128_t`) as it is widely used, and known to be portable.
+When this is unavailable (such as on 32-bit architectures) we use `boost::multiprecision::int128_t` (abbreviated as `boost::mp::int128_t`) as it is widely used, and known to be portable.
 On MSVC platforms we use as reference `std::_Signed128` from the header `<__msvc_int128.hpp>` since this is bundled with their compiler.
 
 [#i128_linux]
@@ -56,7 +56,7 @@ image::i128_graphs/linux/x64_relative_performance.png[x64 Relative Performance,
 image::i128_graphs/linux/ARM64_benchmarks.png[ARM64 Benchmark Results, width=100%]
 ////
 
-image::i128_graphs/linux/ARM64_relative_performance.png[x64 Relative Performance, width=100%]
+image::i128_graphs/linux/ARM64_relative_performance.png[ARM64 Relative Performance, width=100%]
 
 === S390x
 
diff --git a/doc/modules/ROOT/pages/int128_t.adoc b/doc/modules/ROOT/pages/int128_t.adoc
index a03e6eb7..70090981 100644
--- a/doc/modules/ROOT/pages/int128_t.adoc
+++ b/doc/modules/ROOT/pages/int128_t.adoc
@@ -57,12 +57,7 @@ If your platform has a native 128-bit signed integer, the struct is defined as s
 struct alignas(alignof(__int128)) int128_t
 ----
 
-Otherwise, it is
-
-[source, c++]
-----
-struct alignas(sizeof(std::uint64_t) * 2) int128_t
-----
+Otherwise, the alignment is left for the compiler to decide.
 
 [#i128_operator_behavior]
 == Operator Behavior
@@ -86,30 +81,30 @@ struct int128_t
     ...
 
     // Defaulted basic construction
-    constexpr int128_t() noexcept = default;
-    constexpr int128_t(const int128_t&) noexcept = default;
-    constexpr int128_t(int128_t&&) noexcept = default;
-    constexpr int128_t& operator=(const int128_t&) noexcept = default;
-    constexpr int128_t& operator=(int128_t&&) noexcept = default;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t() noexcept = default;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const int128_t&) noexcept = default;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(int128_t&&) noexcept = default;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator=(const int128_t&) noexcept = default;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator=(int128_t&&) noexcept = default;
 
-    constexpr int128_t(const uint128_t& v) noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr int128_t(const uint128_t& v) noexcept;
 
     // Construct from integral types
-    constexpr int128_t(const std::int64_t hi, const std::uint64_t lo) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const std::int64_t hi, const std::uint64_t lo) noexcept;
 
     template <BOOST_INT128_SIGNED_INTEGER_CONCEPT SignedInteger>
-    constexpr int128_t(const SignedInteger v) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const SignedInteger v) noexcept;
 
     template <BOOST_INT128_UNSIGNED_INTEGER_CONCEPT UnsignedInteger>
-    constexpr int128_t(const UnsignedInteger v) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const UnsignedInteger v) noexcept;
 
     #ifdef BOOST_INT128_HAS_INT128
 
     // Typically a typedef from __int128
-    constexpr int128_t(const detail::builtin_i128 v) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const detail::builtin_i128 v) noexcept;
 
     // Typically a typedef unsigned __int128
-    constexpr int128_t(const detail::builtin_u128 v) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const detail::builtin_u128 v) noexcept;
 
     #endif // BOOST_INT128_HAS_INT128
 };
@@ -134,26 +129,26 @@ struct int128_t
     ...
 
     // Integer conversion operators
-    constexpr operator bool() const noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr operator bool() const noexcept;
 
     template <BOOST_INT128_SIGNED_INTEGER_CONCEPT SignedInteger>
-    explicit constexpr operator SignedInteger() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator SignedInteger() const noexcept;
 
     template <BOOST_INT128_UNSIGNED_INTEGER_CONCEPT UnsignedInteger>
-    explicit constexpr operator UnsignedInteger() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator UnsignedInteger() const noexcept;
 
     #ifdef BOOST_INT128_HAS_INT128
 
-    explicit constexpr operator detail::builtin_i128() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator detail::builtin_i128() const noexcept;
 
-    explicit constexpr operator detail::builtin_u128() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator detail::builtin_u128() const noexcept;
 
     #endif // BOOST_INT128_HAS_INT128
 
     // Conversion to float
-    explicit constexpr operator float() const noexcept;
-    explicit constexpr operator double() const noexcept;
-    explicit constexpr operator long double() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator float() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator double() const noexcept;
+    explicit constexpr operator long double() const noexcept; // There are no long doubles on device
 };
 
 } // namespace int128
@@ -173,12 +168,12 @@ as the number of digits it represents can exceed the precision of the significan
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator<(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator<(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr bool operator<(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is less than the `rhs` value without exception.
@@ -189,12 +184,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator<=(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator<=(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr bool operator<=(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is less than or equal to the `rhs` value without exception.
@@ -205,12 +200,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator>(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator>(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr bool operator>(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is greater than the `rhs` value without exception.
@@ -221,12 +216,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator>=(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator>=(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr bool operator>=(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is greater than or equal to the `rhs` value without exception.
@@ -237,12 +232,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator==(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator==(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr bool operator==(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is equal to the `rhs` value without exception.
@@ -253,12 +248,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator!=(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator!=(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr bool operator!=(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is not equal to the `rhs` value without exception.
@@ -268,13 +263,13 @@ This operation is only defined for integers and is subject to mixed sign limitat
 
 [source, c++]
 ----
-constexpr std::strong_ordering operator<=>(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const int128_t lhs, const int128_t rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr std::strong_ordering operator<=>(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr std::strong_ordering operator<=>(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const Integer lhs, const int128_t rhs) noexcept;
 ----
 
 Returns one of the following without exception:
@@ -290,7 +285,7 @@ Returns one of the following without exception:
 
 [source, c++]
 ----
-constexpr int128_t operator~(const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator~(const int128_t rhs) noexcept;
 ----
 
 Returns the bitwise negation of `rhs` without exception.
@@ -300,12 +295,12 @@ Returns the bitwise negation of `rhs` without exception.
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator|(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator|(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr int128_t operator|(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns the bitwise or of `lhs` and `rhs` without exception.
@@ -316,12 +311,12 @@ This operation is subject to mixed sign limitations discussed xref:int128_t.adoc
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator&(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator&(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr int128_t operator&(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns the bitwise and of `lhs` and `rhs` without exception.
@@ -332,12 +327,12 @@ This operation is subject to mixed sign limitations discussed xref:int128_t.adoc
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator^(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator^(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr int128_t operator^(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns the bitwise xor of `lhs` and `rhs` without exception.
@@ -348,18 +343,18 @@ This operation is subject to mixed sign limitations discussed xref:int128_t.adoc
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator<<(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator<<(const int128_t lhs, const Integer rhs) noexcept;
 
 template <typename Integer, std::enable_if_t<std::is_integral<Integer>::value && (sizeof(Integer) * 8 > 16), bool> = true>
-constexpr Integer operator<<(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr Integer operator<<(const Integer lhs, const int128_t rhs) noexcept;
 
 template <typename SignedInteger, std::enable_if_t<detail::is_signed_integer_v<SignedInteger> && (sizeof(SignedInteger) * 8 <= 16), bool> = true>
-constexpr int operator<<(const SignedInteger lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int operator<<(const SignedInteger lhs, const int128_t rhs) noexcept;
 
 template <typename UnsignedInteger, std::enable_if_t<detail::is_unsigned_integer_v<UnsignedInteger> && (sizeof(UnsignedInteger) * 8 <= 16), bool> = true>
-constexpr unsigned int operator<<(const UnsignedInteger lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr unsigned int operator<<(const UnsignedInteger lhs, const int128_t rhs) noexcept;
 
-constexpr int128_t operator<<(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator<<(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns the bitwise left shift of `lhs` without exception.
@@ -371,18 +366,18 @@ This operation is subject to mixed sign limitations discussed xref:int128_t.adoc
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator>>(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator>>(const int128_t lhs, const Integer rhs) noexcept;
 
 template <typename Integer, std::enable_if_t<std::is_integral<Integer>::value && (sizeof(Integer) * 8 > 16), bool> = true>
-constexpr Integer operator>>(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr Integer operator>>(const Integer lhs, const int128_t rhs) noexcept;
 
 template <typename SignedInteger, std::enable_if_t<detail::is_signed_integer_v<SignedInteger> && (sizeof(SignedInteger) * 8 <= 16), bool> = true>
-constexpr int operator>>(const SignedInteger lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int operator>>(const SignedInteger lhs, const int128_t rhs) noexcept;
 
 template <typename UnsignedInteger, std::enable_if_t<detail::is_unsigned_integer_v<UnsignedInteger> && (sizeof(UnsignedInteger) * 8 <= 16), bool> = true>
-constexpr unsigned operator>>(UnsignedInteger lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr unsigned operator>>(UnsignedInteger lhs, const int128_t rhs) noexcept;
 
-constexpr int128_t operator>>(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator>>(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns the bitwise right shift of `lhs` without exception.
@@ -398,12 +393,12 @@ This operation is subject to mixed sign limitations discussed xref:int128_t.adoc
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator+(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator+(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator+(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator+(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr int128_t operator+(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator+(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns as an `int128_t` the sum of `lhs` and `rhs`.
@@ -415,12 +410,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator-(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator-(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator-(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator-(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr int128_t operator-(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator-(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns as an `int128_t` the difference of `lhs` and `rhs`.
@@ -432,12 +427,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator*(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator*(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr int128_t operator*(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns as an `int128_t` the product of `lhs` and `rhs`.
@@ -449,12 +444,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator/(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator/(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr int128_t operator/(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns as an `int128_t` the quotient of `lhs` and `rhs` without exception.
@@ -465,12 +460,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator%(const int128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr int128_t operator%(const Integer lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const Integer lhs, const int128_t rhs) noexcept;
 
-constexpr int128_t operator%(const int128_t lhs, const int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const int128_t rhs) noexcept;
 ----
 
 Returns as an `int128_t` the remainder of `lhs` and `rhs` without exception.
diff --git a/doc/modules/ROOT/pages/literals.adoc b/doc/modules/ROOT/pages/literals.adoc
index c075e67b..bb4a5a0f 100644
--- a/doc/modules/ROOT/pages/literals.adoc
+++ b/doc/modules/ROOT/pages/literals.adoc
@@ -18,29 +18,29 @@ namespace boost {
 namespace int128 {
 namespace literals {
 
-constexpr uint128_t operator ""_u128(const char* str) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_u128(const char* str) noexcept;
 
-constexpr uint128_t operator ""_U128(const char* str) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_U128(const char* str) noexcept;
 
-constexpr uint128_t operator ""_u128(const char* str, std::size_t len) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_u128(const char* str, std::size_t len) noexcept;
 
-constexpr uint128_t operator ""_U128(const char* str, std::size_t len) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_U128(const char* str, std::size_t len) noexcept;
 
-constexpr uint128_t operator ""_u128(unsigned long long v) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_u128(unsigned long long v) noexcept;
 
-constexpr uint128_t operator ""_U128(unsigned long long v) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_U128(unsigned long long v) noexcept;
 
-constexpr int128_t operator ""_i128(const char* str) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_i128(const char* str) noexcept;
 
-constexpr int128_t operator ""_I128(const char* str) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_I128(const char* str) noexcept;
 
-constexpr int128_t operator ""_i128(unsigned long long v) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_i128(const char* str, std::size_t len) noexcept;
 
-constexpr int128_t operator ""_I128(unsigned long long v) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_I128(const char* str, std::size_t len) noexcept;
 
-constexpr int128_t operator ""_i128(const char* str, std::size_t len) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_i128(unsigned long long v) noexcept;
 
-constexpr int128_t operator ""_I128(const char* str, std::size_t len) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_I128(unsigned long long v) noexcept;
 
 } // namespace literals
 } // namespace int128
diff --git a/doc/modules/ROOT/pages/mixed_type_ops.adoc b/doc/modules/ROOT/pages/mixed_type_ops.adoc
index bc932eae..38d2e356 100644
--- a/doc/modules/ROOT/pages/mixed_type_ops.adoc
+++ b/doc/modules/ROOT/pages/mixed_type_ops.adoc
@@ -32,55 +32,194 @@ namespace int128 {
 // Comparison Operators
 //=====================================
 
-constexpr bool operator==(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(uint128_t lhs, int128_t rhs);
 
-constexpr bool operator==(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(int128_t lhs, uint128_t rhs);
 
-constexpr bool operator!=(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(uint128_t lhs, int128_t rhs);
 
-constexpr bool operator!=(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(int128_t lhs, uint128_t rhs);
 
-constexpr bool operator<(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(uint128_t lhs, int128_t rhs);
 
-constexpr bool operator<(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(int128_t lhs, uint128_t rhs);
 
-constexpr bool operator<=(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(uint128_t lhs, int128_t rhs);
 
-constexpr bool operator<=(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(int128_t lhs, uint128_t rhs);
 
-constexpr bool operator>(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(uint128_t lhs, int128_t rhs);
 
-constexpr bool operator>(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(int128_t lhs, uint128_t rhs);
 
-constexpr bool operator>=(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(uint128_t lhs, int128_t rhs);
 
-constexpr bool operator>=(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(int128_t lhs, uint128_t rhs);
 
 //=====================================
 // Arithmetic Operators
 //=====================================
 
-constexpr uint128_t operator+(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(uint128_t lhs, int128_t rhs);
 
-constexpr uint128_t operator+(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(int128_t lhs, uint128_t rhs);
 
-constexpr uint128_t operator-(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(uint128_t lhs, int128_t rhs);
 
-constexpr uint128_t operator-(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(int128_t lhs, uint128_t rhs);
 
-constexpr uint128_t operator*(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(uint128_t lhs, int128_t rhs);
 
-constexpr uint128_t operator*(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(int128_t lhs, uint128_t rhs);
 
-constexpr uint128_t operator/(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(uint128_t lhs, int128_t rhs);
 
-constexpr uint128_t operator/(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(int128_t lhs, uint128_t rhs);
 
-constexpr uint128_t operator%(uint128_t lhs, int128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(uint128_t lhs, int128_t rhs);
 
-constexpr uint128_t operator%(int128_t lhs, uint128_t rhs);
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(int128_t lhs, uint128_t rhs);
 
 } // namespace int128
 } // namespace boost
 
 ----
+
+== Comparisons
+
+If you define xref:config.adoc#sign_compare[`BOOST_INT128_ALLOW_SIGN_COMPARE`], the operators have the following behavior.
+
+=== Equality
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(int128_t lhs, uint128_t rhs);
+----
+
+If the `int128_t` argument is less than 0, returns `false`.
+Otherwise, returns the same as `static_cast<uint128_t>(lhs) == static_cast<uint128_t>(rhs)`.
+
+=== Inequality
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(int128_t lhs, uint128_t rhs);
+----
+
+If the `int128_t` argument is less than 0, returns `true`.
+Otherwise, returns the same as `static_cast<uint128_t>(lhs) != static_cast<uint128_t>(rhs)`.
+
+=== Less Than
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(int128_t lhs, uint128_t rhs);
+----
+
+If `lhs` is type `int128_t`, returns `true` if `lhs < 0`.
+If `rhs` is type `int128_t`, returns `false` if `rhs < 0`.
+Otherwise, returns the same as `static_cast<uint128_t>(lhs) < static_cast<uint128_t>(rhs)`.
+
+=== Less Than or Equal To
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(int128_t lhs, uint128_t rhs);
+----
+
+If `lhs` is type `int128_t`, returns `true` if `lhs < 0`.
+If `rhs` is type `int128_t`, returns `false` if `rhs < 0`.
+Otherwise, returns the same as `static_cast<uint128_t>(lhs) pass:[<=] static_cast<uint128_t>(rhs)`.
+
+=== Greater Than
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(int128_t lhs, uint128_t rhs);
+----
+
+If `lhs` is type `int128_t`, returns `false` if `lhs < 0`.
+If `rhs` is type `int128_t`, returns `true` if `rhs < 0`.
+Otherwise, returns the same as `static_cast<uint128_t>(lhs) > static_cast<uint128_t>(rhs)`.
+
+=== Greater Than or Equal To
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(int128_t lhs, uint128_t rhs);
+----
+
+If `lhs` is type `int128_t`, returns `false` if `lhs < 0`.
+If `rhs` is type `int128_t`, returns `true` if `rhs < 0`.
+Otherwise, returns the same as `static_cast<uint128_t>(lhs) pass:[>=] static_cast<uint128_t>(rhs)`.
+
+== Arithmetic
+
+If you define xref:config.adoc#sign_conversion[`BOOST_INT128_ALLOW_SIGN_CONVERSION`], the operators have the following behavior.
+
+=== Addition
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(int128_t lhs, uint128_t rhs);
+----
+
+Returns the same as `static_cast<uint128_t>(lhs) + static_cast<uint128_t>(rhs)`.
+
+=== Subtraction
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(int128_t lhs, uint128_t rhs);
+----
+
+Returns the same as `static_cast<uint128_t>(lhs) - static_cast<uint128_t>(rhs)`.
+
+=== Multiplication
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(int128_t lhs, uint128_t rhs);
+----
+
+Returns the same as `static_cast<uint128_t>(lhs) * static_cast<uint128_t>(rhs)`.
+
+=== Division
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(int128_t lhs, uint128_t rhs);
+----
+
+Returns the same as `static_cast<uint128_t>(lhs) / static_cast<uint128_t>(rhs)`.
+
+=== Modulo
+
+[source, c++]
+----
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(uint128_t lhs, int128_t rhs);
+
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(int128_t lhs, uint128_t rhs);
+----
+
+Returns the same as `static_cast<uint128_t>(lhs) % static_cast<uint128_t>(rhs)`.
diff --git a/doc/modules/ROOT/pages/numeric.adoc b/doc/modules/ROOT/pages/numeric.adoc
index 904dc145..432044c0 100644
--- a/doc/modules/ROOT/pages/numeric.adoc
+++ b/doc/modules/ROOT/pages/numeric.adoc
@@ -23,21 +23,21 @@ The following functions are provided for saturating arithmetic, and they *do not
 namespace boost {
 namespace int128 {
 
-constexpr uint128_t add_sat(uint128_t lhs, uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t add_sat(uint128_t lhs, uint128_t rhs) noexcept;
 
-constexpr int128_t add_sat(int128_t lhs, int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(int128_t lhs, int128_t rhs) noexcept;
 
-constexpr uint128_t sub_sat(uint128_t lhs, uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t sub_sat(uint128_t lhs, uint128_t rhs) noexcept;
 
-constexpr int128_t sub_sat(int128_t lhs, int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(int128_t lhs, int128_t rhs) noexcept;
 
-constexpr uint128_t mul_sat(uint128_t lhs, uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t mul_sat(uint128_t lhs, uint128_t rhs) noexcept;
 
-constexpr int128_t mul_sat(int128_t lhs, int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t mul_sat(int128_t lhs, int128_t rhs) noexcept;
 
-constexpr uint128_t div_sat(uint128_t lhs, uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t div_sat(uint128_t lhs, uint128_t rhs) noexcept;
 
-constexpr int128_t div_sat(int128_t lhs, int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t div_sat(int128_t lhs, int128_t rhs) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -57,8 +57,11 @@ Should the `TargetIntegerType` not be able to represent the value of the `Librar
 namespace boost {
 namespace int128 {
 
-constexpr <typename LibraryIntegerType, typename TargetIntegerType>
-constexpr TargetIntegerType saturate_cast(LibraryIntegerType x) noexcept;
+template <typename TargetType>
+BOOST_INT128_HOST_DEVICE constexpr TargetType saturate_cast(uint128_t value) noexcept;
+
+template <typename TargetType>
+BOOST_INT128_HOST_DEVICE constexpr TargetType saturate_cast(int128_t value) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -76,9 +79,9 @@ Computes the greatest common divisor of `a` and `b`.
 namespace boost {
 namespace int128 {
 
-constexpr uint128_t gcd(uint128_t a, uint128_t b) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t gcd(uint128_t a, uint128_t b) noexcept;
 
-constexpr int128_t gcd(const int128_t a, const int128_t b) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t gcd(const int128_t a, const int128_t b) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -97,9 +100,9 @@ Computes the least common multiple of `a` and `b`.
 namespace boost {
 namespace int128 {
 
-constexpr uint128_t lcm(uint128_t a, uint128_t b) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t lcm(uint128_t a, uint128_t b) noexcept;
 
-constexpr int128_t lcm(const int128_t a, const int128_t b) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t lcm(const int128_t a, const int128_t b) noexcept;
 
 } // namespace int128
 } // namespace boost
@@ -118,9 +121,9 @@ Computes the midpoint of `a` and `b`, rounding towards `a`.
 namespace boost {
 namespace int128 {
 
-constexpr uint128_t midpoint(uint128_t a, uint128_t b) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t midpoint(uint128_t a, uint128_t b) noexcept;
 
-constexpr int128_t midpoint(const int128_t a, const int128_t b) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t midpoint(const int128_t a, const int128_t b) noexcept;
 
 } // namespace int128
 } // namespace boost
diff --git a/doc/modules/ROOT/pages/overview.adoc b/doc/modules/ROOT/pages/overview.adoc
index 236a54d4..bf37e313 100644
--- a/doc/modules/ROOT/pages/overview.adoc
+++ b/doc/modules/ROOT/pages/overview.adoc
@@ -15,16 +15,23 @@ Matt Borland
 Boost.Int128 is a portable implementation of a signed, and an unsigned 128-bit integer and related functionality (i.e. `<bit>`, `<iostream>`, etc).
 Importantly, on all platforms, the `sizeof` the types provided in this library are exactly 128-bits.
 
-The library is header-only, has no dependencies, and requires only C++14.
+The library is header-only, has no dependencies, and requires only pass:[C++14].
+When using pass:[C++20] or newer, the library can be consumed as a module via `import boost.int128;`.
 
 == Motivation
 
-128-bit integers are remarkably useful in a number of domains, but portability is often an issue.
-An example is a 64-bit machine running Linux (say Ubuntu 24.04) has `__int128`, but the same exact machine running Windows does not have this type.
+128-bit integers are useful across many domains, but pass:[C++] provides no portable way to use them.
+GCC and Clang offer `__int128` as a non-standard extension on 64-bit targets, but it lacks `std::numeric_limits` specializations and `<iostream>` support, and is absent entirely on MSVC.
+Multiprecision libraries can fill the gap, but typically at the cost of a larger `sizeof` and additional overhead (e.g., Boost.Multiprecision always has an extra word).
+Boost.Int128 solves this by providing types that are exactly 128 bits on every platform.
+Operation implementations rely on compiler intrinsics where available for native performance, and optimized software implementations elsewhere.
+The types provided by the library, along with many of its functions, also natively support running on GPUs using CUDA.
 
 == Use Cases
 
-Anywhere 128-bit integers are needed.
+* **Networking** — IPv6 addresses are 128 bits wide; a single integer makes masking, comparison, and arithmetic straightforward.
+* **Unique identifiers** — UUIDs / GUIDs are 128-bit values commonly used as database keys and distributed system identifiers.
+* **Scientific and financial computing** — Extended-range accumulators, large combinatorial values, and algorithms that need overflow-free 64×64-bit multiplication.
 
 == Supported Compilers
 
@@ -35,6 +42,7 @@ as well as emulated PPC64LE using QEMU with the following compilers:
 * Clang 5 and later
 * Visual Studio 2017 (14.1) and later
 * Intel OneAPI DPC++ 2024.2 and later
+* NVCC 12.8 and later
 
 Tested on https://github.com/cppalliance/int128/actions[Github Actions] and https://drone.cpp.al/cppalliance/int128[Drone].
 Coverage can be found on https://app.codecov.io/gh/cppalliance/int128[Codecov].
diff --git a/doc/modules/ROOT/pages/printer.adoc b/doc/modules/ROOT/pages/printer.adoc
index d1181dca..682651cb 100644
--- a/doc/modules/ROOT/pages/printer.adoc
+++ b/doc/modules/ROOT/pages/printer.adoc
@@ -8,8 +8,8 @@ https://www.boost.org/LICENSE_1_0.txt
 = Pretty Printers
 :idprefix: pretty_printers_
 
-Pretty printers allow debuggers to display `uint128_t` and `int128_t` values in human-readable decimal format instead of showing the raw struct members.
-The library contains pretty printers for LLDB and GDB in the `extra/` folder.
+Pretty printers allow debuggers to display `uint128_t` and `int128_t` values in human-readable format instead of showing the raw struct members.
+The library contains pretty printers for LLDB, GDB, and Visual Studio in the `extra/` folder.
 
 == LLDB
 
@@ -39,3 +39,76 @@ or you can source it manually in GDB.
 ----
 (gdb) source /path/to/int128/extra/int128_printer_gdb.py
 ----
+
+== Visual Studio (NATVIS)
+
+The `extra/int128.natvis` file provides visualization for the Visual Studio debugger.
+There are several ways to register it:
+
+=== Per-Project
+
+Add the `.natvis` file to your Visual Studio project.
+In Solution Explorer, right-click the project, select **Add > Existing Item**, and choose `int128.natvis`.
+Visual Studio will automatically use it when debugging that project.
+
+=== Per-User (All Projects)
+
+Copy `int128.natvis` to your per-user Visualizers directory:
+
+[source]
+----
+%USERPROFILE%\Documents\Visual Studio 2022\Visualizers\
+----
+
+Replace `2022` with your Visual Studio version.
+All projects debugged with that installation will use the visualizer.
+
+=== CMake Projects
+
+Add the `.natvis` file as a source file in your `CMakeLists.txt`:
+
+[source,cmake]
+----
+target_sources(my_target PRIVATE /path/to/int128/extra/int128.natvis)
+----
+
+=== Display Format
+
+Values that fit in 64 bits are displayed in decimal.
+Larger values are displayed as a synthesized hexadecimal value with a `'` digit separator between the high and low halves.
+
+[cols="1,1,2", options="header"]
+|===
+| Type | Value | Display
+
+| `uint128_t`
+| `42`
+| `42`
+
+| `uint128_t`
+| `2^64 + 1`
+| `0x0000000000000001'0000000000000001`
+
+| `uint128_t`
+| `uint128_max`
+| `0xFFFFFFFFFFFFFFFF'FFFFFFFFFFFFFFFF`
+
+| `int128_t`
+| `42`
+| `42`
+
+| `int128_t`
+| `-5`
+| `-5`
+
+| `int128_t`
+| `2^64 + 1`
+| `0x0000000000000001'0000000000000001`
+
+| `int128_t`
+| `int128_min`
+| `0x8000000000000000'0000000000000000`
+|===
+
+NOTE: Full decimal display for values beyond 64 bits is not possible in NATVIS.
+The NATVIS expression evaluator does not support 128-bit arithmetic, so values that exceed the 64-bit range are shown in hexadecimal.
diff --git a/doc/modules/ROOT/pages/stream.adoc b/doc/modules/ROOT/pages/stream.adoc
index bf6de608..04403303 100644
--- a/doc/modules/ROOT/pages/stream.adoc
+++ b/doc/modules/ROOT/pages/stream.adoc
@@ -44,7 +44,9 @@ The following flags from `<ios>` are supported for both streaming directions:
 - `std::oct` - Octal Numbers
 - `std::dec` - Decimal Numbers
 - `std::hex` - Hexadecimal Numbers
-- `std::uppercase` - Upper Case Formatting (e.g. 0XFFFF)
-- `std::nouppercase` - Lower Case Formatting (e.g. 0xffff)
+- `std::uppercase` - Upper Case Formatting (e.g. FFFF)
+- `std::nouppercase` - Lower Case Formatting (e.g. ffff)
+- `std::showbase` - Adds the base prefix to hexadecimal and octal numbers (e.g. 0xffff)
+- `std::noshowbase` - Omits the base prefix from hexadecimal and octal numbers (e.g. ffff)
 
 See the xref:examples.adoc#examples_io[IO streaming example] for usage demonstrations.
diff --git a/doc/modules/ROOT/pages/u128_benchmarks.adoc b/doc/modules/ROOT/pages/u128_benchmarks.adoc
index fabf1463..88f9a03b 100644
--- a/doc/modules/ROOT/pages/u128_benchmarks.adoc
+++ b/doc/modules/ROOT/pages/u128_benchmarks.adoc
@@ -12,7 +12,7 @@ https://www.boost.org/LICENSE_1_0.txt
 
 The benchmarks below represent the time in microseconds it takes to perform 20'000'000 operations between two values of random width (e.g. 2x1 words, 1x2 words, etc.).
 On most platforms we use the builtin `unsigned \__int128` as the reference benchmark.
-When this is unavailable (such as on 32-bit architectures) we us `boost::multiprecision::uint128_t` (abbreviated as `boost::mp::uint128_t`) as it is widely used, and known to be portable.
+When this is unavailable (such as on 32-bit architectures) we use `boost::multiprecision::uint128_t` (abbreviated as `boost::mp::uint128_t`) as it is widely used, and known to be portable.
 On MSVC platforms we use as reference `std::_Unsigned128` from the header `<__msvc_int128.hpp>` since this is bundled with their compiler.
 
 [#u128_linux]
@@ -56,7 +56,7 @@ image::u128_graphs/linux/x64_relative_performance.png[x64 Relative Performance,
 image::u128_graphs/linux/ARM64_benchmarks.png[ARM64 Benchmark Results, width=100%]
 ////
 
-image::u128_graphs/linux/ARM64_relative_performance.png[x64 Relative Performance, width=100%]
+image::u128_graphs/linux/ARM64_relative_performance.png[ARM64 Relative Performance, width=100%]
 
 === S390x
 
@@ -232,12 +232,12 @@ image::u128_graphs/macos/ARM64_relative_performance.png[ARM64 Relative Performan
 |===
 | Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t`
 
-| Comparisons | 131902 | 133564 | 134182
-| Addition | 20613 | 17912 | 40176
-| Subtraction | 20484 | 18237 | 40311
-| Multiplication | 20160 | 20580 | 43285
-| Division | 686521 | 699201 | 945928
-| Modulo | 777084 | 724648 | 953117
+| Comparisons | 688225 | 712352 | 689146
+| Addition | 104921 | 124992 | 137819
+| Subtraction | 129150 | 102302 | 153484
+| Multiplication | 120363 | 119652 | 164100
+| Division | 2333812 | 1981469 | 2784139
+| Modulo | 2621949 | 2219481 | 2736682
 |===
 
 ////
diff --git a/doc/modules/ROOT/pages/uint128_t.adoc b/doc/modules/ROOT/pages/uint128_t.adoc
index 56473ed7..88802b2c 100644
--- a/doc/modules/ROOT/pages/uint128_t.adoc
+++ b/doc/modules/ROOT/pages/uint128_t.adoc
@@ -57,12 +57,7 @@ If your platform has a native 128-bit unsigned integer, the struct is defined as
 struct alignas(alignof(unsigned __int128)) uint128_t
 ----
 
-Otherwise, it is
-
-[source, c++]
-----
-struct alignas(sizeof(std::uint64_t) * 2) uint128_t
-----
+Otherwise, the alignment is left up to the compiler to decide.
 
 [#u128_operator_behavior]
 == Operator Behavior
@@ -125,30 +120,30 @@ struct uint128_t
     ...
 
     // Defaulted basic construction
-    constexpr uint128_t() noexcept = default;
-    constexpr uint128_t(const uint128_t&) noexcept = default;
-    constexpr uint128_t(uint128_t&&) noexcept = default;
-    constexpr uint128_t& operator=(const uint128_t&) noexcept = default;
-    constexpr uint128_t& operator=(uint128_t&&) noexcept = default;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t() noexcept = default;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const uint128_t&) noexcept = default;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(uint128_t&&) noexcept = default;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator=(const uint128_t&) noexcept = default;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator=(uint128_t&&) noexcept = default;
 
-    constexpr uint128_t(const int128_t& v) noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr uint128_t(const int128_t& v) noexcept;
 
     // Construct from integral types
-    constexpr uint128_t(const std::uint64_t hi, const std::uint64_t lo) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const std::uint64_t hi, const std::uint64_t lo) noexcept;
 
     template <BOOST_INT128_SIGNED_INTEGER_CONCEPT SignedInteger>
-    constexpr uint128_t(const SignedInteger v) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const SignedInteger v) noexcept;
 
     template <BOOST_INT128_UNSIGNED_INTEGER_CONCEPT UnsignedInteger>
-    constexpr uint128_t(const UnsignedInteger v) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const UnsignedInteger v) noexcept;
 
     #ifdef BOOST_INT128_HAS_INT128
 
     // Typically a typedef from __int128
-    constexpr uint128_t(const detail::builtin_i128 v) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const detail::builtin_i128 v) noexcept;
 
     // Typically a typedef unsigned __int128
-    constexpr uint128_t(const detail::builtin_u128 v) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const detail::builtin_u128 v) noexcept;
 
     #endif // BOOST_INT128_HAS_INT128
 };
@@ -173,26 +168,26 @@ struct uint128_t
     ...
 
     // Integer conversion operators
-    constexpr operator bool() const noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr operator bool() const noexcept;
 
     template <BOOST_INT128_SIGNED_INTEGER_CONCEPT SignedInteger>
-    explicit constexpr operator SignedInteger() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator SignedInteger() const noexcept;
 
     template <BOOST_INT128_UNSIGNED_INTEGER_CONCEPT UnsignedInteger>
-    explicit constexpr operator UnsignedInteger() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator UnsignedInteger() const noexcept;
 
     #ifdef BOOST_INT128_HAS_INT128
 
-    explicit constexpr operator detail::builtin_i128() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator detail::builtin_i128() const noexcept;
 
-    explicit constexpr operator detail::builtin_u128() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator detail::builtin_u128() const noexcept;
 
     #endif // BOOST_INT128_HAS_INT128
 
     // Conversion to float
-    explicit constexpr operator float() const noexcept;
-    explicit constexpr operator double() const noexcept;
-    explicit constexpr operator long double() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator float() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator double() const noexcept;
+    explicit constexpr operator long double() const noexcept; // There are no long doubles on device
 };
 
 } // namespace int128
@@ -212,12 +207,12 @@ as the number of digits it represents can exceed the precision of the significan
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator<(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator<(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr bool operator<(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is less than the `rhs` value without exception.
@@ -228,12 +223,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator<=(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator<=(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr bool operator<=(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is less than or equal to the `rhs` value without exception.
@@ -244,12 +239,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator>(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator>(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr bool operator>(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is greater than the `rhs` value without exception.
@@ -260,12 +255,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator>=(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator>=(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr bool operator>=(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is greater than or equal to the `rhs` value without exception.
@@ -276,12 +271,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator==(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator==(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr bool operator==(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is equal to the `rhs` value without exception.
@@ -292,12 +287,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator!=(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr bool operator!=(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr bool operator!=(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns if the `lhs` value is not equal to the `rhs` value without exception.
@@ -307,13 +302,13 @@ This operation is only defined for integers and is subject to mixed sign limitat
 
 [source, c++]
 ----
-constexpr std::strong_ordering operator<=>(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const uint128_t lhs, const uint128_t rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr std::strong_ordering operator<=>(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr std::strong_ordering operator<=>(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const Integer lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns one of the following without exception:
@@ -329,7 +324,7 @@ Returns one of the following without exception:
 
 [source, c++]
 ----
-constexpr uint128_t operator~(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator~(const uint128_t rhs) noexcept;
 ----
 
 Returns the bitwise negation of `rhs` without exception.
@@ -339,12 +334,12 @@ Returns the bitwise negation of `rhs` without exception.
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator|(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator|(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr uint128_t operator|(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns the bitwise or of `lhs` and `rhs` without exception.
@@ -355,12 +350,12 @@ This operation is subject to mixed sign limitations discussed xref:uint128_t.ado
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator&(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator&(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr uint128_t operator&(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns the bitwise and of `lhs` and `rhs` without exception.
@@ -371,12 +366,12 @@ This operation is subject to mixed sign limitations discussed xref:uint128_t.ado
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator^(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator^(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr uint128_t operator^(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns the bitwise xor of `lhs` and `rhs` without exception.
@@ -387,18 +382,18 @@ This operation is subject to mixed sign limitations discussed xref:uint128_t.ado
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator<<(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator<<(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <typename Integer, std::enable_if_t<std::is_integral<Integer>::value && (sizeof(Integer) * 8 > 16), bool> = true>
-constexpr Integer operator<<(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr Integer operator<<(const Integer lhs, const uint128_t rhs) noexcept;
 
 template <typename SignedInteger, std::enable_if_t<detail::is_signed_integer_v<SignedInteger> && (sizeof(SignedInteger) * 8 <= 16), bool> = true>
-constexpr int operator<<(const SignedInteger lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int operator<<(const SignedInteger lhs, const uint128_t rhs) noexcept;
 
 template <typename UnsignedInteger, std::enable_if_t<detail::is_unsigned_integer_v<UnsignedInteger> && (sizeof(UnsignedInteger) * 8 <= 16), bool> = true>
-constexpr unsigned int operator<<(const UnsignedInteger lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr unsigned int operator<<(const UnsignedInteger lhs, const uint128_t rhs) noexcept;
 
-constexpr uint128_t operator<<(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator<<(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns the bitwise left shift of `lhs` without exception.
@@ -410,18 +405,18 @@ This operation is subject to mixed sign limitations discussed xref:uint128_t.ado
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator<<(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator>>(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <typename Integer, std::enable_if_t<std::is_integral<Integer>::value && (sizeof(Integer) * 8 > 16), bool> = true>
-constexpr Integer operator<<(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr Integer operator>>(const Integer lhs, const uint128_t rhs) noexcept;
 
 template <typename SignedInteger, std::enable_if_t<detail::is_signed_integer_v<SignedInteger> && (sizeof(SignedInteger) * 8 <= 16), bool> = true>
-constexpr int operator<<(const SignedInteger lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int operator>>(const SignedInteger lhs, const uint128_t rhs) noexcept;
 
 template <typename UnsignedInteger, std::enable_if_t<detail::is_unsigned_integer_v<UnsignedInteger> && (sizeof(UnsignedInteger) * 8 <= 16), bool> = true>
-constexpr unsigned int operator<<(const UnsignedInteger lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr unsigned int operator>>(const UnsignedInteger lhs, const uint128_t rhs) noexcept;
 
-constexpr uint128_t operator<<(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator>>(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns the bitwise right shift of `lhs` without exception.
@@ -437,12 +432,12 @@ This operation is subject to mixed sign limitations discussed xref:uint128_t.ado
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator+(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator+(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr uint128_t operator+(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns as a `uint128_t` the sum of `lhs` and `rhs`.
@@ -454,12 +449,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator-(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator-(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr uint128_t operator-(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns as a `uint128_t` the difference of `lhs` and `rhs`.
@@ -471,12 +466,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator*(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator*(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr uint128_t operator*(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns as a `uint128_t` the product of `lhs` and `rhs`.
@@ -488,12 +483,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator/(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator/(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr uint128_t operator/(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns as a `uint128_t` the quotient of `lhs` and `rhs` without exception.
@@ -504,12 +499,12 @@ This operation is only defined for integers and is subject to mixed sign limitat
 [source, c++]
 ----
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator%(const uint128_t lhs, const Integer rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const uint128_t lhs, const Integer rhs) noexcept;
 
 template <BOOST_INT128_INTEGER_CONCEPT Integer>
-constexpr uint128_t operator%(const Integer lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const Integer lhs, const uint128_t rhs) noexcept;
 
-constexpr uint128_t operator%(const uint128_t lhs, const uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const uint128_t lhs, const uint128_t rhs) noexcept;
 ----
 
 Returns as a `uint128_t` the remainder of `lhs` and `rhs` without exception.
diff --git a/doc/package-lock.json b/doc/package-lock.json
index 86eaeb65..d693fdc1 100644
--- a/doc/package-lock.json
+++ b/doc/package-lock.json
@@ -5,7 +5,7 @@
   "packages": {
     "": {
       "dependencies": {
-        "@antora/lunr-extension": "^1.0.0-alpha.12",
+        "@antora/lunr-extension": "^1.0.0-alpha.13",
         "@cppalliance/antora-downloads-extension": "^0.0.2"
       },
       "devDependencies": {
@@ -162,9 +162,9 @@
       }
     },
     "node_modules/@antora/lunr-extension": {
-      "version": "1.0.0-alpha.12",
-      "resolved": "https://registry.npmjs.org/@antora/lunr-extension/-/lunr-extension-1.0.0-alpha.12.tgz",
-      "integrity": "sha512-iiEXpJae8tCH22ao7kZ4I+eyQ/3IeFIFK1G5I9QLpkCezaVPotI8eLFY7e0xDI+zsqJEfCOsfoZGYXso6xCYlA==",
+      "version": "1.0.0-alpha.13",
+      "resolved": "https://registry.npmjs.org/@antora/lunr-extension/-/lunr-extension-1.0.0-alpha.13.tgz",
+      "integrity": "sha512-u8n8XLB6elMmXbW0bdeL5jG8UBJi6PSiz1zaMn+wIIIu/bnxotRBW4kEWSge+zTfdF4rEYMcJ9LvkAOamMyuKQ==",
       "license": "MPL-2.0",
       "workspaces": [
         "."
@@ -688,10 +688,11 @@
       "license": "MIT"
     },
     "node_modules/convict": {
-      "version": "6.2.4",
-      "resolved": "https://registry.npmjs.org/convict/-/convict-6.2.4.tgz",
-      "integrity": "sha512-qN60BAwdMVdofckX7AlohVJ2x9UvjTNoKVXCL2LxFk1l7757EJqf1nySdMkPQer0bt8kQ5lQiyZ9/2NvrFBuwQ==",
+      "version": "6.2.5",
+      "resolved": "https://registry.npmjs.org/convict/-/convict-6.2.5.tgz",
+      "integrity": "sha512-JtXpxqDqJ8P0UwEHwhxLzCIXQy97vlYBZR222Sbzb1q1Erex9ASrztJ29SyhWFQjod1AeFBaPzEEC8YvtZMIYg==",
       "dev": true,
+      "license": "Apache-2.0",
       "dependencies": {
         "lodash.clonedeep": "^4.5.0",
         "yargs-parser": "^20.2.7"
@@ -1101,9 +1102,9 @@
       }
     },
     "node_modules/handlebars": {
-      "version": "4.7.8",
-      "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.8.tgz",
-      "integrity": "sha512-vafaFqs8MZkRrSX7sFVUdo3ap/eNiLnb4IakshzvP56X5Nr1iGKAIqdX6tMlm6HcNRIkr6AxO5jFEoJzzpT8aQ==",
+      "version": "4.7.9",
+      "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.9.tgz",
+      "integrity": "sha512-4E71E0rpOaQuJR2A3xDZ+GM1HyWYv1clR58tC8emQNeQe3RH7MAzSbat+V0wG78LQBo6m6bzSG/L4pBuCsgnUQ==",
       "dev": true,
       "license": "MIT",
       "dependencies": {
@@ -1443,9 +1444,9 @@
       }
     },
     "node_modules/micromatch/node_modules/picomatch": {
-      "version": "2.3.1",
-      "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz",
-      "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==",
+      "version": "2.3.2",
+      "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz",
+      "integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==",
       "dev": true,
       "license": "MIT",
       "engines": {
@@ -1492,9 +1493,9 @@
       }
     },
     "node_modules/minimatch": {
-      "version": "3.1.2",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
-      "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==",
+      "version": "3.1.5",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.5.tgz",
+      "integrity": "sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==",
       "dev": true,
       "license": "ISC",
       "dependencies": {
@@ -1583,9 +1584,9 @@
       "license": "MIT"
     },
     "node_modules/picomatch": {
-      "version": "4.0.3",
-      "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz",
-      "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
+      "version": "4.0.4",
+      "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz",
+      "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==",
       "dev": true,
       "license": "MIT",
       "engines": {
diff --git a/doc/package.json b/doc/package.json
index 2d89d19c..2d83cedc 100644
--- a/doc/package.json
+++ b/doc/package.json
@@ -6,6 +6,6 @@
   },
   "dependencies": {
     "@cppalliance/antora-downloads-extension": "^0.0.2",
-    "@antora/lunr-extension": "^1.0.0-alpha.12"
+    "@antora/lunr-extension": "^1.0.0-alpha.13"
   }
 }
diff --git a/examples/cuda.cu b/examples/cuda.cu
new file mode 100644
index 00000000..ec20577c
--- /dev/null
+++ b/examples/cuda.cu
@@ -0,0 +1,172 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <random>
+#include <stdexcept>
+#include <vector>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// Calculates the GCD of 2 values on device.
+// Uses a flat 1D index; threads whose index is >= numElements do nothing,
+// so the grid may be rounded up to a whole number of blocks.
+__global__ void cuda_gcd(const test_type* in1, const test_type* in2, test_type* out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::gcd(in1[i], in2[i]);
+    }
+}
+
+// Allocate managed space so that the arrays can be used on both host and device.
+// Throws std::runtime_error carrying the CUDA error string on failure,
+// in which case *in is left as nullptr.
+void allocate(test_type** in, int numElements)
+{
+    cudaError_t err = cudaMallocManaged(in, static_cast<std::size_t>(numElements) * sizeof(test_type));
+    if (err != cudaSuccess)
+    {
+        *in = nullptr;
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess)
+    {
+        cudaFree(*in);
+        *in = nullptr;
+        throw std::runtime_error(cudaGetErrorString(err));
+    }
+}
+
+// Frees whichever of the three managed buffers are non-null, nulls the
+// caller's pointers, and finally resets the device.
+void cleanup(test_type** in1, test_type** in2, test_type** out)
+{
+    if (*in1 != nullptr)
+    {
+        cudaFree(*in1);
+        *in1 = nullptr;
+    }
+
+    if (*in2 != nullptr)
+    {
+        cudaFree(*in2);
+        *in2 = nullptr;
+    }
+
+    if (*out != nullptr)
+    {
+        cudaFree(*out);
+        *out = nullptr;
+    }
+
+    cudaDeviceReset();
+}
+
+int main()
+{
+    std::mt19937_64 rng {42};
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate managed space for our inputs and GPU outputs
+    // We then fill them with random numbers
+
+    test_type* in1 = nullptr;
+    test_type* in2 = nullptr;
+    test_type* out = nullptr;
+
+    try
+    {
+        allocate(&in1, numElements);
+        allocate(&in2, numElements);
+        allocate(&out, numElements);
+    }
+    catch (const std::runtime_error& e)
+    {
+        // Release whichever buffers were allocated before the failure
+        std::cerr << "Failed to allocate managed memory (" << e.what() << ")!" << std::endl;
+        cleanup(&in1, &in2, &out);
+        return EXIT_FAILURE;
+    }
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        in1[i] = dist(rng);
+        in2[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    // Launch the CUDA kernel and check for errors.
+    // An invalid launch is reported by cudaGetLastError() right after the launch;
+    // a fault during execution is reported by the synchronizing call that follows.
+
+    cuda_gcd<<<blocksPerGrid, threadsPerBlock>>>(in1, in2, out, numElements);
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        cleanup(&in1, &in2, &out);
+        return EXIT_FAILURE;
+    }
+
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Kernel execution failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        cleanup(&in1, &in2, &out);
+        return EXIT_FAILURE;
+    }
+
+    // We now will perform the same operation using the same inputs on CPU,
+    // to compare the results for equality
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.emplace_back(boost::int128::gcd(in1[i], in2[i]));
+    }
+
+    // We can now compare that our operation on GPU and the same operation on CPU have identical results
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (out[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element: " << i << "!" << std::endl;
+            cleanup(&in1, &in2, &out);
+            return EXIT_FAILURE;
+        }
+    }
+
+    cleanup(&in1, &in2, &out);
+
+    std::cout << "All CPU and GPU computed elements match!" << std::endl;
+
+    return EXIT_SUCCESS;
+}
diff --git a/examples/math_and_random.cpp b/examples/math_and_random.cpp
index 8a0a5110..d9897028 100644
--- a/examples/math_and_random.cpp
+++ b/examples/math_and_random.cpp
@@ -6,6 +6,7 @@
 #define BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 #include <boost/int128.hpp>
+#include <boost/int128/random.hpp> // Not included in the convenience header, but needed for boost.random interop
 
 #ifdef __clang__
 #pragma clang diagnostic push
@@ -25,19 +26,10 @@
 #include <array>
 #include <random>
 
-// For some bizare reason that I can't figure out Clang-Cl x86 in Github Actions crashes only with C++14
-// I can't replicate this crash locally
-#if defined(_WIN32) && defined(__clang__) && defined(__cplusplus) && __cplusplus == 201402L
-
 int main()
 {
-    return 0;
-}
+    std::cout << "=== uint128_t ===" << '\n';
 
-#else
-
-int main()
-{
     // Setup our rng and distribution
     std::mt19937_64 rng {42};
     boost::random::uniform_int_distribution<boost::int128::uint128_t> dist {0, (std::numeric_limits<boost::int128::uint128_t>::max)()};
@@ -50,11 +42,16 @@ int main()
     }
 
     // Perform some rudimentary statistical analysis on our dataset
-    std::cout << "    Mean: " << boost::math::statistics::mean(data_set) << std::endl;
-    std::cout << "Variance: " << boost::math::statistics::variance(data_set) << std::endl;
-    std::cout << "  Median: " << boost::math::statistics::median(data_set) << std::endl;
+    std::cout << "    Mean: " << boost::math::statistics::mean(data_set) << '\n';
+    std::cout << "Variance: " << boost::math::statistics::variance(data_set) << '\n';
+    std::cout << "  Median: " << boost::math::statistics::median(data_set) << '\n';
+
+    std::cout << "=== int128_t ===" << '\n';
+
+    // We can also generate random signed integers using int128_t
+    boost::random::uniform_int_distribution<boost::int128::int128_t> signed_dist {std::numeric_limits<boost::int128::int128_t>::min(), std::numeric_limits<boost::int128::int128_t>::max()};
+
+    std::cout << "Random int128_t: " << signed_dist(rng) << std::endl;
 
     return 0;
 }
-
-#endif
diff --git a/extra/int128.natvis b/extra/int128.natvis
new file mode 100644
index 00000000..27fa4fa8
--- /dev/null
+++ b/extra/int128.natvis
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+  Copyright 2025 Matt Borland
+  Distributed under the Boost Software License, Version 1.0.
+  https://www.boost.org/LICENSE_1_0.txt
+
+  NATVIS visualizer for boost::int128::uint128_t and boost::int128::int128_t.
+
+  Place this file alongside your project or in:
+    %USERPROFILE%\Documents\Visual Studio <version>\Visualizers\
+
+  Values that fit in 64 bits are displayed in decimal.
+  Larger values are displayed as a synthesized hexadecimal value using
+  nibble-by-nibble extraction for guaranteed zero-padded output.
+-->
+<AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
+
+  <!-- uint128_t: shown in decimal when high == 0, otherwise as 0x<high>'<low> with 16 hex digits per half -->
+  <Type Name="boost::int128::uint128_t">
+    <DisplayString Condition="high == 0">{low,u}</DisplayString>
+    <DisplayString>0x{high>>60&amp;0xF,X}{high>>56&amp;0xF,X}{high>>52&amp;0xF,X}{high>>48&amp;0xF,X}{high>>44&amp;0xF,X}{high>>40&amp;0xF,X}{high>>36&amp;0xF,X}{high>>32&amp;0xF,X}{high>>28&amp;0xF,X}{high>>24&amp;0xF,X}{high>>20&amp;0xF,X}{high>>16&amp;0xF,X}{high>>12&amp;0xF,X}{high>>8&amp;0xF,X}{high>>4&amp;0xF,X}{high&amp;0xF,X}'{low>>60&amp;0xF,X}{low>>56&amp;0xF,X}{low>>52&amp;0xF,X}{low>>48&amp;0xF,X}{low>>44&amp;0xF,X}{low>>40&amp;0xF,X}{low>>36&amp;0xF,X}{low>>32&amp;0xF,X}{low>>28&amp;0xF,X}{low>>24&amp;0xF,X}{low>>20&amp;0xF,X}{low>>16&amp;0xF,X}{low>>12&amp;0xF,X}{low>>8&amp;0xF,X}{low>>4&amp;0xF,X}{low&amp;0xF,X}</DisplayString>
+    <Expand>
+      <Item Name="high">high</Item>
+      <Item Name="low">low</Item>
+    </Expand>
+  </Type>
+
+  <!-- int128_t: decimal when high == 0, negative decimal when high == -1 and low != 0, otherwise 0x<high>'<low> in hex -->
+  <Type Name="boost::int128::int128_t">
+    <DisplayString Condition="high == 0">{low,u}</DisplayString>
+    <DisplayString Condition="high == -1 &amp;&amp; low != 0">-{~low + 1,u}</DisplayString>
+    <DisplayString>0x{(unsigned __int64)high>>60&amp;0xF,X}{(unsigned __int64)high>>56&amp;0xF,X}{(unsigned __int64)high>>52&amp;0xF,X}{(unsigned __int64)high>>48&amp;0xF,X}{(unsigned __int64)high>>44&amp;0xF,X}{(unsigned __int64)high>>40&amp;0xF,X}{(unsigned __int64)high>>36&amp;0xF,X}{(unsigned __int64)high>>32&amp;0xF,X}{(unsigned __int64)high>>28&amp;0xF,X}{(unsigned __int64)high>>24&amp;0xF,X}{(unsigned __int64)high>>20&amp;0xF,X}{(unsigned __int64)high>>16&amp;0xF,X}{(unsigned __int64)high>>12&amp;0xF,X}{(unsigned __int64)high>>8&amp;0xF,X}{(unsigned __int64)high>>4&amp;0xF,X}{(unsigned __int64)high&amp;0xF,X}'{low>>60&amp;0xF,X}{low>>56&amp;0xF,X}{low>>52&amp;0xF,X}{low>>48&amp;0xF,X}{low>>44&amp;0xF,X}{low>>40&amp;0xF,X}{low>>36&amp;0xF,X}{low>>32&amp;0xF,X}{low>>28&amp;0xF,X}{low>>24&amp;0xF,X}{low>>20&amp;0xF,X}{low>>16&amp;0xF,X}{low>>12&amp;0xF,X}{low>>8&amp;0xF,X}{low>>4&amp;0xF,X}{low&amp;0xF,X}</DisplayString>
+    <Expand>
+      <Item Name="high">high</Item>
+      <Item Name="low">low</Item>
+    </Expand>
+  </Type>
+
+</AutoVisualizer>
diff --git a/include/boost/int128/bit.hpp b/include/boost/int128/bit.hpp
index 578e0ab1..e014008a 100644
--- a/include/boost/int128/bit.hpp
+++ b/include/boost/int128/bit.hpp
@@ -13,61 +13,61 @@
 namespace boost {
 namespace int128 {
 
-BOOST_INT128_EXPORT constexpr bool has_single_bit(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool has_single_bit(const uint128_t x) noexcept
 {
     return x && !(x & (x - 1U));
 }
 
-BOOST_INT128_EXPORT constexpr int countl_zero(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int countl_zero(const uint128_t x) noexcept
 {
     return x.high == 0 ? 64 + detail::countl_zero(x.low) : detail::countl_zero(x.high);
 }
 
-BOOST_INT128_EXPORT constexpr int countl_one(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int countl_one(const uint128_t x) noexcept
 {
     return countl_zero(~x);
 }
 
-BOOST_INT128_EXPORT constexpr int bit_width(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int bit_width(const uint128_t x) noexcept
 {
     return x ? 128 - countl_zero(x) : 0;
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t bit_ceil(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t bit_ceil(const uint128_t x) noexcept
 {
     return x <= 1U ? static_cast<uint128_t>(1) : static_cast<uint128_t>(1) << bit_width(x - 1U);
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t bit_floor(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t bit_floor(const uint128_t x) noexcept
 {
     return x > 0U ? static_cast<uint128_t>(1) << (bit_width(x) - 1U) : static_cast<uint128_t>(0);
 }
 
-BOOST_INT128_EXPORT constexpr int countr_zero(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int countr_zero(const uint128_t x) noexcept
 {
     return x.low == 0 ? 64 + detail::countr_zero(x.high) : detail::countr_zero(x.low);
 }
 
-BOOST_INT128_EXPORT constexpr int countr_one(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int countr_one(const uint128_t x) noexcept
 {
     return countr_zero(~x);
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t rotl(const uint128_t x, const int s) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t rotl(const uint128_t x, const int s) noexcept
 {
     constexpr auto mask {127U};
     return x << (static_cast<unsigned>(s) & mask) | x >> (static_cast<unsigned>(-s) & mask);
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t rotr(const uint128_t x, const int s) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t rotr(const uint128_t x, const int s) noexcept
 {
     constexpr auto mask {127U};
     return x >> (static_cast<unsigned>(s) & mask) | x << (static_cast<unsigned>(-s) & mask);
 }
 
-#if BOOST_INT128_HAS_BUILTIN(__builtin_popcountll)
+#if BOOST_INT128_HAS_BUILTIN(__builtin_popcountll) && !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
-BOOST_INT128_EXPORT constexpr int popcount(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int popcount(const uint128_t x) noexcept
 {
     return __builtin_popcountll(x.high) + __builtin_popcountll(x.low);
 }
@@ -76,7 +76,7 @@ BOOST_INT128_EXPORT constexpr int popcount(const uint128_t x) noexcept
 
 namespace impl {
 
-constexpr int popcount_impl(std::uint64_t x) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int popcount_impl(std::uint64_t x) noexcept
 {
     x = x - ((x >> 1U) & UINT64_C(0x5555555555555555));
     x = (x & UINT64_C(0x3333333333333333)) + ((x >> 2U) & UINT64_C(0x3333333333333333));
@@ -89,7 +89,7 @@ constexpr int popcount_impl(std::uint64_t x) noexcept
 
 #if defined(_M_AMD64) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !BOOST_INT128_HAS_BUILTIN(__builtin_popcountll)
 
-BOOST_INT128_EXPORT constexpr int popcount(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int popcount(const uint128_t x) noexcept
 {
     if (BOOST_INT128_IS_CONSTANT_EVALUATED(x))
     {
@@ -111,7 +111,7 @@ BOOST_INT128_EXPORT constexpr int popcount(const uint128_t x) noexcept
 
 #elif defined(_M_IX86) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !BOOST_INT128_HAS_BUILTIN(__builtin_popcountll)
 
-BOOST_INT128_EXPORT constexpr int popcount(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int popcount(const uint128_t x) noexcept
 {
     if (BOOST_INT128_IS_CONSTANT_EVALUATED(x))
     {
@@ -139,18 +139,18 @@ BOOST_INT128_EXPORT constexpr int popcount(const uint128_t x) noexcept
     }
 }
 
-#elif !BOOST_INT128_HAS_BUILTIN(__builtin_popcountll)
+#elif !BOOST_INT128_HAS_BUILTIN(__builtin_popcountll) || (defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
-BOOST_INT128_EXPORT constexpr int popcount(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int popcount(const uint128_t x) noexcept
 {
     return impl::popcount_impl(x.high) + impl::popcount_impl(x.low);
 }
 
 #endif
 
-#if BOOST_INT128_HAS_BUILTIN(__builtin_bswap64)
+#if BOOST_INT128_HAS_BUILTIN(__builtin_bswap64) && !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
-BOOST_INT128_EXPORT constexpr uint128_t byteswap(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t byteswap(const uint128_t x) noexcept
 {
     return {__builtin_bswap64(x.low), __builtin_bswap64(x.high)};
 }
@@ -159,14 +159,14 @@ BOOST_INT128_EXPORT constexpr uint128_t byteswap(const uint128_t x) noexcept
 
 namespace impl {
 
-BOOST_INT128_EXPORT constexpr std::uint64_t byteswap_impl(const std::uint64_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr std::uint64_t byteswap_impl(const std::uint64_t x) noexcept
 {
     const auto step32 {x << 32U | x >> 32U};
     const auto step16 {(step32 & UINT64_C(0x0000FFFF0000FFFF)) << 16U | (step32 & UINT64_C(0xFFFF0000FFFF0000)) >> 16U};
     return (step16 & UINT64_C(0x00FF00FF00FF00FF)) << 8U | (step16 & UINT64_C(0xFF00FF00FF00FF00)) >> 8U;
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t byteswap_impl(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t byteswap_impl(const uint128_t x) noexcept
 {
     return {byteswap_impl(x.low), byteswap_impl(x.high)};
 }
@@ -175,7 +175,7 @@ BOOST_INT128_EXPORT constexpr uint128_t byteswap_impl(const uint128_t x) noexcep
 
 #if defined(_MSC_VER) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !BOOST_INT128_HAS_BUILTIN(__builtin_bswap64)
 
-BOOST_INT128_EXPORT constexpr uint128_t byteswap(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t byteswap(const uint128_t x) noexcept
 {
     if (BOOST_INT128_IS_CONSTANT_EVALUATED(x))
     {
@@ -187,9 +187,9 @@ BOOST_INT128_EXPORT constexpr uint128_t byteswap(const uint128_t x) noexcept
     }
 }
 
-#elif !BOOST_INT128_HAS_BUILTIN(__builtin_bswap64)
+#elif !BOOST_INT128_HAS_BUILTIN(__builtin_bswap64) || (defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
-BOOST_INT128_EXPORT constexpr uint128_t byteswap(const uint128_t x) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t byteswap(const uint128_t x) noexcept
 {
     return impl::byteswap_impl(x);
 }
diff --git a/include/boost/int128/charconv.hpp b/include/boost/int128/charconv.hpp
index 9171cae7..01b4fcb9 100644
--- a/include/boost/int128/charconv.hpp
+++ b/include/boost/int128/charconv.hpp
@@ -10,6 +10,12 @@
 
 #if __has_include(<boost/charconv.hpp>)
 
+// Define for the user automatically,
+// otherwise we'll have an ever-increasing number of these required as we go down the dependency chain
+#if defined(BOOST_INT128_ENABLE_CUDA) && !defined(BOOST_CHARCONV_ENABLE_CUDA)
+#  define BOOST_CHARCONV_ENABLE_CUDA
+#endif
+
 #include <boost/int128/int128.hpp>
 #include <boost/int128/literals.hpp>
 #include <boost/charconv.hpp>
@@ -38,6 +44,24 @@ struct make_signed<int128::uint128_t> { using type = int128::int128_t; };
 template <>
 struct make_signed<int128::int128_t> { using type = int128::int128_t; };
 
+#if defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA)
+
+template <>
+__host__ __device__ constexpr int128::uint128_t get_max_value<int128::uint128_t>()
+{
+    return std::numeric_limits<int128::uint128_t>::max();
+}
+
+template <>
+__host__ __device__ constexpr int128::int128_t get_max_value<int128::int128_t>()
+{
+    return std::numeric_limits<int128::int128_t>::max();
+}
+
+#endif // defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA)
+
+#if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
 BOOST_INT128_INLINE_CONSTEXPR int128::uint128_t int128_pow10[39] =
 {
     int128::uint128_t{UINT64_C(0x0), UINT64_C(0x1)},
@@ -81,8 +105,57 @@ BOOST_INT128_INLINE_CONSTEXPR int128::uint128_t int128_pow10[39] =
     int128::uint128_t{UINT64_C(0x4b3b4ca85a86c47a), UINT64_C(0x98a224000000000)}
 };
 
-constexpr int num_digits(const int128::uint128_t& x) noexcept
+#endif // !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
+BOOST_INT128_HOST_DEVICE constexpr int num_digits(const int128::uint128_t& x) noexcept
 {
+    #if defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA)
+
+    constexpr int128::uint128_t int128_pow10[39] =
+    {
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x1)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0xa)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x64)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x3e8)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x2710)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x186a0)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0xf4240)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x989680)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x5f5e100)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x3b9aca00)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x2540be400)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x174876e800)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0xe8d4a51000)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x9184e72a000)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x5af3107a4000)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x38d7ea4c68000)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x2386f26fc10000)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x16345785d8a0000)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0xde0b6b3a7640000)},
+        int128::uint128_t{UINT64_C(0x0), UINT64_C(0x8ac7230489e80000)},
+        int128::uint128_t{UINT64_C(0x5), UINT64_C(0x6bc75e2d63100000)},
+        int128::uint128_t{UINT64_C(0x36), UINT64_C(0x35c9adc5dea00000)},
+        int128::uint128_t{UINT64_C(0x21e), UINT64_C(0x19e0c9bab2400000)},
+        int128::uint128_t{UINT64_C(0x152d), UINT64_C(0x2c7e14af6800000)},
+        int128::uint128_t{UINT64_C(0xd3c2), UINT64_C(0x1bcecceda1000000)},
+        int128::uint128_t{UINT64_C(0x84595), UINT64_C(0x161401484a000000)},
+        int128::uint128_t{UINT64_C(0x52b7d2), UINT64_C(0xdcc80cd2e4000000)},
+        int128::uint128_t{UINT64_C(0x33b2e3c), UINT64_C(0x9fd0803ce8000000)},
+        int128::uint128_t{UINT64_C(0x204fce5e), UINT64_C(0x3e25026110000000)},
+        int128::uint128_t{UINT64_C(0x1431e0fae), UINT64_C(0x6d7217caa0000000)},
+        int128::uint128_t{UINT64_C(0xc9f2c9cd0), UINT64_C(0x4674edea40000000)},
+        int128::uint128_t{UINT64_C(0x7e37be2022), UINT64_C(0xc0914b2680000000)},
+        int128::uint128_t{UINT64_C(0x4ee2d6d415b), UINT64_C(0x85acef8100000000)},
+        int128::uint128_t{UINT64_C(0x314dc6448d93), UINT64_C(0x38c15b0a00000000)},
+        int128::uint128_t{UINT64_C(0x1ed09bead87c0), UINT64_C(0x378d8e6400000000)},
+        int128::uint128_t{UINT64_C(0x13426172c74d82), UINT64_C(0x2b878fe800000000)},
+        int128::uint128_t{UINT64_C(0xc097ce7bc90715), UINT64_C(0xb34b9f1000000000)},
+        int128::uint128_t{UINT64_C(0x785ee10d5da46d9), UINT64_C(0xf436a000000000)},
+        int128::uint128_t{UINT64_C(0x4b3b4ca85a86c47a), UINT64_C(0x98a224000000000)}
+    };
+
+    #endif // defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA)
+
     if (x.high == UINT64_C(0))
     {
         return num_digits(x.low);
@@ -91,7 +164,7 @@ constexpr int num_digits(const int128::uint128_t& x) noexcept
     // Use the most significant bit position to approximate log10
     // log10(x) ~= log2(x) / log2(10) ~= log2(x) / 3.32
 
-    const auto msb {64 + (63 - int128::detail::impl::countl_impl(x.high))};
+    const auto msb {64 + (63 - int128::detail::countl_zero(x.high))};
     
     // Approximate log10
     const auto estimated_digits {(msb * 1000) / 3322 + 1};
@@ -112,27 +185,35 @@ constexpr int num_digits(const int128::uint128_t& x) noexcept
 
 } // namespace detail
 
-BOOST_CHARCONV_CONSTEXPR to_chars_result to_chars(char* first, char* last, const int128::uint128_t value, const int base = 10) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_CHARCONV_CONSTEXPR to_chars_result to_chars(char* first, char* last, const int128::uint128_t value, const int base = 10) noexcept
 {
+    #if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
     if (base == 10)
     {
         return detail::to_chars_128integer_impl<int128::uint128_t, int128::uint128_t>(first, last, value);
     }
 
+    #endif // !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
     return detail::to_chars_integer_impl<int128::uint128_t, int128::uint128_t>(first, last, value, base);
 }
 
-BOOST_CHARCONV_CONSTEXPR to_chars_result to_chars(char* first, char* last, const int128::int128_t value, const int base = 10) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_CHARCONV_CONSTEXPR to_chars_result to_chars(char* first, char* last, const int128::int128_t value, const int base = 10) noexcept
 {
+    #if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
     if (base == 10)
     {
         return detail::to_chars_128integer_impl<int128::int128_t, int128::uint128_t>(first, last, value);
     }
 
+    #endif // !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
     return detail::to_chars_integer_impl<int128::int128_t, int128::uint128_t>(first, last, value, base);
 }
 
-BOOST_CHARCONV_GCC5_CONSTEXPR from_chars_result from_chars(const char* first, const char* last, int128::uint128_t& value, const int base = 10) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_CHARCONV_GCC5_CONSTEXPR from_chars_result from_chars(const char* first, const char* last, int128::uint128_t& value, const int base = 10) noexcept
 {
     return detail::from_chars_integer_impl<int128::uint128_t, int128::uint128_t>(first, last, value, base);
 }
@@ -142,7 +223,7 @@ BOOST_CHARCONV_GCC5_CONSTEXPR from_chars_result from_chars(core::string_view sv,
     return detail::from_chars_integer_impl<int128::uint128_t, int128::uint128_t>(sv.data(), sv.data() + sv.size(), value, base);
 }
 
-BOOST_CHARCONV_GCC5_CONSTEXPR from_chars_result from_chars(const char* first, const char* last, int128::int128_t& value, const int base = 10) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_CHARCONV_GCC5_CONSTEXPR from_chars_result from_chars(const char* first, const char* last, int128::int128_t& value, const int base = 10) noexcept
 {
     return detail::from_chars_integer_impl<int128::int128_t, int128::uint128_t>(first, last, value, base);
 }
diff --git a/include/boost/int128/cstdlib.hpp b/include/boost/int128/cstdlib.hpp
index 87c71994..2839ac39 100644
--- a/include/boost/int128/cstdlib.hpp
+++ b/include/boost/int128/cstdlib.hpp
@@ -10,19 +10,19 @@
 namespace boost {
 namespace int128 {
 
-struct u128div_t
+BOOST_INT128_EXPORT struct u128div_t
 {
     uint128_t quot;
     uint128_t rem;
 };
 
-struct i128div_t
+BOOST_INT128_EXPORT struct i128div_t
 {
     int128_t quot;
     int128_t rem;
 };
 
-constexpr u128div_t div(const uint128_t x, const uint128_t y) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr u128div_t div(const uint128_t x, const uint128_t y) noexcept
 {
     if (BOOST_INT128_UNLIKELY(x == 0U || y == 0U))
     {
@@ -54,7 +54,7 @@ constexpr u128div_t div(const uint128_t x, const uint128_t y) noexcept
     }
 }
 
-constexpr i128div_t div(const int128_t x, const int128_t y) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr i128div_t div(const int128_t x, const int128_t y) noexcept
 {
     if (BOOST_INT128_UNLIKELY(x == 0 || y == 0))
     {
diff --git a/include/boost/int128/detail/clz.hpp b/include/boost/int128/detail/clz.hpp
index 4356c830..0ce6ad1f 100644
--- a/include/boost/int128/detail/clz.hpp
+++ b/include/boost/int128/detail/clz.hpp
@@ -20,6 +20,8 @@ namespace detail {
 
 namespace impl {
 
+#if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
 // See: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
 BOOST_INT128_INLINE_CONSTEXPR int index64[64] = {
     0, 47,  1, 56, 48, 27,  2, 60,
@@ -32,8 +34,25 @@ BOOST_INT128_INLINE_CONSTEXPR int index64[64] = {
     13, 18,  8, 12,  7,  6,  5, 63
 };
 
-constexpr int bit_scan_reverse(std::uint64_t bb) noexcept
+#endif
+
+BOOST_INT128_HOST_DEVICE constexpr int bit_scan_reverse(std::uint64_t bb) noexcept
 {
+    #if defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA)
+
+    constexpr int index64[64] = {
+        0, 47,  1, 56, 48, 27,  2, 60,
+        57, 49, 41, 37, 28, 16,  3, 61,
+        54, 58, 35, 52, 50, 42, 21, 44,
+        38, 32, 29, 23, 17, 11,  4, 62,
+        46, 55, 26, 59, 40, 36, 15, 53,
+        34, 51, 20, 43, 31, 22, 10, 45,
+        25, 39, 14, 33, 19, 30,  9, 24,
+        13, 18,  8, 12,  7,  6,  5, 63
+    };
+
+    #endif
+
     constexpr auto debruijn64 {UINT64_C(0x03f79d71b4cb0a89)};
 
     BOOST_INT128_ASSUME(bb != 0); // LCOV_EXCL_LINE
@@ -48,6 +67,8 @@ constexpr int bit_scan_reverse(std::uint64_t bb) noexcept
     return index64[(bb * debruijn64) >> 58];
 }
 
+#if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
 BOOST_INT128_INLINE_CONSTEXPR int countl_mod37[37] = {
     32, 31, 6, 30, 9, 5, 0, 29,
     16, 8, 2, 4, 21, 0, 19, 28,
@@ -56,8 +77,22 @@ BOOST_INT128_INLINE_CONSTEXPR int countl_mod37[37] = {
     27, 12, 24, 13, 14, 0
 };
 
-constexpr int backup_countl_impl(std::uint32_t x) noexcept
+#endif
+
+BOOST_INT128_HOST_DEVICE constexpr int backup_countl_impl(std::uint32_t x) noexcept
 {
+    #if defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA)
+
+    constexpr int countl_mod37[37] = {
+        32, 31, 6, 30, 9, 5, 0, 29,
+        16, 8, 2, 4, 21, 0, 19, 28,
+        25, 15, 0, 7, 10, 1, 17, 3,
+        22, 20, 26, 0, 11, 18, 23,
+        27, 12, 24, 13, 14, 0
+    };
+
+    #endif
+
     x |= x >> 1;
     x |= x >> 2;
     x |= x >> 4;
@@ -67,7 +102,7 @@ constexpr int backup_countl_impl(std::uint32_t x) noexcept
     return countl_mod37[x % 37];
 }
 
-#if BOOST_INT128_HAS_BUILTIN(__builtin_clz)
+#if BOOST_INT128_HAS_BUILTIN(__builtin_clz) && !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
 constexpr int countl_impl(unsigned int x) noexcept
 {
@@ -84,7 +119,7 @@ constexpr int countl_impl(unsigned long long x) noexcept
     return x ? __builtin_clzll(x) : std::numeric_limits<unsigned long long>::digits;
 }
 
-#elif (defined(_M_AMD64) || defined(_M_ARM64)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
+#elif (defined(_M_AMD64) || defined(_M_ARM64)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
 constexpr int countl_impl(std::uint32_t x) noexcept
 {
@@ -151,7 +186,7 @@ constexpr int countl_impl(std::uint32_t x) noexcept
     }
 }
 
-constexpr int countl_impl(std::uint64_t x) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int countl_impl(std::uint64_t x) noexcept
 {
     return x ? bit_scan_reverse(static_cast<std::uint64_t>(x)) ^ 63 : std::numeric_limits<std::uint64_t>::digits;
 }
@@ -159,12 +194,12 @@ constexpr int countl_impl(std::uint64_t x) noexcept
 #else
 
 template <typename T>
-constexpr int countl_impl(T x) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int countl_impl(T x) noexcept
 {
     return x ? bit_scan_reverse(static_cast<std::uint64_t>(x)) ^ 63 : std::numeric_limits<T>::digits;
 }
 
-constexpr int countl_impl(std::uint32_t x) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int countl_impl(std::uint32_t x) noexcept
 {
     return backup_countl_impl(x);
 }
@@ -175,7 +210,7 @@ constexpr int countl_impl(std::uint32_t x) noexcept
 } // namespace impl
 
 template <typename T>
-constexpr int countl_zero(T x) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int countl_zero(T x) noexcept
 {
     static_assert(std::numeric_limits<T>::is_integer && !std::numeric_limits<T>::is_signed,
                   "Can only count with unsigned integers");
diff --git a/include/boost/int128/detail/common_div.hpp b/include/boost/int128/detail/common_div.hpp
index 088816ca..3ad30332 100644
--- a/include/boost/int128/detail/common_div.hpp
+++ b/include/boost/int128/detail/common_div.hpp
@@ -25,7 +25,7 @@ namespace detail {
 #endif
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr void half_word_div(const T& lhs, const std::uint32_t rhs, T& quotient, T& remainder) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void half_word_div(const T& lhs, const std::uint32_t rhs, T& quotient, T& remainder) noexcept
 {
     using high_word_type = decltype(T{}.high);
 
@@ -54,11 +54,13 @@ BOOST_INT128_FORCE_INLINE constexpr void half_word_div(const T& lhs, const std::
 }
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr void half_word_div(const T& lhs, const std::uint32_t rhs, T& quotient) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void half_word_div(const T& lhs, const std::uint32_t rhs, T& quotient) noexcept
 {
+    using high_word_type = decltype(T{}.high);
+
     BOOST_INT128_ASSUME(rhs != 0); // LCOV_EXCL_LINE
 
-    quotient.high = lhs.high / rhs;
+    quotient.high = static_cast<high_word_type>(static_cast<std::uint64_t>(lhs.high) / rhs);
     auto remainder {((static_cast<std::uint64_t>(lhs.high) % rhs) << 32) | (lhs.low >> 32)};
     quotient.low = (remainder / rhs) << 32;
     remainder = ((remainder % rhs) << 32) | (lhs.low & UINT32_MAX);
@@ -73,7 +75,7 @@ namespace impl {
 #endif
 
 template <std::size_t v_size>
-BOOST_INT128_FORCE_INLINE constexpr void unpack_v(std::uint32_t (&vn)[4], const std::uint32_t (&v)[v_size],
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void unpack_v(std::uint32_t (&vn)[4], const std::uint32_t (&v)[v_size],
     const bool needs_shift, const int s, const int complement_s, const std::integral_constant<std::size_t, 2>&) noexcept
 {
     vn[1] = needs_shift ? ((v[1] << s) | (v[0] >> complement_s)) : v[1];
@@ -81,7 +83,7 @@ BOOST_INT128_FORCE_INLINE constexpr void unpack_v(std::uint32_t (&vn)[4], const
 }
 
 template <std::size_t v_size>
-BOOST_INT128_FORCE_INLINE constexpr void unpack_v(std::uint32_t (&vn)[4], const std::uint32_t (&v)[v_size],
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void unpack_v(std::uint32_t (&vn)[4], const std::uint32_t (&v)[v_size],
     const bool needs_shift, const int s, const int complement_s, const std::integral_constant<std::size_t, 4>&) noexcept
 {
     vn[3] = needs_shift ? ((v[3] << s) | (v[2] >> complement_s)) : v[3];
@@ -93,7 +95,7 @@ BOOST_INT128_FORCE_INLINE constexpr void unpack_v(std::uint32_t (&vn)[4], const
 // See: The Art of Computer Programming Volume 2 (Semi-numerical algorithms) section 4.3.1
 // Algorithm D: Division of Non-negative integers
 template <bool need_remainder, std::size_t u_size, std::size_t v_size, std::size_t q_size>
-constexpr void knuth_divide(std::uint32_t (&u)[u_size], const std::size_t m,
+BOOST_INT128_HOST_DEVICE constexpr void knuth_divide(std::uint32_t (&u)[u_size], const std::size_t m,
                             const std::uint32_t (&v)[v_size], const std::size_t n,
                             std::uint32_t (&q)[q_size]) noexcept
 {
@@ -127,7 +129,7 @@ constexpr void knuth_divide(std::uint32_t (&u)[u_size], const std::size_t m,
         while (q_hat > UINT32_MAX ||
                (q_hat * vn[n-2]) > ((r_hat << 32) | un[j+n-2]))
         {
-            q_hat--;
+            --q_hat;
             r_hat += vn[n-1];
             if (r_hat > UINT32_MAX)
             {
@@ -203,7 +205,7 @@ constexpr void knuth_divide(std::uint32_t (&u)[u_size], const std::size_t m,
 #endif
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr std::size_t to_words(const T& x, std::uint32_t (&words)[4]) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::size_t to_words(const T& x, std::uint32_t (&words)[4]) noexcept
 {
     #if !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !BOOST_INT128_ENDIAN_BIG_BYTE
     if (!BOOST_INT128_IS_CONSTANT_EVALUATED(x))
@@ -230,7 +232,7 @@ BOOST_INT128_FORCE_INLINE constexpr std::size_t to_words(const T& x, std::uint32
     return word_count;
 }
 
-BOOST_INT128_FORCE_INLINE constexpr std::size_t to_words(const std::uint64_t x, std::uint32_t (&words)[2]) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::size_t to_words(const std::uint64_t x, std::uint32_t (&words)[2]) noexcept
 {
     #if !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !BOOST_INT128_ENDIAN_BIG_BYTE
     if (!BOOST_INT128_IS_CONSTANT_EVALUATED(x))
@@ -247,7 +249,7 @@ BOOST_INT128_FORCE_INLINE constexpr std::size_t to_words(const std::uint64_t x,
     return x > UINT32_MAX ? 2 : 1;
 }
 
-BOOST_INT128_FORCE_INLINE constexpr std::size_t to_words(const std::uint32_t x, std::uint32_t (&words)[1]) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::size_t to_words(const std::uint32_t x, std::uint32_t (&words)[1]) noexcept
 {
     words[0] = x;
 
@@ -255,7 +257,7 @@ BOOST_INT128_FORCE_INLINE constexpr std::size_t to_words(const std::uint32_t x,
 }
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr T from_words(const std::uint32_t (&words)[4]) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr T from_words(const std::uint32_t (&words)[4]) noexcept
 {
     using high_word_type = decltype(T{}.high);
 
@@ -268,7 +270,7 @@ BOOST_INT128_FORCE_INLINE constexpr T from_words(const std::uint32_t (&words)[4]
 #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920
 
 template <bool needs_mod, typename T>
-constexpr T div_mod_msvc(T dividend, T divisor, T& remainder)
+BOOST_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T divisor, T& remainder)
 {
     using high_word_type = decltype(T{}.high);
 
@@ -395,7 +397,7 @@ constexpr T div_mod_msvc(T dividend, T divisor, T& remainder)
 // In the division case it is a waste of cycles
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient) noexcept
 {
     #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
 
@@ -431,7 +433,7 @@ BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::u
 }
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient, T& remainder) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient, T& remainder) noexcept
 {
     #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
 
@@ -470,13 +472,13 @@ BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::u
 }
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint32_t rhs, T& quotient, T& remainder) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint32_t rhs, T& quotient, T& remainder) noexcept
 {
     half_word_div(lhs, rhs, quotient, remainder);
 }
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint32_t rhs, T& quotient) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint32_t rhs, T& quotient) noexcept
 {
     half_word_div(lhs, rhs, quotient);
 }
@@ -488,7 +490,7 @@ BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::u
 #endif
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr T knuth_div(const T& dividend, const T& divisor) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr T knuth_div(const T& dividend, const T& divisor) noexcept
 {
     BOOST_INT128_ASSUME(divisor != static_cast<T>(0));
 
@@ -519,7 +521,7 @@ BOOST_INT128_FORCE_INLINE constexpr T knuth_div(const T& dividend, const T& divi
 }
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr T knuth_div(const T& dividend, const T& divisor, T& remainder) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr T knuth_div(const T& dividend, const T& divisor, T& remainder) noexcept
 {
     BOOST_INT128_ASSUME(divisor != static_cast<T>(0));
     
diff --git a/include/boost/int128/detail/common_mul.hpp b/include/boost/int128/detail/common_mul.hpp
index a462627c..be26c763 100644
--- a/include/boost/int128/detail/common_mul.hpp
+++ b/include/boost/int128/detail/common_mul.hpp
@@ -21,7 +21,7 @@ namespace detail {
 // See: The Art of Computer Programming Volume 2 (Semi-numerical algorithms) section 4.3.1
 // Algorithm M: Multiplication of Non-negative integers
 template <typename ReturnType, std::size_t u_size, std::size_t v_size>
-BOOST_INT128_FORCE_INLINE constexpr ReturnType knuth_multiply(const std::uint32_t (&u)[u_size],
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr ReturnType knuth_multiply(const std::uint32_t (&u)[u_size],
                                                               const std::uint32_t (&v)[v_size]) noexcept
 {
     using high_word_type = decltype(ReturnType{}.high);
@@ -59,7 +59,7 @@ BOOST_INT128_FORCE_INLINE constexpr ReturnType knuth_multiply(const std::uint32_
 }
 
 template <typename T>
-BOOST_INT128_FORCE_INLINE constexpr void to_words(const T& x, std::uint32_t (&words)[4]) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void to_words(const T& x, std::uint32_t (&words)[4]) noexcept
 {
     #ifndef BOOST_INT128_NO_CONSTEVAL_DETECTION
 
@@ -78,7 +78,7 @@ BOOST_INT128_FORCE_INLINE constexpr void to_words(const T& x, std::uint32_t (&wo
 }
 
 
-BOOST_INT128_FORCE_INLINE constexpr void to_words(const std::uint64_t x, std::uint32_t (&words)[2]) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void to_words(const std::uint64_t x, std::uint32_t (&words)[2]) noexcept
 {
     #ifndef BOOST_INT128_NO_CONSTEVAL_DETECTION
 
@@ -94,7 +94,7 @@ BOOST_INT128_FORCE_INLINE constexpr void to_words(const std::uint64_t x, std::ui
     words[1] = static_cast<std::uint32_t>(x >> 32);         // LCOV_EXCL_LINE
 }
 
-BOOST_INT128_FORCE_INLINE constexpr void to_words(const std::uint32_t x, std::uint32_t (&words)[1]) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void to_words(const std::uint32_t x, std::uint32_t (&words)[1]) noexcept
 {
     words[0] = x;
 }
diff --git a/include/boost/int128/detail/config.hpp b/include/boost/int128/detail/config.hpp
index 7fd2c8d2..40f95ee8 100644
--- a/include/boost/int128/detail/config.hpp
+++ b/include/boost/int128/detail/config.hpp
@@ -102,9 +102,9 @@ using builtin_u128 = std::_Unsigned128;
 #  define BOOST_INT128_HAS_BUILTIN_IS_CONSTANT_EVALUATED
 #endif
 
-#if defined(BOOST_INT128_HAS_IS_CONSTANT_EVALUATED)
+#if defined(BOOST_INT128_HAS_IS_CONSTANT_EVALUATED) && !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 #  define BOOST_INT128_IS_CONSTANT_EVALUATED(x) std::is_constant_evaluated()
-#elif defined(BOOST_INT128_HAS_BUILTIN_IS_CONSTANT_EVALUATED)
+#elif defined(BOOST_INT128_HAS_BUILTIN_IS_CONSTANT_EVALUATED) && !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 #  define BOOST_INT128_IS_CONSTANT_EVALUATED(x) __builtin_is_constant_evaluated()
 #else
 #  define BOOST_INT128_IS_CONSTANT_EVALUATED(x) false
@@ -285,4 +285,10 @@ using builtin_u128 = std::_Unsigned128;
 #  endif
 #endif
 
+#if defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA) // both macros must be defined for the CUDA annotations to apply
+#  define BOOST_INT128_HOST_DEVICE __host__ __device__ // annotated functions are callable from host and device code
+#else
+#  define BOOST_INT128_HOST_DEVICE // expands to nothing, leaving every annotated declaration unchanged
+#endif
+
 #endif // BOOST_INT128_DETAIL_CONFIG_HPP
diff --git a/include/boost/int128/detail/conversions.hpp b/include/boost/int128/detail/conversions.hpp
index 02319668..f471d570 100644
--- a/include/boost/int128/detail/conversions.hpp
+++ b/include/boost/int128/detail/conversions.hpp
@@ -26,62 +26,243 @@ BOOST_INT128_INLINE_CONSTEXPR bool is_valid_overload_v = valid_overload<T>::valu
 
 #if BOOST_INT128_ENDIAN_LITTLE_BYTE
 
-constexpr int128_t::int128_t(const uint128_t& v) noexcept : low {v.low}, high {static_cast<std::int64_t>(v.high)} {}
+BOOST_INT128_HOST_DEVICE constexpr int128_t::int128_t(const uint128_t& v) noexcept : low {v.low}, high {static_cast<std::int64_t>(v.high)} {}
 
-constexpr uint128_t::uint128_t(const int128_t& v) noexcept : low {v.low}, high {static_cast<std::uint64_t>(v.high)} {}
+BOOST_INT128_HOST_DEVICE constexpr uint128_t::uint128_t(const int128_t& v) noexcept : low {v.low}, high {static_cast<std::uint64_t>(v.high)} {}
 
 #else
 
-constexpr int128_t::int128_t(const uint128_t& v) noexcept : high {static_cast<std::int64_t>(v.high)}, low {v.low} {}
+BOOST_INT128_HOST_DEVICE constexpr int128_t::int128_t(const uint128_t& v) noexcept : high {static_cast<std::int64_t>(v.high)}, low {v.low} {}
 
-constexpr uint128_t::uint128_t(const int128_t& v) noexcept : high {static_cast<std::uint64_t>(v.high)}, low {v.low} {}
+BOOST_INT128_HOST_DEVICE constexpr uint128_t::uint128_t(const int128_t& v) noexcept : high {static_cast<std::uint64_t>(v.high)}, low {v.low} {}
 
 #endif // BOOST_INT128_ENDIAN_LITTLE_BYTE
 
+//=====================================
+// Conversion Operators
+//=====================================
+// Explicit conversion: high is cast to std::uint64_t (modulo 2^64) and low is carried over, so negative values wrap.
+BOOST_INT128_HOST_DEVICE constexpr int128_t::operator uint128_t() const noexcept
+{
+    return uint128_t{static_cast<std::uint64_t>(this->high), static_cast<std::uint64_t>(this->low)};
+}
+// Word-wise conversion: high is cast to std::int64_t (top bit set => negative on two's-complement targets), low is kept.
+BOOST_INT128_HOST_DEVICE constexpr uint128_t::operator int128_t() const noexcept
+{
+    return int128_t{static_cast<std::int64_t>(this->high), static_cast<std::uint64_t>(this->low)};
+}
+
 //=====================================
 // Comparison Operators
 //=====================================
 
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4127)
+#endif
+
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr bool operator==(T, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_COMPARE
+    // Default: mixed-sign == is a compile error. T and U always differ in this overload, so the assert fires on use.
     static_assert(std::is_same<T, U>::value, "Sign Compare Error, cast one type to the other for this operation");
+    static_cast<void>(lhs);
+    static_cast<void>(rhs);
     return true;
+
+    #else
+    // Opt-in: compare by value. A negative signed operand never equals the unsigned one; otherwise compare as uint128_t.
+    BOOST_INT128_IF_CONSTEXPR (std::is_same<T, int128_t>::value)
+    {
+        if (lhs < T{0})
+        {
+            return false;
+        }
+
+        return static_cast<uint128_t>(lhs) == rhs;
+    }
+    else
+    {
+        if (rhs < U{0}) // zero of rhs's own type: a same-type sign test instead of another mixed-sign call
+        {
+            return false;
+        }
+
+        return lhs == static_cast<uint128_t>(rhs);
+    }
+
+    #endif
 }
 
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr bool operator!=(T, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_COMPARE
+    // Default: mixed-sign != is a compile error. T and U always differ in this overload, so the assert fires on use.
     static_assert(std::is_same<T, U>::value, "Sign Compare Error, cast one type to the other for this operation");
+    static_cast<void>(lhs);
+    static_cast<void>(rhs);
     return true;
+
+    #else
+    // Opt-in: compare by value. A negative signed operand always differs from the unsigned one; otherwise compare as uint128_t.
+    BOOST_INT128_IF_CONSTEXPR (std::is_same<T, int128_t>::value)
+    {
+        if (lhs < T{0})
+        {
+            return true;
+        }
+
+        return static_cast<uint128_t>(lhs) != rhs;
+    }
+    else
+    {
+        if (rhs < U{0}) // zero of rhs's own type: a same-type sign test instead of another mixed-sign call
+        {
+            return true;
+        }
+
+        return lhs != static_cast<uint128_t>(rhs);
+    }
+
+    #endif
 }
 
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr bool operator<(T, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_COMPARE
+    // Default: mixed-sign < is a compile error. T and U always differ in this overload, so the assert fires on use.
     static_assert(std::is_same<T, U>::value, "Sign Compare Error, cast one type to the other for this operation");
+    static_cast<void>(lhs);
+    static_cast<void>(rhs);
     return true;
+
+    #else
+    // Opt-in: compare by value. A negative signed operand is below every unsigned value; otherwise compare as uint128_t.
+    BOOST_INT128_IF_CONSTEXPR (std::is_same<T, int128_t>::value)
+    {
+        if (lhs < T{0})
+        {
+            return true;
+        }
+
+        return static_cast<uint128_t>(lhs) < rhs;
+    }
+    else
+    {
+        if (rhs < U{0}) // zero of rhs's own type: avoids re-entering this mixed-sign operator< with swapped operands
+        {
+            return false;
+        }
+
+        return lhs < static_cast<uint128_t>(rhs);
+    }
+
+    #endif
 }
 
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr bool operator<=(T, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_COMPARE
+    // Default: mixed-sign <= is a compile error. T and U always differ in this overload, so the assert fires on use.
     static_assert(std::is_same<T, U>::value, "Sign Compare Error, cast one type to the other for this operation");
+    static_cast<void>(lhs);
+    static_cast<void>(rhs);
     return true;
+
+    #else
+    // Opt-in: compare by value. A negative signed operand is below every unsigned value; otherwise compare as uint128_t.
+    BOOST_INT128_IF_CONSTEXPR (std::is_same<T, int128_t>::value)
+    {
+        if (lhs < T{0})
+        {
+            return true;
+        }
+
+        return static_cast<uint128_t>(lhs) <= rhs;
+    }
+    else
+    {
+        if (rhs < U{0}) // zero of rhs's own type: a same-type sign test instead of another mixed-sign call
+        {
+            return false;
+        }
+
+        return lhs <= static_cast<uint128_t>(rhs);
+    }
+
+    #endif
 }
 
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr bool operator>(T, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_COMPARE
+    // Default: mixed-sign > is a compile error. T and U always differ in this overload, so the assert fires on use.
     static_assert(std::is_same<T, U>::value, "Sign Compare Error, cast one type to the other for this operation");
+    static_cast<void>(lhs);
+    static_cast<void>(rhs);
     return true;
+
+    #else
+    // Opt-in: compare by value. A negative signed operand is never above an unsigned value; otherwise compare as uint128_t.
+    BOOST_INT128_IF_CONSTEXPR (std::is_same<T, int128_t>::value)
+    {
+        if (lhs < T{0})
+        {
+            return false;
+        }
+
+        return static_cast<uint128_t>(lhs) > rhs;
+    }
+    else
+    {
+        if (rhs < U{0}) // zero of rhs's own type: a same-type sign test instead of another mixed-sign call
+        {
+            return true;
+        }
+
+        return lhs > static_cast<uint128_t>(rhs);
+    }
+
+    #endif
 }
 
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr bool operator>=(T, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_COMPARE
+    // Default: mixed-sign >= is a compile error. T and U always differ in this overload, so the assert fires on use.
     static_assert(std::is_same<T, U>::value, "Sign Compare Error, cast one type to the other for this operation");
+    static_cast<void>(lhs);
+    static_cast<void>(rhs);
     return true;
+
+    #else
+    // Opt-in: compare by value. A negative signed operand is never above an unsigned value; otherwise compare as uint128_t.
+    BOOST_INT128_IF_CONSTEXPR (std::is_same<T, int128_t>::value)
+    {
+        if (lhs < T{0})
+        {
+            return false;
+        }
+
+        return static_cast<uint128_t>(lhs) >= rhs;
+    }
+    else
+    {
+        if (rhs < U{0}) // zero of rhs's own type: a same-type sign test instead of another mixed-sign call
+        {
+            return true;
+        }
+
+        return lhs >= static_cast<uint128_t>(rhs);
+    }
+
+    #endif
 }
 
 //=====================================
@@ -89,40 +270,89 @@ constexpr bool operator>=(T, U) noexcept
 //=====================================
 
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr T operator+(T lhs, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
+    // Default: mixed-sign addition is a compile error (T and U always differ here); the return only completes the body.
     static_assert(std::is_same<T, U>::value, "Sign Conversion Error, cast one type to the other for this operation");
-    return lhs;
+    static_cast<void>(rhs);
+    return static_cast<uint128_t>(lhs);
+
+    #else
+    // Opt-in: both operands are cast to uint128_t before adding, so the result is always unsigned.
+    return static_cast<uint128_t>(lhs) + static_cast<uint128_t>(rhs);
+
+    #endif
 }
 
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr T operator-(T lhs, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
+    // Default: mixed-sign subtraction is a compile error (T and U always differ here); the return only completes the body.
     static_assert(std::is_same<T, U>::value, "Sign Conversion Error, cast one type to the other for this operation");
-    return lhs;
+    static_cast<void>(rhs);
+    return static_cast<uint128_t>(lhs);
+
+    #else
+    // Opt-in: both operands are cast to uint128_t before subtracting, so the result is always unsigned.
+    return static_cast<uint128_t>(lhs) - static_cast<uint128_t>(rhs);
+
+    #endif
 }
 
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr T operator*(T lhs, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
+    // Default: mixed-sign multiplication is a compile error (T and U always differ here); the return only completes the body.
     static_assert(std::is_same<T, U>::value, "Sign Conversion Error, cast one type to the other for this operation");
-    return lhs;
+    static_cast<void>(rhs);
+    return static_cast<uint128_t>(lhs);
+
+    #else
+    // Opt-in: both operands are cast to uint128_t before multiplying, so the result is always unsigned.
+    return static_cast<uint128_t>(lhs) * static_cast<uint128_t>(rhs);
+
+    #endif
 }
 
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr T operator/(T lhs, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
+    // Default: mixed-sign division is a compile error (T and U always differ here); the return only completes the body.
     static_assert(std::is_same<T, U>::value, "Sign Conversion Error, cast one type to the other for this operation");
-    return lhs;
+    static_cast<void>(rhs);
+    return static_cast<uint128_t>(lhs);
+
+    #else
+    // Opt-in: both operands are cast to uint128_t before dividing, so the quotient is always unsigned.
+    return static_cast<uint128_t>(lhs) / static_cast<uint128_t>(rhs);
+
+    #endif
 }
 
 template <typename T, typename U, std::enable_if_t<detail::is_valid_overload_v<T> && detail::is_valid_overload_v<U> && !std::is_same<T, U>::value, bool> = true>
-constexpr T operator%(T lhs, U) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const T lhs, const U rhs) noexcept
 {
+    #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
+    // Default: mixed-sign modulo is a compile error (T and U always differ here); the return only completes the body.
     static_assert(std::is_same<T, U>::value, "Sign Conversion Error, cast one type to the other for this operation");
-    return lhs;
+    static_cast<void>(rhs);
+    return static_cast<uint128_t>(lhs);
+
+    #else
+    // Opt-in: both operands are cast to uint128_t before taking the remainder, so the result is always unsigned.
+    return static_cast<uint128_t>(lhs) % static_cast<uint128_t>(rhs);
+
+    #endif
 }
 
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
 } // namespace int128
 } // namespace boost
 
diff --git a/include/boost/int128/detail/ctz.hpp b/include/boost/int128/detail/ctz.hpp
index eddb7c9f..2c78ddcd 100644
--- a/include/boost/int128/detail/ctz.hpp
+++ b/include/boost/int128/detail/ctz.hpp
@@ -20,7 +20,7 @@ namespace detail {
 
 namespace impl {
 
-#if BOOST_INT128_HAS_BUILTIN(__builtin_ctz)
+#if BOOST_INT128_HAS_BUILTIN(__builtin_ctz) && !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
 constexpr int countr_impl(unsigned int x) noexcept
 {
@@ -39,6 +39,8 @@ constexpr int countr_impl(unsigned long long x) noexcept
 
 #endif
 
+#if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
 BOOST_INT128_INLINE_CONSTEXPR int countr_mod37[37] = {
     32, 0, 1, 26, 2, 23, 27, 0,
     3, 16, 24, 30, 28, 11, 0, 13,
@@ -47,6 +49,8 @@ BOOST_INT128_INLINE_CONSTEXPR int countr_mod37[37] = {
     5, 20, 8, 19, 18
 };
 
+#endif
+
 #if defined(_MSC_VER) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !BOOST_INT128_HAS_BUILTIN(__builtin_ctz)
 
 #pragma warning(push)
@@ -75,15 +79,27 @@ constexpr int countr_impl(std::uint32_t x) noexcept
 
 #pragma warning(pop)
 
-#elif !BOOST_INT128_HAS_BUILTIN(__builtin_ctz)
+#elif !BOOST_INT128_HAS_BUILTIN(__builtin_ctz) || (defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
 #ifdef _MSC_VER
 #pragma warning(push)
 #pragma warning(disable : 4146) // unary minus operator applied to unsigned type, result still unsigned
 #endif
 
-constexpr int countr_impl(std::uint32_t x) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int countr_impl(std::uint32_t x) noexcept
 {
+    #if defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA)
+    // CUDA builds use this function-local copy of the table; the namespace-scope countr_mod37 is compiled out for them.
+    constexpr int countr_mod37[37] = {
+        32, 0, 1, 26, 2, 23, 27, 0,
+        3, 16, 24, 30, 28, 11, 0, 13,
+        4, 7, 17, 0, 25, 22, 31, 15,
+        29, 10, 12, 6, 0, 21, 14, 9,
+        5, 20, 8, 19, 18
+    };
+
+    #endif
+    // (-x & x) keeps only the lowest set bit; its value modulo 37 indexes the bit position (x == 0 hits slot 0 -> 32).
     return countr_mod37[(-x & x) % 37];
 }
 
@@ -93,7 +109,7 @@ constexpr int countr_impl(std::uint32_t x) noexcept
 
 #endif
 
-#if (defined(_M_AMD64) || defined(_M_ARM64)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !BOOST_INT128_HAS_BUILTIN(__builtin_ctz)
+#if (defined(_M_AMD64) || defined(_M_ARM64)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) && !BOOST_INT128_HAS_BUILTIN(__builtin_ctz) && !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
 constexpr int countr_impl(std::uint64_t x) noexcept
 {
@@ -116,9 +132,9 @@ constexpr int countr_impl(std::uint64_t x) noexcept
     }
 }
 
-#elif !BOOST_INT128_HAS_BUILTIN(__builtin_ctz)
+#elif !BOOST_INT128_HAS_BUILTIN(__builtin_ctz) || (defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
-constexpr int countr_impl(std::uint64_t x) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int countr_impl(std::uint64_t x) noexcept
 {
     return static_cast<std::uint32_t>(x) != 0 ? countr_impl(static_cast<std::uint32_t>(x)) :
                                                 countr_impl(static_cast<std::uint32_t>(x >> 32)) + 32;
@@ -129,7 +145,7 @@ constexpr int countr_impl(std::uint64_t x) noexcept
 } // namespace impl
 
 template <typename T>
-constexpr int countr_zero(T x) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int countr_zero(T x) noexcept
 {
     static_assert(std::numeric_limits<T>::is_integer && !std::numeric_limits<T>::is_signed,
                   "Can only count with unsigned integers");
diff --git a/include/boost/int128/detail/int128_imp.hpp b/include/boost/int128/detail/int128_imp.hpp
index 1a4dcc5a..a9bdd9ca 100644
--- a/include/boost/int128/detail/int128_imp.hpp
+++ b/include/boost/int128/detail/int128_imp.hpp
@@ -55,205 +55,210 @@ int128_t
     constexpr int128_t& operator=(const int128_t&) noexcept = default;
     constexpr int128_t& operator=(int128_t&&) noexcept = default;
 
-    // Requires conversion file to be implemented
-    constexpr int128_t(const uint128_t& v) noexcept;
+    // Requires a conversion file to be implemented
+    BOOST_INT128_HOST_DEVICE explicit constexpr int128_t(const uint128_t& v) noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator uint128_t() const noexcept;
 
     // Construct from integral types
     #if BOOST_INT128_ENDIAN_LITTLE_BYTE
 
-    constexpr int128_t(const std::int64_t hi, const std::uint64_t lo) noexcept : low{lo}, high{hi} {}
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const std::int64_t hi, const std::uint64_t lo) noexcept : low{lo}, high{hi} {}
 
     template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-    constexpr int128_t(const SignedInteger v) noexcept : low {static_cast<std::uint64_t>(v)}, high {v < 0 ? -1 : 0} {}
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const SignedInteger v) noexcept : low {static_cast<std::uint64_t>(v)}, high {v < 0 ? -1 : 0} {}
 
     template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-    constexpr int128_t(const UnsignedInteger v) noexcept : low {static_cast<std::uint64_t>(v)}, high {} {}
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const UnsignedInteger v) noexcept : low {static_cast<std::uint64_t>(v)}, high {} {}
 
     #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-    BOOST_INT128_BUILTIN_CONSTEXPR int128_t(const detail::builtin_i128 v) noexcept : low {static_cast<std::uint64_t>(v & static_cast<detail::builtin_i128>(detail::low_word_mask))}, high {static_cast<std::int64_t>(v >> static_cast<detail::builtin_i128>(64U))} {}
-    BOOST_INT128_BUILTIN_CONSTEXPR int128_t(const detail::builtin_u128 v) noexcept : low {static_cast<std::uint64_t>(v & static_cast<detail::builtin_u128>(detail::low_word_mask))}, high {static_cast<std::int64_t>(v >> static_cast<detail::builtin_u128>(64U))} {}
+    BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t(const detail::builtin_i128 v) noexcept : low {static_cast<std::uint64_t>(v & static_cast<detail::builtin_i128>(detail::low_word_mask))}, high {static_cast<std::int64_t>(v >> static_cast<detail::builtin_i128>(64U))} {}
+    BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t(const detail::builtin_u128 v) noexcept : low {static_cast<std::uint64_t>(v & static_cast<detail::builtin_u128>(detail::low_word_mask))}, high {static_cast<std::int64_t>(v >> static_cast<detail::builtin_u128>(64U))} {}
 
     #endif // BOOST_INT128_HAS_INT128
 
     #else // Big endian
 
-    constexpr int128_t(const std::int64_t hi, const std::uint64_t lo) noexcept : high{hi}, low{lo} {}
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const std::int64_t hi, const std::uint64_t lo) noexcept : high{hi}, low{lo} {}
 
     template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-    constexpr int128_t(const SignedInteger v) noexcept : high{v < 0 ? -1 : 0}, low{static_cast<std::uint64_t>(v)} {}
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const SignedInteger v) noexcept : high{v < 0 ? -1 : 0}, low{static_cast<std::uint64_t>(v)} {}
 
     template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-    constexpr int128_t(const UnsignedInteger v) noexcept : high {}, low {static_cast<std::uint64_t>(v)} {}
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const UnsignedInteger v) noexcept : high {}, low {static_cast<std::uint64_t>(v)} {}
 
     #ifdef BOOST_INT128_HAS_INT128
 
-    constexpr int128_t(const detail::builtin_i128 v) noexcept : high {static_cast<std::int64_t>(v >> 64U)}, low {static_cast<std::uint64_t>(v & detail::low_word_mask)} {}
-    constexpr int128_t(const detail::builtin_u128 v) noexcept : high {static_cast<std::int64_t>(v >> 64U)}, low {static_cast<std::uint64_t>(v & detail::low_word_mask)} {}
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const detail::builtin_i128 v) noexcept : high {static_cast<std::int64_t>(v >> 64U)}, low {static_cast<std::uint64_t>(v & detail::low_word_mask)} {}
+    BOOST_INT128_HOST_DEVICE constexpr int128_t(const detail::builtin_u128 v) noexcept : high {static_cast<std::int64_t>(v >> 64U)}, low {static_cast<std::uint64_t>(v & detail::low_word_mask)} {}
 
     #endif // BOOST_INT128_HAS_INT128
 
     #endif // BOOST_INT128_ENDIAN_LITTLE_BYTE
 
     // Integer Conversion operators
-    explicit constexpr operator bool() const noexcept { return low || high; }
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator bool() const noexcept { return low || high; }
 
     template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-    explicit constexpr operator SignedInteger() const noexcept { return static_cast<SignedInteger>(low); }
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator SignedInteger() const noexcept { return static_cast<SignedInteger>(low); }
 
     template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-    explicit constexpr operator UnsignedInteger() const noexcept { return static_cast<UnsignedInteger>(low); }
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator UnsignedInteger() const noexcept { return static_cast<UnsignedInteger>(low); }
 
     #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-    explicit BOOST_INT128_BUILTIN_CONSTEXPR operator detail::builtin_i128() const noexcept { return static_cast<detail::builtin_i128>(static_cast<detail::builtin_u128>(high) << static_cast<detail::builtin_u128>(64)) | static_cast<detail::builtin_i128>(low); }
+    BOOST_INT128_HOST_DEVICE explicit BOOST_INT128_BUILTIN_CONSTEXPR operator detail::builtin_i128() const noexcept { return static_cast<detail::builtin_i128>(static_cast<detail::builtin_u128>(high) << static_cast<detail::builtin_u128>(64)) | static_cast<detail::builtin_i128>(low); }
 
-    explicit BOOST_INT128_BUILTIN_CONSTEXPR operator detail::builtin_u128() const noexcept { return (static_cast<detail::builtin_u128>(high) << static_cast<detail::builtin_u128>(64)) | static_cast<detail::builtin_u128>(low); }
+    BOOST_INT128_HOST_DEVICE explicit BOOST_INT128_BUILTIN_CONSTEXPR operator detail::builtin_u128() const noexcept { return (static_cast<detail::builtin_u128>(high) << static_cast<detail::builtin_u128>(64)) | static_cast<detail::builtin_u128>(low); }
 
     #endif // BOOST_INT128_HAS_INT128
 
     // Conversion to float
     // This is basically the same as ldexp(static_cast<T>(high), 64) + static_cast<T>(low),
     // but can be constexpr at C++11 instead of C++26
-    explicit constexpr operator float() const noexcept;
-    explicit constexpr operator double() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator float() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator double() const noexcept;
+
+    // Long double does not exist on device
+    #if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
     explicit constexpr operator long double() const noexcept;
+    #endif
 
     // Compound Or
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr int128_t& operator|=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator|=(Integer rhs) noexcept;
 
-    constexpr int128_t& operator|=(int128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator|=(int128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline int128_t& operator|=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline int128_t& operator|=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound And
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr int128_t& operator&=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator&=(Integer rhs) noexcept;
 
-    constexpr int128_t& operator&=(int128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator&=(int128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline int128_t& operator&=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline int128_t& operator&=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound XOR
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr int128_t& operator^=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator^=(Integer rhs) noexcept;
 
-    constexpr int128_t& operator^=(int128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator^=(int128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline int128_t& operator^=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline int128_t& operator^=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Left Shift
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr int128_t& operator<<=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator<<=(Integer rhs) noexcept;
 
-    constexpr int128_t& operator<<=(int128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator<<=(int128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline int128_t& operator<<=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline int128_t& operator<<=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Right Shift
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr int128_t& operator>>=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator>>=(Integer rhs) noexcept;
 
-    constexpr int128_t& operator>>=(int128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator>>=(int128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline int128_t& operator>>=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline int128_t& operator>>=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Prefix and postfix increment
-    constexpr int128_t& operator++() noexcept;
-    constexpr int128_t operator++(int) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator++() noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t operator++(int) noexcept;
 
     // Prefix and postfix decrment
-    constexpr int128_t& operator--() noexcept;
-    constexpr int128_t operator--(int) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator--() noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t operator--(int) noexcept;
 
     // Compound Addition
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr int128_t& operator+=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator+=(Integer rhs) noexcept;
 
-    constexpr int128_t& operator+=(int128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator+=(int128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline int128_t& operator+=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline int128_t& operator+=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Subtraction
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr int128_t& operator-=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator-=(Integer rhs) noexcept;
 
-    constexpr int128_t& operator-=(int128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator-=(int128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline int128_t& operator-=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline int128_t& operator-=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Multiplication
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr int128_t& operator*=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator*=(Integer rhs) noexcept;
 
-    constexpr int128_t& operator*=(int128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator*=(int128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline int128_t& operator*=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline int128_t& operator*=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Division
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr int128_t& operator/=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator/=(Integer rhs) noexcept;
 
-    constexpr int128_t& operator/=(int128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator/=(int128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline int128_t& operator/=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline int128_t& operator/=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Modulo
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr int128_t& operator%=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator%=(Integer rhs) noexcept;
 
-    constexpr int128_t& operator%=(int128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr int128_t& operator%=(int128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline int128_t& operator%=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline int128_t& operator%=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 };
@@ -262,7 +267,7 @@ int128_t
 // Absolute Value function
 //=====================================
 
-BOOST_INT128_EXPORT constexpr int128_t abs(int128_t value) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t abs(int128_t value) noexcept
 {
     if (value.high < 0)
     {
@@ -282,33 +287,37 @@ BOOST_INT128_EXPORT constexpr int128_t abs(int128_t value) noexcept
 // by 0xFFFFFFFF in order to generally replicate what ldexp is doing in the constexpr context.
 // We also avoid pulling in <quadmath.h> for the __float128 case where we would need ldexpq
 
-constexpr int128_t::operator float() const noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t::operator float() const noexcept
 {
     return static_cast<float>(high) * detail::offset_value_v<float> + static_cast<float>(low);
 }
 
-constexpr int128_t::operator double() const noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t::operator double() const noexcept
 {
     return static_cast<double>(high) * detail::offset_value_v<double> + static_cast<double>(low);
 }
 
+#if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
 constexpr int128_t::operator long double() const noexcept
 {
     return static_cast<long double>(high) * detail::offset_value_v<long double> + static_cast<long double>(low);
 }
 
+#endif
+
 //=====================================
 // Unary Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr int128_t operator+(const int128_t value) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator+(const int128_t value) noexcept
 {
     return value;
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator-(const int128_t value) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator-(const int128_t value) noexcept
 {
-    return (value.low == 0) ? int128_t{-value.high, 0} :
+    return (value.low == 0) ? int128_t{static_cast<std::int64_t>(0ULL - static_cast<std::uint64_t>(value.high)), 0} :
                               int128_t{~value.high, ~value.low + 1};
 }
 
@@ -316,12 +325,12 @@ BOOST_INT128_EXPORT constexpr int128_t operator-(const int128_t value) noexcept
 // Equality Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr bool operator==(const int128_t lhs, const bool rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator==(const int128_t lhs, const bool rhs) noexcept
 {
     return lhs.high == 0 && lhs.low == static_cast<std::uint64_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr bool operator==(const bool lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator==(const bool lhs, const int128_t rhs) noexcept
 {
     return rhs.high == 0 && rhs.low == static_cast<std::uint64_t>(lhs);
 }
@@ -336,7 +345,7 @@ BOOST_INT128_EXPORT constexpr bool operator==(const bool lhs, const int128_t rhs
 #  pragma GCC diagnostic ignored "-Wsign-compare"
 #endif
 
-BOOST_INT128_EXPORT constexpr bool operator==(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator==(const int128_t lhs, const int128_t rhs) noexcept
 {
     // x64 and ARM64 like the values in opposite directions
 
@@ -352,19 +361,19 @@ BOOST_INT128_EXPORT constexpr bool operator==(const int128_t lhs, const int128_t
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator==(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return lhs.high == (rhs < 0 ? -1 : 0) && lhs.low == static_cast<std::uint64_t>(rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator==(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return rhs.high == (lhs < 0 ? -1 : 0) && rhs.low == static_cast<std::uint64_t>(lhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator==(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -381,7 +390,7 @@ constexpr bool operator==(const int128_t lhs, const UnsignedInteger rhs) noexcep
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator==(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -399,24 +408,24 @@ constexpr bool operator==(const UnsignedInteger lhs, const int128_t rhs) noexcep
 
 #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs == static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) == rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs.high < 0 ? false : lhs == static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return rhs.high < 0 ? false : static_cast<int128_t>(lhs) == rhs;
 }
@@ -424,14 +433,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail:
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
@@ -445,7 +454,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const T, const int128_t) noexcept
 // Inequality Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr bool operator!=(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const int128_t lhs, const int128_t rhs) noexcept
 {
     // x64 and ARM64 like the values in opposite directions
 
@@ -477,30 +486,30 @@ BOOST_INT128_EXPORT constexpr bool operator!=(const int128_t lhs, const int128_t
     #endif
 }
 
-BOOST_INT128_EXPORT constexpr bool operator!=(const int128_t lhs, const bool rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const int128_t lhs, const bool rhs) noexcept
 {
     return lhs.high != 0 || lhs.low != static_cast<std::uint64_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr bool operator!=(const bool lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const bool lhs, const int128_t rhs) noexcept
 {
     return rhs.high != 0 || rhs.low != static_cast<std::uint64_t>(lhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator!=(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return lhs.high != (rhs < 0 ? -1 : 0) || lhs.low != static_cast<std::uint64_t>(rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator!=(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return rhs.high != (lhs < 0 ? -1 : 0) || rhs.low != static_cast<std::uint64_t>(lhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator!=(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -517,7 +526,7 @@ constexpr bool operator!=(const int128_t lhs, const UnsignedInteger rhs) noexcep
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator!=(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -535,24 +544,24 @@ constexpr bool operator!=(const UnsignedInteger lhs, const int128_t rhs) noexcep
 
 #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs != static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) != rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs.high < 0 ? true : lhs != static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return rhs.high < 0 ? true : static_cast<int128_t>(lhs) != rhs;
 }
@@ -560,14 +569,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail:
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
@@ -581,7 +590,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const T, const int128_t) noexcept
 // Less than Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr bool operator<(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator<(const int128_t lhs, const int128_t rhs) noexcept
 {
     // On ARM macs only with the clang compiler is casting to __int128 uniformly better (and seemingly cost free)
     #if defined(__aarch64__) && defined(__APPLE__) && defined(__clang__) && defined(BOOST_INT128_HAS_INT128)
@@ -613,7 +622,7 @@ BOOST_INT128_EXPORT constexpr bool operator<(const int128_t lhs, const int128_t
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator<(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -630,7 +639,7 @@ constexpr bool operator<(const int128_t lhs, const UnsignedInteger rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator<(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -647,7 +656,7 @@ constexpr bool operator<(const UnsignedInteger lhs, const int128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator<(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     if (lhs.high < 0)
     {
@@ -663,7 +672,7 @@ constexpr bool operator<(const int128_t lhs, const SignedInteger rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator<(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     if (rhs.high < 0)
     {
@@ -681,24 +690,24 @@ constexpr bool operator<(const SignedInteger lhs, const int128_t rhs) noexcept
 
 #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs < static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) < rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs.high < 0 ? false : lhs < static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return rhs.high < 0 ? true : static_cast<int128_t>(lhs) < rhs;
 }
@@ -706,14 +715,14 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::builtin_u128 lhs, co
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
@@ -727,7 +736,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const T, const int128_t) noexcept
 // Greater than Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr bool operator>(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator>(const int128_t lhs, const int128_t rhs) noexcept
 {
     // On ARM macs only with the clang compiler is casting to __int128 uniformly better (and seemingly cost free)
     #if defined(__aarch64__) && defined(__APPLE__) && defined(__clang__) && defined(BOOST_INT128_HAS_INT128)
@@ -759,19 +768,19 @@ BOOST_INT128_EXPORT constexpr bool operator>(const int128_t lhs, const int128_t
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator>(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return !(lhs < rhs) && !(lhs == rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator>(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return !(lhs < rhs) && !(lhs == rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator>(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -788,7 +797,7 @@ constexpr bool operator>(const int128_t lhs, const UnsignedInteger rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator>(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -806,24 +815,24 @@ constexpr bool operator>(const UnsignedInteger lhs, const int128_t rhs) noexcept
 
 #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs > static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) > rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs.high < 0 ? false : lhs > static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return rhs.high < 0 ? true : static_cast<int128_t>(lhs) > rhs;
 }
@@ -831,14 +840,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const detail::
-#else // BOOST_INT128_ALLOW_SIGN_CONVERSION
+#else // BOOST_INT128_ALLOW_SIGN_COMPARE
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
@@ -852,7 +861,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const T, const int128_t) noexcept
 // Less Equal Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr bool operator<=(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const int128_t lhs, const int128_t rhs) noexcept
 {
     // On ARM macs only with the clang compiler is casting to __int128 uniformly better (and seemingly cost free)
     #if defined(__aarch64__) && defined(__APPLE__) && defined(__clang__) && defined(BOOST_INT128_HAS_INT128)
@@ -884,19 +893,19 @@ BOOST_INT128_EXPORT constexpr bool operator<=(const int128_t lhs, const int128_t
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator<=(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return !(lhs > rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator<=(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return !(lhs > rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator<=(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -913,7 +922,7 @@ constexpr bool operator<=(const int128_t lhs, const UnsignedInteger rhs) noexcep
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator<=(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -931,24 +940,24 @@ constexpr bool operator<=(const UnsignedInteger lhs, const int128_t rhs) noexcep
 
 #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs <= static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) <= rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs.high < 0 ? true : lhs <= static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return rhs.high < 0 ? false : static_cast<int128_t>(lhs) <= rhs;
 }
@@ -956,14 +965,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const detail:
-#else // BOOST_INT128_ALLOW_SIGN_CONVERSION
+#else // BOOST_INT128_ALLOW_SIGN_COMPARE
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
@@ -977,7 +986,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const T, const int128_t) noexcept
 // Greater Equal Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr bool operator>=(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const int128_t lhs, const int128_t rhs) noexcept
 {
     // On ARM macs only with the clang compiler is casting to __int128 uniformly better (and seemingly cost free)
     #if defined(__aarch64__) && defined(__APPLE__) && defined(__clang__) && defined(BOOST_INT128_HAS_INT128)
@@ -1009,19 +1018,19 @@ BOOST_INT128_EXPORT constexpr bool operator>=(const int128_t lhs, const int128_t
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator>=(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return !(lhs < rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator>=(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return !(lhs < rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator>=(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -1038,7 +1047,7 @@ constexpr bool operator>=(const int128_t lhs, const UnsignedInteger rhs) noexcep
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator>=(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -1056,24 +1065,24 @@ constexpr bool operator>=(const UnsignedInteger lhs, const int128_t rhs) noexcep
 
 #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs >= static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) >= rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs.high < 0 ? false : lhs >= static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return rhs.high < 0 ? true : static_cast<int128_t>(lhs) >= rhs;
 }
@@ -1081,14 +1090,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const detail:
-#else // BOOST_INT128_ALLOW_SIGN_CONVERSION
+#else // BOOST_INT128_ALLOW_SIGN_COMPARE
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return true;
@@ -1104,7 +1113,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const T, const int128_t) noexcept
 
 #ifdef BOOST_INT128_HAS_SPACESHIP_OPERATOR
 
-BOOST_INT128_EXPORT constexpr std::strong_ordering operator<=>(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const int128_t lhs, const int128_t rhs) noexcept
 {
     if (lhs < rhs)
     {
@@ -1121,7 +1130,7 @@ BOOST_INT128_EXPORT constexpr std::strong_ordering operator<=>(const int128_t lh
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr std::strong_ordering operator<=>(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     if (lhs < rhs)
     {
@@ -1138,7 +1147,7 @@ constexpr std::strong_ordering operator<=>(const int128_t lhs, const SignedInteg
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr std::strong_ordering operator<=>(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     if (lhs < rhs)
     {
@@ -1155,7 +1164,7 @@ constexpr std::strong_ordering operator<=>(const SignedInteger lhs, const int128
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr std::strong_ordering operator<=>(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -1183,7 +1192,7 @@ constexpr std::strong_ordering operator<=>(const int128_t lhs, const UnsignedInt
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr std::strong_ordering operator<=>(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -1216,7 +1225,7 @@ constexpr std::strong_ordering operator<=>(const UnsignedInteger lhs, const int1
 // Not Operator
 //=====================================
 
-BOOST_INT128_EXPORT constexpr int128_t operator~(const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator~(const int128_t rhs) noexcept
 {
     return {~rhs.high, ~rhs.low};
 }
@@ -1225,25 +1234,25 @@ BOOST_INT128_EXPORT constexpr int128_t operator~(const int128_t rhs) noexcept
 // Or Operator
 //=====================================
 
-BOOST_INT128_EXPORT constexpr int128_t operator|(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const int128_t lhs, const int128_t rhs) noexcept
 {
     return {lhs.high | rhs.high, lhs.low | rhs.low};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator|(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return {lhs.high | (rhs < 0 ? -1 : 0), lhs.low | static_cast<std::uint64_t>(rhs)};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator|(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return {rhs.high | (lhs < 0 ? -1 : 0), static_cast<std::uint64_t>(lhs) | rhs.low};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator|(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1260,7 +1269,7 @@ constexpr int128_t operator|(const int128_t lhs, const UnsignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator|(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1278,24 +1287,24 @@ constexpr int128_t operator|(const UnsignedInteger lhs, const int128_t rhs) noex
 
 #ifdef BOOST_INT128_HAS_INT128
 
-BOOST_INT128_EXPORT constexpr int128_t operator|(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs | static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator|(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) | rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr int128_t operator|(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs | static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator|(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) | rhs;
 }
@@ -1303,14 +1312,14 @@ BOOST_INT128_EXPORT constexpr int128_t operator|(const detail::builtin_u128 lhs,
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator|(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator|(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator|(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
@@ -1325,7 +1334,7 @@ constexpr int128_t operator|(const T, const int128_t) noexcept
 //=====================================
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr int128_t& int128_t::operator|=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator|=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_signed_integer_v<Integer>, "Sign Conversion Error");
@@ -1335,7 +1344,7 @@ constexpr int128_t& int128_t::operator|=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr int128_t& int128_t::operator|=(const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator|=(const int128_t rhs) noexcept
 {
     *this = *this | rhs;
     return *this;
@@ -1344,7 +1353,7 @@ constexpr int128_t& int128_t::operator|=(const int128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline int128_t& int128_t::operator|=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator|=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -1360,25 +1369,25 @@ inline int128_t& int128_t::operator|=(const Integer rhs) noexcept
 // And Operator
 //=====================================
 
-BOOST_INT128_EXPORT constexpr int128_t operator&(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const int128_t lhs, const int128_t rhs) noexcept
 {
     return {lhs.high & rhs.high, lhs.low & rhs.low};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator&(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return {lhs.high & (rhs < 0 ? -1 : 0), lhs.low & static_cast<std::uint64_t>(rhs)};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator&(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return {rhs.high & (lhs < 0 ? -1 : 0), static_cast<std::uint64_t>(lhs) & rhs.low};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator&(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1395,7 +1404,7 @@ constexpr int128_t operator&(const int128_t lhs, const UnsignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator&(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1413,24 +1422,24 @@ constexpr int128_t operator&(const UnsignedInteger lhs, const int128_t rhs) noex
 
 #ifdef BOOST_INT128_HAS_INT128
 
-BOOST_INT128_EXPORT constexpr int128_t operator&(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs & static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator&(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) & rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr int128_t operator&(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs & static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator&(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) & rhs;
 }
@@ -1438,14 +1447,14 @@ BOOST_INT128_EXPORT constexpr int128_t operator&(const detail::builtin_u128 lhs,
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator&(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator&(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator&(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
@@ -1458,7 +1467,7 @@ constexpr int128_t operator&(const T, const int128_t) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline int128_t& int128_t::operator&=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator&=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -1475,7 +1484,7 @@ inline int128_t& int128_t::operator&=(const Integer rhs) noexcept
 //=====================================
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr int128_t& int128_t::operator&=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator&=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_signed_integer_v<Integer>, "Sign Conversion Error");
@@ -1485,7 +1494,7 @@ constexpr int128_t& int128_t::operator&=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr int128_t& int128_t::operator&=(const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator&=(const int128_t rhs) noexcept
 {
     *this = *this & rhs;
     return *this;
@@ -1495,25 +1504,25 @@ constexpr int128_t& int128_t::operator&=(const int128_t rhs) noexcept
 // XOR Operator
 //=====================================
 
-BOOST_INT128_EXPORT constexpr int128_t operator^(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const int128_t lhs, const int128_t rhs) noexcept
 {
     return {lhs.high ^ rhs.high, lhs.low ^ rhs.low};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator^(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return {lhs.high ^ (rhs < 0 ? -1 : 0), lhs.low ^ static_cast<std::uint64_t>(rhs)};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator^(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return {rhs.high ^ (lhs < 0 ? -1 : 0), static_cast<std::uint64_t>(lhs) ^ rhs.low};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator^(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1530,7 +1539,7 @@ constexpr int128_t operator^(const int128_t lhs, const UnsignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator^(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1541,31 +1550,31 @@ constexpr int128_t operator^(const UnsignedInteger lhs, const int128_t rhs) noex
     static_assert(detail::is_signed_integer_v<UnsignedInteger>, "Sign Conversion Error");
     static_cast<void>(lhs);
     static_cast<void>(rhs);
-    return true;
+    return int128_t{};
 
     #endif
 }
 
 #ifdef BOOST_INT128_HAS_INT128
 
-BOOST_INT128_EXPORT constexpr int128_t operator^(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs ^ static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator^(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) ^ rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr int128_t operator^(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs ^ static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator^(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) ^ rhs;
 }
@@ -1573,14 +1582,14 @@ BOOST_INT128_EXPORT constexpr int128_t operator^(const detail::builtin_u128 lhs,
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator^(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator^(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator^(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
@@ -1595,7 +1604,7 @@ constexpr int128_t operator^(const T, const int128_t) noexcept
 //=====================================
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr int128_t& int128_t::operator^=(Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator^=(Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_signed_integer_v<Integer>, "Sign Conversion Error");
@@ -1605,7 +1614,7 @@ constexpr int128_t& int128_t::operator^=(Integer rhs) noexcept
     return *this;
 }
 
-constexpr int128_t& int128_t::operator^=(int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator^=(int128_t rhs) noexcept
 {
     *this = *this ^ rhs;
     return *this;
@@ -1614,7 +1623,7 @@ constexpr int128_t& int128_t::operator^=(int128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline int128_t& int128_t::operator^=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator^=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -1633,13 +1642,23 @@ inline int128_t& int128_t::operator^=(const Integer rhs) noexcept
 namespace detail {
 
 template <typename Integer>
-constexpr int128_t default_ls_impl(const int128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t default_ls_impl(const int128_t lhs, const Integer rhs) noexcept
 {
     static_assert(std::is_integral<Integer>::value, "Only builtin types allowed");
 
-    if (rhs < 0 || rhs >= 128)
+    BOOST_INT128_IF_CONSTEXPR (std::numeric_limits<Integer>::is_signed)
     {
-        return {0, 0};
+        if (rhs < 0 || rhs >= 128)
+        {
+            return {0, 0};
+        }
+    }
+    else
+    {
+        if (rhs >= 128)
+        {
+            return {0, 0};
+        }
     }
 
     if (rhs == 0)
@@ -1668,11 +1687,21 @@ constexpr int128_t default_ls_impl(const int128_t lhs, const Integer rhs) noexce
 }
 
 template <typename Integer>
-int128_t intrinsic_ls_impl(const int128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE int128_t intrinsic_ls_impl(const int128_t lhs, const Integer rhs) noexcept
 {
-    if (BOOST_INT128_UNLIKELY(rhs >= 128 || rhs < 0))
+    BOOST_INT128_IF_CONSTEXPR (std::numeric_limits<Integer>::is_signed)
     {
-        return {0, 0};
+        if (BOOST_INT128_UNLIKELY(rhs >= 128 || rhs < 0))
+        {
+            return {0, 0};
+        }
+    }
+    else
+    {
+        if (BOOST_INT128_UNLIKELY(rhs >= 128))
+        {
+            return {0, 0};
+        }
     }
 
     #ifdef BOOST_INT128_HAS_INT128
@@ -1748,7 +1777,7 @@ int128_t intrinsic_ls_impl(const int128_t lhs, const Integer rhs) noexcept
 } // namespace detail
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-constexpr int128_t operator<<(const int128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator<<(const int128_t lhs, const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_NO_CONSTEVAL_DETECTION
 
@@ -1768,7 +1797,7 @@ constexpr int128_t operator<<(const int128_t lhs, const Integer rhs) noexcept
     #endif
 }
 
-constexpr int128_t operator<<(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator<<(const int128_t lhs, const int128_t rhs) noexcept
 {
     if (rhs.high != 0 || rhs.low >= 128)
     {
@@ -1780,7 +1809,7 @@ constexpr int128_t operator<<(const int128_t lhs, const int128_t rhs) noexcept
 
 #ifdef BOOST_INT128_HAS_INT128
 
-BOOST_INT128_EXPORT constexpr detail::builtin_u128 operator<<(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr detail::builtin_u128 operator<<(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(detail::builtin_u128) * 8};
 
@@ -1792,7 +1821,7 @@ BOOST_INT128_EXPORT constexpr detail::builtin_u128 operator<<(const detail::buil
     return lhs << rhs.low;
 }
 
-BOOST_INT128_EXPORT constexpr detail::builtin_i128 operator<<(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr detail::builtin_i128 operator<<(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(detail::builtin_i128) * 8};
 
@@ -1807,7 +1836,7 @@ BOOST_INT128_EXPORT constexpr detail::builtin_i128 operator<<(const detail::buil
 #endif
 
 BOOST_INT128_EXPORT template <typename SignedInteger, std::enable_if_t<detail::is_signed_integer_v<SignedInteger> && (sizeof(SignedInteger) * 8 <= 16), bool> = true>
-constexpr int operator<<(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int operator<<(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(SignedInteger) * 8};
 
@@ -1820,7 +1849,7 @@ constexpr int operator<<(const SignedInteger lhs, const int128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <typename UnsignedInteger, std::enable_if_t<detail::is_unsigned_integer_v<UnsignedInteger> && (sizeof(UnsignedInteger) * 8 <= 16), bool> = true>
-constexpr unsigned operator<<(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr unsigned operator<<(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(UnsignedInteger) * 8};
 
@@ -1838,13 +1867,13 @@ constexpr unsigned operator<<(const UnsignedInteger lhs, const int128_t rhs) noe
 #endif // _MSC_VER
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr int128_t& int128_t::operator<<=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator<<=(const Integer rhs) noexcept
 {
     *this = *this << rhs;
     return *this;
 }
 
-constexpr int128_t& int128_t::operator<<=(const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator<<=(const int128_t rhs) noexcept
 {
     *this = *this << rhs;
     return *this;
@@ -1853,7 +1882,7 @@ constexpr int128_t& int128_t::operator<<=(const int128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline int128_t& int128_t::operator<<=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator<<=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -1876,11 +1905,21 @@ inline int128_t& int128_t::operator<<=(const Integer rhs) noexcept
 namespace detail {
 
 template <typename Integer>
-constexpr int128_t default_rs_impl(const int128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t default_rs_impl(const int128_t lhs, const Integer rhs) noexcept
 {
-    if (rhs >= 128 || rhs < 0 )
+    BOOST_INT128_IF_CONSTEXPR (std::numeric_limits<Integer>::is_signed)
     {
-        return lhs.high < 0 ? int128_t{-1, UINT64_MAX} : int128_t{0, 0};
+        if (rhs >= 128 || rhs < 0)
+        {
+            return lhs.high < 0 ? int128_t{-1, UINT64_MAX} : int128_t{0, 0};
+        }
+    }
+    else
+    {
+        if (rhs >= 128)
+        {
+            return lhs.high < 0 ? int128_t{-1, UINT64_MAX} : int128_t{0, 0};
+        }
     }
 
     if (rhs == 0)
@@ -1905,11 +1944,21 @@ constexpr int128_t default_rs_impl(const int128_t lhs, const Integer rhs) noexce
 }
 
 template <typename Integer>
-int128_t intrinsic_rs_impl(const int128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE int128_t intrinsic_rs_impl(const int128_t lhs, const Integer rhs) noexcept
 {
-    if (BOOST_INT128_UNLIKELY(rhs >= 128 || rhs < 0))
+    BOOST_INT128_IF_CONSTEXPR (std::numeric_limits<Integer>::is_signed)
     {
-        return {0, 0};
+        if (rhs >= 128 || rhs < 0)
+        {
+            return lhs.high < 0 ? int128_t{-1, UINT64_MAX} : int128_t{0, 0};
+        }
+    }
+    else
+    {
+        if (rhs >= 128)
+        {
+            return lhs.high < 0 ? int128_t{-1, UINT64_MAX} : int128_t{0, 0};
+        }
     }
 
     #ifdef BOOST_INT128_HAS_INT128
@@ -1982,7 +2031,7 @@ int128_t intrinsic_rs_impl(const int128_t lhs, const Integer rhs) noexcept
 } // namespace detail
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-constexpr int128_t operator>>(const int128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator>>(const int128_t lhs, const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_NO_CONSTEVAL_DETECTION
 
@@ -2002,19 +2051,19 @@ constexpr int128_t operator>>(const int128_t lhs, const Integer rhs) noexcept
     #endif
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator>>(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator>>(const int128_t lhs, const int128_t rhs) noexcept
 {
     if (rhs.high != 0 || rhs.low >= 128)
     {
         return 0;
     }
 
-    return lhs << rhs.low;
+    return lhs >> rhs.low;
 }
 
 #ifdef BOOST_INT128_HAS_INT128
 
-BOOST_INT128_EXPORT constexpr detail::builtin_u128 operator>>(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr detail::builtin_u128 operator>>(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(detail::builtin_u128) * 8};
 
@@ -2023,10 +2072,10 @@ BOOST_INT128_EXPORT constexpr detail::builtin_u128 operator>>(const detail::buil
         return 0;
     }
 
-    return lhs << rhs.low;
+    return lhs >> rhs.low;
 }
 
-BOOST_INT128_EXPORT constexpr detail::builtin_i128 operator>>(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr detail::builtin_i128 operator>>(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(detail::builtin_i128) * 8};
 
@@ -2035,13 +2084,13 @@ BOOST_INT128_EXPORT constexpr detail::builtin_i128 operator>>(const detail::buil
         return 0;
     }
 
-    return lhs << rhs.low;
+    return lhs >> rhs.low;
 }
 
 #endif
 
 BOOST_INT128_EXPORT template <typename SignedInteger, std::enable_if_t<detail::is_signed_integer_v<SignedInteger> && (sizeof(SignedInteger) * 8 <= 16), bool> = true>
-constexpr int operator>>(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int operator>>(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(SignedInteger) * 8};
 
@@ -2054,7 +2103,7 @@ constexpr int operator>>(const SignedInteger lhs, const int128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <typename UnsignedInteger, std::enable_if_t<detail::is_unsigned_integer_v<UnsignedInteger> && (sizeof(UnsignedInteger) * 8 <= 16), bool> = true>
-constexpr unsigned operator>>(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr unsigned operator>>(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(UnsignedInteger) * 8};
 
@@ -2072,13 +2121,13 @@ constexpr unsigned operator>>(const UnsignedInteger lhs, const int128_t rhs) noe
 #endif // _MSC_VER
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr int128_t& int128_t::operator>>=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator>>=(const Integer rhs) noexcept
 {
     *this = *this >> rhs;
     return *this;
 }
 
-constexpr int128_t& int128_t::operator>>=(const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator>>=(const int128_t rhs) noexcept
 {
     *this = *this >> rhs;
     return *this;
@@ -2087,7 +2136,7 @@ constexpr int128_t& int128_t::operator>>=(const int128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline int128_t& int128_t::operator>>=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator>>=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -2107,7 +2156,7 @@ inline int128_t& int128_t::operator>>=(const Integer rhs) noexcept
 // Increment Operators
 //=====================================
 
-constexpr int128_t& int128_t::operator++() noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator++() noexcept
 {
     if (++low == UINT64_C(0))
     {
@@ -2117,7 +2166,7 @@ constexpr int128_t& int128_t::operator++() noexcept
     return *this;
 }
 
-constexpr int128_t int128_t::operator++(int) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t int128_t::operator++(int) noexcept
 {
     const auto temp {*this};
     ++(*this);
@@ -2128,7 +2177,7 @@ constexpr int128_t int128_t::operator++(int) noexcept
 // Decrement Operators
 //=====================================
 
-constexpr int128_t& int128_t::operator--() noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator--() noexcept
 {
     if (low-- == UINT64_C(0))
     {
@@ -2138,7 +2187,7 @@ constexpr int128_t& int128_t::operator--() noexcept
     return *this;
 }
 
-constexpr int128_t int128_t::operator--(int) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t int128_t::operator--(int) noexcept
 {
     const auto temp {*this};
     --(*this);
@@ -2151,7 +2200,7 @@ constexpr int128_t int128_t::operator--(int) noexcept
 
 namespace detail {
 
-BOOST_INT128_FORCE_INLINE constexpr int128_t library_add(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t library_add(const int128_t lhs, const int128_t rhs) noexcept
 {
     const auto new_low {lhs.low + rhs.low};
     const auto new_high {static_cast<std::uint64_t>(lhs.high) +
@@ -2161,7 +2210,7 @@ BOOST_INT128_FORCE_INLINE constexpr int128_t library_add(const int128_t lhs, con
     return int128_t{static_cast<std::int64_t>(new_high), new_low};
 }
 
-BOOST_INT128_FORCE_INLINE constexpr int128_t default_add(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_add(const int128_t lhs, const int128_t rhs) noexcept
 {
     #if (defined(__x86_64__) || (defined(__aarch64__) && !defined(__APPLE__))) && !defined(_WIN32) && defined(BOOST_INT128_HAS_INT128)
 
@@ -2199,7 +2248,7 @@ BOOST_INT128_FORCE_INLINE constexpr int128_t default_add(const int128_t lhs, con
 }
 
 template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-BOOST_INT128_FORCE_INLINE constexpr int128_t default_add(const int128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_add(const int128_t lhs, const Integer rhs) noexcept
 {
     const auto new_low {lhs.low + rhs};
     const auto new_high {static_cast<std::uint64_t>(lhs.high) + static_cast<std::uint64_t>(new_low < lhs.low)};
@@ -2207,7 +2256,7 @@ BOOST_INT128_FORCE_INLINE constexpr int128_t default_add(const int128_t lhs, con
     return int128_t{static_cast<std::int64_t>(new_high), new_low};
 }
 
-BOOST_INT128_FORCE_INLINE constexpr int128_t library_sub(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t library_sub(const int128_t lhs, const int128_t rhs) noexcept
 {
     const auto new_low {lhs.low - rhs.low};
     const auto new_high {static_cast<std::uint64_t>(lhs.high) - static_cast<std::uint64_t>(rhs.high) - static_cast<std::uint64_t>(lhs.low < rhs.low)};
@@ -2215,9 +2264,9 @@ BOOST_INT128_FORCE_INLINE constexpr int128_t library_sub(const int128_t lhs, con
     return int128_t{static_cast<std::int64_t>(new_high), new_low};
 }
 
-BOOST_INT128_FORCE_INLINE constexpr int128_t default_sub(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_sub(const int128_t lhs, const int128_t rhs) noexcept
 {
-    #if defined(BOOST_INT128_HAS_BUILTIN_SUB_OVERFLOW) && (!defined(__aarch64__) || defined(__APPLE__) || !defined(BOOST_INT128_HAS_INT128))
+    #if defined(BOOST_INT128_HAS_BUILTIN_SUB_OVERFLOW) && (!defined(__aarch64__) || defined(__APPLE__) || !defined(BOOST_INT128_HAS_INT128)) && !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
 
     // __builtin_sub_overflow is marked constexpr so we don't need if consteval handling
     std::uint64_t result_low {};
@@ -2252,7 +2301,7 @@ BOOST_INT128_FORCE_INLINE constexpr int128_t default_sub(const int128_t lhs, con
 }
 
 template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-BOOST_INT128_FORCE_INLINE constexpr int128_t default_sub(const int128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_sub(const int128_t lhs, const Integer rhs) noexcept
 {
     const auto new_low {lhs.low - rhs};
     const auto new_high {static_cast<std::uint64_t>(lhs.high) - static_cast<std::uint64_t>(new_low > lhs.low)};
@@ -2265,14 +2314,14 @@ BOOST_INT128_FORCE_INLINE constexpr int128_t default_sub(const int128_t lhs, con
 // doing addition via subtraction is >10% faster in the benchmarks
 #if defined(__s390__) || defined(__s390x__)
 
-constexpr int128_t operator+(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator+(const int128_t lhs, const int128_t rhs) noexcept
 {
     return detail::default_sub(lhs, -rhs);
 }
 
 #else
 
-BOOST_INT128_EXPORT constexpr int128_t operator+(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator+(const int128_t lhs, const int128_t rhs) noexcept
 {
     return detail::default_add(lhs, rhs);
 }
@@ -2280,7 +2329,7 @@ BOOST_INT128_EXPORT constexpr int128_t operator+(const int128_t lhs, const int12
 #endif
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator+(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator+(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2297,7 +2346,7 @@ constexpr int128_t operator+(const int128_t lhs, const UnsignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator+(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator+(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2314,13 +2363,13 @@ constexpr int128_t operator+(const UnsignedInteger lhs, const int128_t rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator+(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator+(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return rhs > 0 ? detail::default_add(lhs, rhs) : detail::default_sub(lhs, -rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator+(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator+(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return lhs > 0 ? detail::default_add(rhs, lhs) : detail::default_sub(rhs, -lhs);
 }
@@ -2329,12 +2378,12 @@ constexpr int128_t operator+(const SignedInteger lhs, const int128_t rhs) noexce
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return detail::default_add(lhs, static_cast<int128_t>(rhs));
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return detail::default_add(rhs, static_cast<int128_t>(lhs));
 }
@@ -2342,14 +2391,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const deta
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
@@ -2357,12 +2406,12 @@ BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const T, const int128_t) noexc
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return detail::default_add(lhs, static_cast<int128_t>(rhs));
 }
 
-BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return detail::default_add(rhs, static_cast<int128_t>(lhs));
 }
@@ -2370,7 +2419,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator+(const detail::builtin_i128 lhs
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr int128_t& int128_t::operator+=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator+=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_signed_integer_v<Integer>, "Sign Conversion Error");
@@ -2380,7 +2429,7 @@ constexpr int128_t& int128_t::operator+=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr int128_t& int128_t::operator+=(const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator+=(const int128_t rhs) noexcept
 {
     *this = *this + rhs;
     return *this;
@@ -2389,7 +2438,7 @@ constexpr int128_t& int128_t::operator+=(const int128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline int128_t& int128_t::operator+=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator+=(const Integer rhs) noexcept
 {
     *this = *this + rhs;
     return *this;
@@ -2401,13 +2450,13 @@ inline int128_t& int128_t::operator+=(const Integer rhs) noexcept
 // Subtraction Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr int128_t operator-(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator-(const int128_t lhs, const int128_t rhs) noexcept
 {
     return detail::default_sub(lhs, rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator-(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator-(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2424,7 +2473,7 @@ constexpr int128_t operator-(const int128_t lhs, const UnsignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator-(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator-(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2441,13 +2490,13 @@ constexpr int128_t operator-(const UnsignedInteger lhs, const int128_t rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator-(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator-(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return detail::default_sub(lhs, static_cast<int128_t>(rhs));
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator-(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator-(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return detail::default_sub(static_cast<int128_t>(lhs), rhs);
 }
@@ -2456,12 +2505,12 @@ constexpr int128_t operator-(const SignedInteger lhs, const int128_t rhs) noexce
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs - static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) - rhs;
 }
@@ -2469,14 +2518,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const deta
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
@@ -2484,12 +2533,12 @@ BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const T, const int128_t) noexc
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs - static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) - rhs;
 }
@@ -2497,7 +2546,7 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR int128_t operator-(const deta
 #endif
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr int128_t& int128_t::operator-=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator-=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_signed_integer_v<Integer>, "Sign Conversion Error");
@@ -2507,7 +2556,7 @@ constexpr int128_t& int128_t::operator-=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr int128_t& int128_t::operator-=(const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator-=(const int128_t rhs) noexcept
 {
     *this = *this - rhs;
     return *this;
@@ -2516,7 +2565,7 @@ constexpr int128_t& int128_t::operator-=(const int128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline int128_t& int128_t::operator-=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator-=(const Integer rhs) noexcept
 {
     *this = *this - rhs;
     return *this;
@@ -2530,12 +2579,12 @@ inline int128_t& int128_t::operator-=(const Integer rhs) noexcept
 
 namespace detail {
 
-BOOST_INT128_FORCE_INLINE constexpr int128_t signed_shift_left_32(const std::uint64_t low) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t signed_shift_left_32(const std::uint64_t low) noexcept
 {
     return {static_cast<std::int64_t>(low >> 32), low << 32};
 }
 
-BOOST_INT128_FORCE_INLINE constexpr int128_t library_mul(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t library_mul(const int128_t lhs, const int128_t rhs) noexcept
 {
     const auto a {lhs.low >> 32U};
     const auto b {lhs.low & UINT32_MAX};
@@ -2548,7 +2597,7 @@ BOOST_INT128_FORCE_INLINE constexpr int128_t library_mul(const int128_t lhs, con
     return result;
 }
 
-BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, const std::uint64_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, const std::uint64_t rhs) noexcept
 {
     const auto low_res{lhs.low * rhs};
 
@@ -2571,7 +2620,7 @@ BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, con
     return {high_res, low_res};
 }
 
-BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, const std::uint32_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, const std::uint32_t rhs) noexcept
 {
     const auto low_res{lhs.low * rhs};
 
@@ -2585,7 +2634,7 @@ BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, con
 
 #if defined(_M_AMD64) && !defined(__GNUC__)
 
-BOOST_INT128_FORCE_INLINE int128_t msvc_amd64_mul(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE int128_t msvc_amd64_mul(const int128_t lhs, const int128_t rhs) noexcept
 {
     int128_t result {};
     result.low = _umul128(lhs.low, rhs.low, reinterpret_cast<std::uint64_t*>(&result.high));
@@ -2597,7 +2646,7 @@ BOOST_INT128_FORCE_INLINE int128_t msvc_amd64_mul(const int128_t lhs, const int1
 
 #endif
 
-BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, const int128_t rhs) noexcept
 {
     #if ((defined(__aarch64__) && defined(__APPLE__)) || defined(__x86_64__) || defined(__PPC__) || defined(__powerpc__)) && defined(__GNUC__) && !defined(__clang__) && defined(BOOST_INT128_HAS_INT128)
 
@@ -2680,13 +2729,13 @@ BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, con
 
 } // namespace detail
 
-BOOST_INT128_EXPORT constexpr int128_t operator*(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const int128_t lhs, const int128_t rhs) noexcept
 {
     return detail::default_mul(lhs, rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator*(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2704,7 +2753,7 @@ constexpr int128_t operator*(const int128_t lhs, const UnsignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator*(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2727,14 +2776,14 @@ constexpr int128_t operator*(const UnsignedInteger lhs, const int128_t rhs) noex
 #endif
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator*(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return rhs < 0 ? -detail::default_mul(lhs, -static_cast<std::uint64_t>(rhs)) :
                       detail::default_mul(lhs, static_cast<std::uint64_t>(rhs));
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator*(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return lhs < 0 ? -detail::default_mul(rhs, -static_cast<std::uint64_t>(lhs)) :
                       detail::default_mul(rhs, static_cast<std::uint64_t>(lhs));
@@ -2748,12 +2797,12 @@ constexpr int128_t operator*(const SignedInteger lhs, const int128_t rhs) noexce
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr int128_t operator*(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) * rhs);
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator*(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(static_cast<detail::builtin_i128>(rhs) * lhs);
 }
@@ -2761,14 +2810,14 @@ BOOST_INT128_EXPORT constexpr int128_t operator*(const detail::builtin_u128 lhs,
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator*(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator*(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
@@ -2776,12 +2825,12 @@ constexpr int128_t operator*(const T, const int128_t) noexcept
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr int128_t operator*(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return detail::default_mul(lhs, static_cast<int128_t>(rhs));
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator*(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator*(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return detail::default_mul(rhs, static_cast<int128_t>(lhs));
 }
@@ -2789,7 +2838,7 @@ BOOST_INT128_EXPORT constexpr int128_t operator*(const detail::builtin_i128 lhs,
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr int128_t& int128_t::operator*=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator*=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_signed_integer_v<Integer>, "Sign Conversion Error");
@@ -2799,7 +2848,7 @@ constexpr int128_t& int128_t::operator*=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr int128_t& int128_t::operator*=(const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator*=(const int128_t rhs) noexcept
 {
     *this = *this * rhs;
     return *this;
@@ -2808,7 +2857,7 @@ constexpr int128_t& int128_t::operator*=(const int128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline int128_t& int128_t::operator*=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator*=(const Integer rhs) noexcept
 {
     *this = *this * rhs;
     return *this;
@@ -2825,25 +2874,25 @@ inline int128_t& int128_t::operator*=(const Integer rhs) noexcept
 #  pragma clang diagnostic ignored "-Wassume"
 #endif
 
-BOOST_INT128_EXPORT constexpr int128_t operator/(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const int128_t lhs, const int128_t rhs) noexcept
 {
     if (BOOST_INT128_UNLIKELY(rhs == 0))
     {
         return {0, 0};
     }
 
+    constexpr int128_t min_val {INT64_MIN, 0};
     const auto abs_lhs {abs(lhs)};
     const auto abs_rhs {abs(rhs)};
 
-    if (abs_lhs < abs_rhs)
+    if (lhs != min_val && abs_lhs < abs_rhs)
     {
         return {0,0};
     }
     #if defined(BOOST_INT128_HAS_INT128)
-    else
-    {
-        return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) / static_cast<detail::builtin_i128>(rhs));
-    }
+
+    return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) / static_cast<detail::builtin_i128>(rhs));
+
     #else
 
     int128_t quotient {};
@@ -2870,7 +2919,7 @@ BOOST_INT128_EXPORT constexpr int128_t operator/(const int128_t lhs, const int12
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator/(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2898,7 +2947,7 @@ constexpr int128_t operator/(const int128_t lhs, const UnsignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator/(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2915,7 +2964,8 @@ constexpr int128_t operator/(const UnsignedInteger lhs, const int128_t rhs) noex
     {
         auto abs_rhs {abs(rhs)};
         const auto res {static_cast<std::uint64_t>(lhs) / abs_rhs.low};
-        return int128_t{rhs.high, res};
+        const int128_t result {0, res};
+        return rhs < 0 ? -result : result;
     }
 
     #else
@@ -2929,7 +2979,7 @@ constexpr int128_t operator/(const UnsignedInteger lhs, const int128_t rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator/(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     using eval_type = detail::evaluation_type_t<SignedInteger>;
 
@@ -2940,11 +2990,12 @@ constexpr int128_t operator/(const int128_t lhs, const SignedInteger rhs) noexce
 
     int128_t quotient {};
 
+    constexpr int128_t min_val {INT64_MIN, 0};
     const auto negative_res {static_cast<bool>((lhs.high < 0) ^ (rhs < 0))};
     const auto abs_rhs {rhs < 0 ? -rhs : rhs};
     const auto abs_lhs {abs(lhs)};
 
-    if (abs_lhs < abs_rhs)
+    if (lhs != min_val && abs_lhs < abs_rhs)
     {
         return {0, 0};
     }
@@ -2955,7 +3006,7 @@ constexpr int128_t operator/(const int128_t lhs, const SignedInteger rhs) noexce
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator/(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     if (BOOST_INT128_UNLIKELY(rhs == 0))
     {
@@ -2981,12 +3032,12 @@ constexpr int128_t operator/(const SignedInteger lhs, const int128_t rhs) noexce
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr int128_t operator/(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) / rhs);
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator/(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs / static_cast<detail::builtin_i128>(rhs));
 }
@@ -2994,14 +3045,14 @@ BOOST_INT128_EXPORT constexpr int128_t operator/(const detail::builtin_u128 lhs,
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator/(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator/(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
@@ -3009,12 +3060,12 @@ constexpr int128_t operator/(const T, const int128_t) noexcept
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr int128_t operator/(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) / rhs);
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator/(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs / static_cast<detail::builtin_i128>(rhs));
 }
@@ -3023,12 +3074,12 @@ BOOST_INT128_EXPORT constexpr int128_t operator/(const detail::builtin_i128 lhs,
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT inline int128_t operator/(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE inline int128_t operator/(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs / static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT inline int128_t operator/(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE inline int128_t operator/(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) / rhs;
 }
@@ -3036,14 +3087,14 @@ BOOST_INT128_EXPORT inline int128_t operator/(const detail::builtin_u128 lhs, co
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-inline int128_t operator/(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t operator/(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-inline int128_t operator/(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t operator/(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
@@ -3051,12 +3102,12 @@ inline int128_t operator/(const T, const int128_t) noexcept
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT inline int128_t operator/(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE inline int128_t operator/(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs / static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT inline int128_t operator/(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE inline int128_t operator/(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) / rhs;
 }
@@ -3064,7 +3115,7 @@ BOOST_INT128_EXPORT inline int128_t operator/(const detail::builtin_i128 lhs, co
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr int128_t& int128_t::operator/=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator/=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_signed_integer_v<Integer>, "Sign Conversion Error");
@@ -3074,7 +3125,7 @@ constexpr int128_t& int128_t::operator/=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr int128_t& int128_t::operator/=(const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator/=(const int128_t rhs) noexcept
 {
     *this = *this / rhs;
     return *this;
@@ -3083,7 +3134,7 @@ constexpr int128_t& int128_t::operator/=(const int128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline int128_t& int128_t::operator/=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator/=(const Integer rhs) noexcept
 {
     *this = *this / rhs;
     return *this;
@@ -3102,21 +3153,21 @@ inline int128_t& int128_t::operator/=(const Integer rhs) noexcept
 //=====================================
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator%(int128_t lhs, UnsignedInteger rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(int128_t lhs, UnsignedInteger rhs) noexcept;
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator%(UnsignedInteger lhs, int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(UnsignedInteger lhs, int128_t rhs) noexcept;
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator%(int128_t lhs, SignedInteger rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(int128_t lhs, SignedInteger rhs) noexcept;
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator%(SignedInteger lhs, int128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(SignedInteger lhs, int128_t rhs) noexcept;
 
-BOOST_INT128_EXPORT constexpr int128_t operator%(int128_t lhs, int128_t rhs) noexcept;
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(int128_t lhs, int128_t rhs) noexcept;
 
 template <BOOST_INT128_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator%(const int128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -3147,7 +3198,7 @@ constexpr int128_t operator%(const int128_t lhs, const UnsignedInteger rhs) noex
 }
 
 template <BOOST_INT128_UNSIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator%(const UnsignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const UnsignedInteger lhs, const int128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -3165,9 +3216,9 @@ constexpr int128_t operator%(const UnsignedInteger lhs, const int128_t rhs) noex
         return lhs;
     }
 
-    const int128_t remainder {0, static_cast<eval_type>(lhs) % rhs.low};
+    const int128_t remainder {0, static_cast<eval_type>(lhs) % abs_rhs.low};
 
-    return rhs < 0 ? -remainder : remainder;
+    return remainder;
 
     #else
 
@@ -3180,28 +3231,29 @@ constexpr int128_t operator%(const UnsignedInteger lhs, const int128_t rhs) noex
 }
 
 template <BOOST_INT128_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator%(const int128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const SignedInteger rhs) noexcept
 {
     return lhs % static_cast<int128_t>(rhs);
 }
 
 template <BOOST_INT128_SIGNED_INTEGER_CONCEPT>
-constexpr int128_t operator%(const SignedInteger lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const SignedInteger lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) % rhs;
 }
 
-constexpr int128_t operator%(const int128_t lhs, const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const int128_t rhs) noexcept
 {
     if (rhs == 0)
     {
         return {0, 0};
     }
 
+    constexpr int128_t min_val {INT64_MIN, 0};
     const auto abs_lhs {abs(lhs)};
     const auto abs_rhs {abs(rhs)};
 
-    if (abs_rhs > abs_lhs)
+    if (lhs != min_val && rhs != min_val && abs_rhs > abs_lhs)
     {
         return lhs;
     }
@@ -3212,7 +3264,7 @@ constexpr int128_t operator%(const int128_t lhs, const int128_t rhs) noexcept
     }
     #else
 
-    const auto is_neg{static_cast<bool>(lhs < 0)};
+    const auto is_neg{lhs < 0};
     
     int128_t remainder {};
 
@@ -3241,24 +3293,24 @@ constexpr int128_t operator%(const int128_t lhs, const int128_t rhs) noexcept
 
 #ifdef BOOST_INT128_HAS_INT128
 
-BOOST_INT128_EXPORT constexpr int128_t operator%(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return static_cast<detail::builtin_i128>(lhs) % rhs;
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator%(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return lhs % static_cast<detail::builtin_i128>(rhs);
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr int128_t operator%(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return static_cast<int128_t>(static_cast<detail::builtin_u128>(lhs) % rhs);
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator%(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs % static_cast<detail::builtin_u128>(rhs));
 }
@@ -3266,14 +3318,14 @@ BOOST_INT128_EXPORT constexpr int128_t operator%(const detail::builtin_u128 lhs,
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator%(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-constexpr int128_t operator%(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
@@ -3283,24 +3335,24 @@ constexpr int128_t operator%(const T, const int128_t) noexcept
 
 #elif defined(BOOST_INT128_HAS_MSVC_INT128)
 
-BOOST_INT128_EXPORT inline int128_t operator%(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE inline int128_t operator%(const int128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs % static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT inline int128_t operator%(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE inline int128_t operator%(const detail::builtin_i128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) % rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT inline int128_t operator%(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE inline int128_t operator%(const int128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs % static_cast<int128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT inline int128_t operator%(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE inline int128_t operator%(const detail::builtin_u128 lhs, const int128_t rhs) noexcept
 {
     return static_cast<int128_t>(lhs) % rhs;
 }
@@ -3308,14 +3360,14 @@ BOOST_INT128_EXPORT inline int128_t operator%(const detail::builtin_u128 lhs, co
 #else // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-inline int128_t operator%(const int128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t operator%(const int128_t, const T) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_u128>::value, bool> = true>
-inline int128_t operator%(const T, const int128_t) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t operator%(const T, const int128_t) noexcept
 {
     static_assert(detail::is_signed_integer_v<T>, "Sign Compare Error");
     return {0, 0};
@@ -3326,7 +3378,7 @@ inline int128_t operator%(const T, const int128_t) noexcept
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr int128_t& int128_t::operator%=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator%=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_signed_integer_v<Integer>, "Sign Conversion Error");
@@ -3336,7 +3388,7 @@ constexpr int128_t& int128_t::operator%=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr int128_t& int128_t::operator%=(const int128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t& int128_t::operator%=(const int128_t rhs) noexcept
 {
     *this = *this % rhs;
     return *this;
@@ -3345,7 +3397,7 @@ constexpr int128_t& int128_t::operator%=(const int128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline int128_t& int128_t::operator%=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator%=(const Integer rhs) noexcept
 {
     *this = *this % rhs;
     return *this;
@@ -3403,15 +3455,15 @@ class numeric_limits_impl_i128
     static constexpr bool tinyness_before = false;
 
     // Member functions
-    static constexpr auto (min)        () -> boost::int128::int128_t { return {INT64_MIN, 0}; }
-    static constexpr auto lowest       () -> boost::int128::int128_t { return {INT64_MIN, 0}; }
-    static constexpr auto (max)        () -> boost::int128::int128_t { return {INT64_MAX, UINT64_MAX}; }
-    static constexpr auto epsilon      () -> boost::int128::int128_t { return {0, 0}; }
-    static constexpr auto round_error  () -> boost::int128::int128_t { return {0, 0}; }
-    static constexpr auto infinity     () -> boost::int128::int128_t { return {0, 0}; }
-    static constexpr auto quiet_NaN    () -> boost::int128::int128_t { return {0, 0}; }
-    static constexpr auto signaling_NaN() -> boost::int128::int128_t { return {0, 0}; }
-    static constexpr auto denorm_min   () -> boost::int128::int128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto (min)        () -> boost::int128::int128_t { return {INT64_MIN, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto lowest       () -> boost::int128::int128_t { return {INT64_MIN, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto (max)        () -> boost::int128::int128_t { return {INT64_MAX, UINT64_MAX}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto epsilon      () -> boost::int128::int128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto round_error  () -> boost::int128::int128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto infinity     () -> boost::int128::int128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto quiet_NaN    () -> boost::int128::int128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto signaling_NaN() -> boost::int128::int128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto denorm_min   () -> boost::int128::int128_t { return {0, 0}; }
 };
 
 #if !defined(__cpp_inline_variables) || __cpp_inline_variables < 201606L
diff --git a/include/boost/int128/detail/mini_from_chars.hpp b/include/boost/int128/detail/mini_from_chars.hpp
index c7aca800..a399e1d8 100644
--- a/include/boost/int128/detail/mini_from_chars.hpp
+++ b/include/boost/int128/detail/mini_from_chars.hpp
@@ -22,6 +22,9 @@ namespace int128 {
 namespace detail {
 
 namespace impl {
+
+#if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
 BOOST_INT128_INLINE_CONSTEXPR unsigned char uchar_values[] =
      {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
       255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
@@ -42,14 +45,40 @@ BOOST_INT128_INLINE_CONSTEXPR unsigned char uchar_values[] =
 
 static_assert(sizeof(uchar_values) == 256, "uchar_values should represent all 256 values of unsigned char");
 
+#endif // !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
 // Convert characters for 0-9, A-Z, a-z to 0-35. Anything else is 255
-BOOST_INT128_FORCE_INLINE constexpr auto digit_from_char(char val) noexcept -> unsigned char
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr auto digit_from_char(char val) noexcept -> unsigned char
 {
+    #if defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA)
+    // The namespace-scope uchar_values table is preprocessed out under this condition, so a function-local copy is declared here.
+    constexpr unsigned char uchar_values[] =
+    {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+       0,   1,   2,   3,   4,   5,   6,   7,   8,   9, 255, 255, 255, 255, 255, 255,
+     255,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+      25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35, 255, 255, 255, 255, 255,
+     255,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+      25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35, 255, 255, 255, 255, 255,
+     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255};
+    // One entry per unsigned char value: '0'-'9' map to 0-9, letters of either case to 10-35, anything else to 255.
+    static_assert(sizeof(uchar_values) == 256, "uchar_values should represent all 256 values of unsigned char");
+
+    #endif // defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA)
+
     return uchar_values[static_cast<unsigned char>(val)];
 }
 
 template <typename Integer, typename Unsigned_Integer>
-constexpr int from_chars_integer_impl(const char* first, const char* last, Integer& value, int base) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int from_chars_integer_impl(const char* first, const char* last, Integer& value, int base) noexcept
 {
     if (first >= last)
     {
@@ -76,8 +105,8 @@ constexpr int from_chars_integer_impl(const char* first, const char* last, Integ
             ++next;
         }
 
-        overflow_value = (std::numeric_limits<Integer>::max)();
-        max_digit = (std::numeric_limits<Integer>::max)();
+        overflow_value = static_cast<Unsigned_Integer>((std::numeric_limits<Integer>::max)());
+        max_digit = static_cast<Unsigned_Integer>((std::numeric_limits<Integer>::max)());
 
         if (is_negative)
         {
@@ -154,7 +183,7 @@ constexpr int from_chars_integer_impl(const char* first, const char* last, Integ
     }
 
     // Return the parsed value, adding the sign back if applicable
-    // If we have overflowed then we do not return the result
+    // If we have overflowed, then we do not return the result
     if (overflowed)
     {
         return EDOM;
@@ -172,16 +201,27 @@ constexpr int from_chars_integer_impl(const char* first, const char* last, Integ
 
     // This value will be negative to differentiate from errno values
     // since they are in the range of acceptable distances
+
+    // This cast is useless on 32-bit platforms
+    #if defined(__GNUC__) && !defined(__clang__)
+    #  pragma GCC diagnostic push
+    #  pragma GCC diagnostic ignored "-Wuseless-cast"
+    #endif
+
     return static_cast<int>(first - next);
+
+    #if defined(__GNUC__) && !defined(__clang__)
+    #  pragma GCC diagnostic pop
+    #endif
 }
 } // namespace impl
 
-constexpr int from_chars(const char* first, const char* last, uint128_t& value, int base = 10) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int from_chars(const char* first, const char* last, uint128_t& value, int base = 10) noexcept
 {
     return impl::from_chars_integer_impl<uint128_t, uint128_t>(first, last, value, base);
 }
 
-constexpr int from_chars(const char* first, const char* last, int128_t& value, int base = 10) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int from_chars(const char* first, const char* last, int128_t& value, int base = 10) noexcept
 {
     return impl::from_chars_integer_impl<int128_t, uint128_t>(first, last, value, base);
 }
diff --git a/include/boost/int128/detail/mini_to_chars.hpp b/include/boost/int128/detail/mini_to_chars.hpp
index 0d491841..80e1fcd0 100644
--- a/include/boost/int128/detail/mini_to_chars.hpp
+++ b/include/boost/int128/detail/mini_to_chars.hpp
@@ -12,6 +12,8 @@ namespace boost {
 namespace int128 {
 namespace detail {
 
+#if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
 BOOST_INT128_INLINE_CONSTEXPR char lower_case_digit_table[] = {
     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
     'a', 'b', 'c', 'd', 'e', 'f'
@@ -26,8 +28,22 @@ BOOST_INT128_INLINE_CONSTEXPR char upper_case_digit_table[] = {
 
 static_assert(sizeof(upper_case_digit_table) == sizeof(char) * 16, "10 numbers, and 6 letters");
 
-constexpr char* mini_to_chars(char (&buffer)[64], uint128_t v, const int base, const bool uppercase) noexcept
+#endif // !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
+BOOST_INT128_HOST_DEVICE constexpr char* mini_to_chars(char (&buffer)[64], uint128_t v, const int base, const bool uppercase) noexcept
 {
+    #if defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA)
+    constexpr char lower_case_digit_table[] = {
+        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+        'a', 'b', 'c', 'd', 'e', 'f'
+    };
+
+    constexpr char upper_case_digit_table[] = {
+        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+        'A', 'B', 'C', 'D', 'E', 'F'
+    };
+    #endif
+
     char* last {buffer + 64U};
     *--last = '\0';
 
@@ -81,7 +97,7 @@ constexpr char* mini_to_chars(char (&buffer)[64], uint128_t v, const int base, c
     return last;
 }
 
-constexpr char* mini_to_chars(char (&buffer)[64], const int128_t v, const int base, const bool uppercase) noexcept
+BOOST_INT128_HOST_DEVICE constexpr char* mini_to_chars(char (&buffer)[64], const int128_t v, const int base, const bool uppercase) noexcept
 {
     char* p {nullptr};
 
diff --git a/include/boost/int128/detail/uint128_imp.hpp b/include/boost/int128/detail/uint128_imp.hpp
index 987d29c2..3fd7c9b7 100644
--- a/include/boost/int128/detail/uint128_imp.hpp
+++ b/include/boost/int128/detail/uint128_imp.hpp
@@ -56,27 +56,28 @@ uint128_t
     constexpr uint128_t& operator=(const uint128_t&) noexcept = default;
     constexpr uint128_t& operator=(uint128_t&&) noexcept = default;
 
-    // Requires conversion file to be implemented
-    constexpr uint128_t(const int128_t& v) noexcept;
+    // Requires a conversion file to be implemented
+    BOOST_INT128_HOST_DEVICE explicit constexpr uint128_t(const int128_t& v) noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator int128_t() const noexcept;
 
     // Construct from integral types
     #if BOOST_INT128_ENDIAN_LITTLE_BYTE
 
-    constexpr uint128_t(const std::uint64_t hi, const std::uint64_t lo) noexcept : low {lo}, high {hi} {}
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const std::uint64_t hi, const std::uint64_t lo) noexcept : low {lo}, high {hi} {}
 
     template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-    constexpr uint128_t(const SignedInteger v) noexcept : low {static_cast<std::uint64_t>(v)}, high {v < 0 ? UINT64_MAX : UINT64_C(0)} {}
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const SignedInteger v) noexcept : low {static_cast<std::uint64_t>(v)}, high {v < 0 ? UINT64_MAX : UINT64_C(0)} {}
 
     template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-    constexpr uint128_t(const UnsignedInteger v) noexcept : low {static_cast<std::uint64_t>(v)}, high {} {}
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const UnsignedInteger v) noexcept : low {static_cast<std::uint64_t>(v)}, high {} {}
 
     #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-    BOOST_INT128_BUILTIN_CONSTEXPR uint128_t(const detail::builtin_i128 v) noexcept :
+    BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t(const detail::builtin_i128 v) noexcept :
         low {static_cast<std::uint64_t>(v)},
         high {static_cast<std::uint64_t>(static_cast<detail::builtin_u128>(v) >> static_cast<detail::builtin_u128>(64U))} {}
 
-    BOOST_INT128_BUILTIN_CONSTEXPR uint128_t(const detail::builtin_u128 v) noexcept :
+    BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t(const detail::builtin_u128 v) noexcept :
         low {static_cast<std::uint64_t>(v)},
         high {static_cast<std::uint64_t>(v >> static_cast<detail::builtin_i128>(64U))} {}
 
@@ -84,21 +85,21 @@ uint128_t
 
     #else // Big endian
 
-    constexpr uint128_t(const std::uint64_t hi, const std::uint64_t lo) noexcept : high {hi}, low {lo} {}
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const std::uint64_t hi, const std::uint64_t lo) noexcept : high {hi}, low {lo} {}
 
     template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-    constexpr uint128_t(const SignedInteger v) noexcept : high {v < 0 ? UINT64_MAX : UINT64_C(0)}, low {static_cast<std::uint64_t>(v)} {}
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const SignedInteger v) noexcept : high {v < 0 ? UINT64_MAX : UINT64_C(0)}, low {static_cast<std::uint64_t>(v)} {}
 
     template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-    constexpr uint128_t(const UnsignedInteger v) noexcept : high {}, low {static_cast<std::uint64_t>(v)} {}
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const UnsignedInteger v) noexcept : high {}, low {static_cast<std::uint64_t>(v)} {}
 
     #ifdef BOOST_INT128_HAS_INT128
 
-    constexpr uint128_t(const detail::builtin_i128 v) noexcept :
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const detail::builtin_i128 v) noexcept :
         high {static_cast<std::uint64_t>(static_cast<detail::builtin_u128>(v) >> 64U)},
         low {static_cast<std::uint64_t>(v)} {}
 
-    constexpr uint128_t(const detail::builtin_u128 v) noexcept :
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t(const detail::builtin_u128 v) noexcept :
         high {static_cast<std::uint64_t>(v >> 64U)},
         low {static_cast<std::uint64_t>(v)} {}
 
@@ -107,161 +108,165 @@ uint128_t
     #endif // BOOST_INT128_ENDIAN_LITTLE_BYTE
 
     // Integer conversion operators
-    explicit constexpr operator bool() const noexcept {return low || high; }
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator bool() const noexcept {return low || high; }
 
     template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-    explicit constexpr operator SignedInteger() const noexcept { return static_cast<SignedInteger>(low); }
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator SignedInteger() const noexcept { return static_cast<SignedInteger>(low); }
 
     template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-    explicit constexpr operator UnsignedInteger() const noexcept { return static_cast<UnsignedInteger>(low); }
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator UnsignedInteger() const noexcept { return static_cast<UnsignedInteger>(low); }
 
     #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-    explicit BOOST_INT128_BUILTIN_CONSTEXPR operator detail::builtin_i128() const noexcept { return static_cast<detail::builtin_i128>(static_cast<detail::builtin_u128>(high) << static_cast<detail::builtin_u128>(64)) | static_cast<detail::builtin_i128>(low); }
+    BOOST_INT128_HOST_DEVICE explicit BOOST_INT128_BUILTIN_CONSTEXPR operator detail::builtin_i128() const noexcept { return static_cast<detail::builtin_i128>(static_cast<detail::builtin_u128>(high) << static_cast<detail::builtin_u128>(64)) | static_cast<detail::builtin_i128>(low); }
 
-    explicit BOOST_INT128_BUILTIN_CONSTEXPR operator detail::builtin_u128() const noexcept { return (static_cast<detail::builtin_u128>(high) << static_cast<detail::builtin_u128>(64)) | static_cast<detail::builtin_u128>(low); }
+    BOOST_INT128_HOST_DEVICE explicit BOOST_INT128_BUILTIN_CONSTEXPR operator detail::builtin_u128() const noexcept { return (static_cast<detail::builtin_u128>(high) << static_cast<detail::builtin_u128>(64)) | static_cast<detail::builtin_u128>(low); }
 
     #endif // BOOST_INT128_HAS_INT128
 
     // Conversion to float
     // This is basically the same as ldexp(static_cast<T>(high), 64) + static_cast<T>(low),
     // but can be constexpr at C++11 instead of C++26
-    explicit constexpr operator float() const noexcept;
-    explicit constexpr operator double() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator float() const noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator double() const noexcept;
+
+    // long doubles do not exist on device
+    #if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
     explicit constexpr operator long double() const noexcept;
+    #endif
 
     // Compound OR
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr uint128_t& operator|=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator|=(Integer rhs) noexcept;
 
-    constexpr uint128_t& operator|=(uint128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator|=(uint128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline uint128_t& operator|=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline uint128_t& operator|=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound AND
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr uint128_t& operator&=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator&=(Integer rhs) noexcept;
 
-    constexpr uint128_t& operator&=(uint128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator&=(uint128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline uint128_t& operator&=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline uint128_t& operator&=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound XOR
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr uint128_t& operator^=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator^=(Integer rhs) noexcept;
 
-    constexpr uint128_t& operator^=(uint128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator^=(uint128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline uint128_t& operator^=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline uint128_t& operator^=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Left Shift
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr uint128_t& operator<<=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator<<=(Integer rhs) noexcept;
 
-    constexpr uint128_t& operator<<=(uint128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator<<=(uint128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline uint128_t& operator<<=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline uint128_t& operator<<=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Right Shift
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr uint128_t& operator>>=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator>>=(Integer rhs) noexcept;
 
-    constexpr uint128_t& operator>>=(uint128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator>>=(uint128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline uint128_t& operator>>=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline uint128_t& operator>>=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
-    constexpr uint128_t& operator++() noexcept;
-    constexpr uint128_t operator++(int) noexcept;
-    constexpr uint128_t& operator--() noexcept;
-    constexpr uint128_t operator--(int) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator++() noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t operator++(int) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator--() noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t operator--(int) noexcept;
 
     // Compound Addition
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr uint128_t& operator+=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator+=(Integer rhs) noexcept;
 
-    constexpr uint128_t& operator+=(uint128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator+=(uint128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline uint128_t& operator+=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline uint128_t& operator+=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Subtraction
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr uint128_t& operator-=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator-=(Integer rhs) noexcept;
 
-    constexpr uint128_t& operator-=(uint128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator-=(uint128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline uint128_t& operator-=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline uint128_t& operator-=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Multiplication
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr uint128_t& operator*=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator*=(Integer rhs) noexcept;
 
-    constexpr uint128_t& operator*=(uint128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator*=(uint128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline uint128_t& operator*=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline uint128_t& operator*=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound Division
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr uint128_t& operator/=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator/=(Integer rhs) noexcept;
 
-    constexpr uint128_t& operator/=(uint128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator/=(uint128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline uint128_t& operator/=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline uint128_t& operator/=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 
     // Compound modulo
     template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-    constexpr uint128_t& operator%=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator%=(Integer rhs) noexcept;
 
-    constexpr uint128_t& operator%=(uint128_t rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE constexpr uint128_t& operator%=(uint128_t rhs) noexcept;
 
     #ifdef BOOST_INT128_HAS_MSVC_INT128
 
     template <BOOST_INT128_DEFAULTED_128BIT_INTEGER_CONCEPT>
-    inline uint128_t& operator%=(Integer rhs) noexcept;
+    BOOST_INT128_HOST_DEVICE inline uint128_t& operator%=(Integer rhs) noexcept;
 
     #endif // BOOST_INT128_HAS_MSVC_INT128
 };
@@ -270,7 +275,7 @@ uint128_t
 // Absolute Value function
 //=====================================
 
-BOOST_INT128_EXPORT constexpr uint128_t abs(const uint128_t value) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t abs(const uint128_t value) noexcept
 {
     return value;
 }
@@ -284,31 +289,35 @@ BOOST_INT128_EXPORT constexpr uint128_t abs(const uint128_t value) noexcept
 // by 0xFFFFFFFF in order to generally replicate what ldexp is doing in the constexpr context.
 // We also avoid pulling in <quadmath.h> for the __float128 case where we would need ldexpq
 
-constexpr uint128_t::operator float() const noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t::operator float() const noexcept
 {
     return static_cast<float>(high) * detail::offset_value_v<float> + static_cast<float>(low);
 }
 
-constexpr uint128_t::operator double() const noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t::operator double() const noexcept
 {
     return static_cast<double>(high) * detail::offset_value_v<double> + static_cast<double>(low);
 }
 
+#if !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
 constexpr uint128_t::operator long double() const noexcept
 {
     return static_cast<long double>(high) * detail::offset_value_v<long double> + static_cast<long double>(low);
 }
 
+#endif // !(defined(__CUDACC__) && defined(BOOST_INT128_ENABLE_CUDA))
+
 //=====================================
 // Unary Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr uint128_t operator+(const uint128_t value) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(const uint128_t value) noexcept
 {
     return value;
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator-(const uint128_t value) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(const uint128_t value) noexcept
 {
     return {~value.high + static_cast<std::uint64_t>(value.low == UINT64_C(0)), ~value.low + UINT64_C(1)};
 }
@@ -317,12 +326,12 @@ BOOST_INT128_EXPORT constexpr uint128_t operator-(const uint128_t value) noexcep
 // Equality Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr bool operator==(const uint128_t lhs, const bool rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator==(const uint128_t lhs, const bool rhs) noexcept
 {
     return lhs.high == UINT64_C(0) && lhs.low == static_cast<std::uint64_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr bool operator==(const bool lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator==(const bool lhs, const uint128_t rhs) noexcept
 {
     return rhs.high == UINT64_C(0) && rhs.low == static_cast<std::uint64_t>(lhs);
 }
@@ -336,7 +345,7 @@ BOOST_INT128_EXPORT constexpr bool operator==(const bool lhs, const uint128_t rh
 #endif
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator==(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -353,7 +362,7 @@ constexpr bool operator==(const uint128_t lhs, const SignedInteger rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator==(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -370,18 +379,18 @@ constexpr bool operator==(const SignedInteger lhs, const uint128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator==(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return lhs.high == UINT64_C(0) && lhs.low == static_cast<std::uint64_t>(rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator==(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator==(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return rhs.high == UINT64_C(0) && rhs.low == static_cast<std::uint64_t>(lhs);
 }
 
-BOOST_INT128_EXPORT constexpr bool operator==(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator==(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_AMD64)
 
@@ -417,12 +426,12 @@ BOOST_INT128_EXPORT constexpr bool operator==(const uint128_t lhs, const uint128
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs == static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) == rhs;
 }
@@ -430,14 +439,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail:
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
@@ -445,12 +454,12 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const T, const uint128_t) noexcep
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs == static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) == rhs;
 }
@@ -461,18 +470,18 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator==(const detail:
 // Inequality Operators
 //=====================================
 
-BOOST_INT128_EXPORT constexpr bool operator!=(const uint128_t lhs, const bool rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const uint128_t lhs, const bool rhs) noexcept
 {
     return lhs.high != UINT64_C(0) || lhs.low != static_cast<std::uint64_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr bool operator!=(const bool lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const bool lhs, const uint128_t rhs) noexcept
 {
     return rhs.high != UINT64_C(0) || rhs.low != static_cast<std::uint64_t>(lhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator!=(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -489,7 +498,7 @@ constexpr bool operator!=(const uint128_t lhs, const SignedInteger rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator!=(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -506,18 +515,18 @@ constexpr bool operator!=(const SignedInteger lhs, const uint128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator!=(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return lhs.high != UINT64_C(0) || lhs.low != static_cast<std::uint64_t>(rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator!=(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return rhs.high != UINT64_C(0) || rhs.low != static_cast<std::uint64_t>(lhs);
 }
 
-BOOST_INT128_EXPORT constexpr bool operator!=(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator!=(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_AMD64)
 
@@ -553,12 +562,12 @@ BOOST_INT128_EXPORT constexpr bool operator!=(const uint128_t lhs, const uint128
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs != static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) != rhs;
 }
@@ -566,14 +575,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail:
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
@@ -581,12 +590,12 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const T, const uint128_t) noexcep
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs != static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) != rhs;
 }
@@ -598,7 +607,7 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator!=(const detail:
 //=====================================
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator<(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -615,7 +624,7 @@ constexpr bool operator<(const uint128_t lhs, const SignedInteger rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator<(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -632,18 +641,18 @@ constexpr bool operator<(const SignedInteger lhs, const uint128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator<(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return lhs.high == UINT64_C(0) && lhs.low < static_cast<std::uint64_t>(rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator<(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return rhs.high > UINT64_C(0) || static_cast<std::uint64_t>(lhs) < rhs.low;
 }
 
-BOOST_INT128_EXPORT constexpr bool operator<(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator<(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     // On ARM macs only with the clang compiler is casting to unsigned __int128 uniformly better (and seemingly cost free)
     #if defined(__clang__) && defined(BOOST_INT128_HAS_INT128)
@@ -709,12 +718,12 @@ BOOST_INT128_EXPORT constexpr bool operator<(const uint128_t lhs, const uint128_
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs < static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) < rhs;
 }
@@ -722,14 +731,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
@@ -737,12 +746,12 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const T, const uint128_t) noexcept
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs < static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) < rhs;
 }
@@ -754,7 +763,7 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<(const detail::
 //=====================================
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator<=(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -771,7 +780,7 @@ constexpr bool operator<=(const uint128_t lhs, const SignedInteger rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator<=(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -788,18 +797,18 @@ constexpr bool operator<=(const SignedInteger lhs, const uint128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator<=(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return lhs.high == UINT64_C(0) && lhs.low <= static_cast<std::uint64_t>(rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator<=(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return rhs.high > UINT64_C(0) || static_cast<std::uint64_t>(lhs) <= rhs.low;
 }
 
-BOOST_INT128_EXPORT constexpr bool operator<=(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator<=(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     #if defined(__clang__) && defined(BOOST_INT128_HAS_INT128)
 
@@ -864,22 +873,22 @@ BOOST_INT128_EXPORT constexpr bool operator<=(const uint128_t lhs, const uint128
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs <= static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) <= rhs;
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs <= static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) <= rhs;
 }
@@ -887,14 +896,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const detail:
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
@@ -909,7 +918,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator<=(const T, const uint128_t) noexcep
 //=====================================
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator>(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -926,7 +935,7 @@ constexpr bool operator>(const uint128_t lhs, const SignedInteger rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator>(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -943,18 +952,18 @@ constexpr bool operator>(const SignedInteger lhs, const uint128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator>(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return lhs.high > UINT64_C(0) || lhs.low > static_cast<std::uint64_t>(rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator>(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return rhs.high == UINT64_C(0) && static_cast<std::uint64_t>(lhs) > rhs.low;
 }
 
-BOOST_INT128_EXPORT constexpr bool operator>(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator>(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     #if defined(__clang__) && defined(BOOST_INT128_HAS_INT128)
 
@@ -1019,22 +1028,22 @@ BOOST_INT128_EXPORT constexpr bool operator>(const uint128_t lhs, const uint128_
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs > static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) > rhs;
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs > static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) > rhs;
 }
@@ -1042,14 +1051,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const detail::
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
@@ -1064,7 +1073,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator>(const T, const uint128_t) noexcept
 //=====================================
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator>=(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -1081,7 +1090,7 @@ constexpr bool operator>=(const uint128_t lhs, const SignedInteger rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr bool operator>=(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -1098,18 +1107,18 @@ constexpr bool operator>=(const SignedInteger lhs, const uint128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator>=(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return lhs.high > UINT64_C(0) || lhs.low >= static_cast<std::uint64_t>(rhs);
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr bool operator>=(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return rhs.high == UINT64_C(0) && static_cast<std::uint64_t>(lhs) >= rhs.low;
 }
 
-BOOST_INT128_EXPORT constexpr bool operator>=(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr bool operator>=(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     #if defined(__clang__) && defined(BOOST_INT128_HAS_INT128)
 
@@ -1172,24 +1181,24 @@ BOOST_INT128_EXPORT constexpr bool operator>=(const uint128_t lhs, const uint128
 
 #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs >= static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) >= rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs >= static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) >= rhs;
 }
@@ -1197,14 +1206,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const detail:
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Compare Error");
     return true;
@@ -1220,7 +1229,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR bool operator>=(const T, const uint128_t) noexcep
 
 #ifdef BOOST_INT128_HAS_SPACESHIP_OPERATOR
 
-BOOST_INT128_EXPORT constexpr std::strong_ordering operator<=>(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     if (lhs < rhs)
     {
@@ -1237,7 +1246,7 @@ BOOST_INT128_EXPORT constexpr std::strong_ordering operator<=>(const uint128_t l
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr std::strong_ordering operator<=>(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     if (lhs < rhs)
     {
@@ -1254,7 +1263,7 @@ constexpr std::strong_ordering operator<=>(const uint128_t lhs, const UnsignedIn
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr std::strong_ordering operator<=>(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     if (lhs < rhs)
     {
@@ -1271,7 +1280,7 @@ constexpr std::strong_ordering operator<=>(const UnsignedInteger lhs, const uint
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr std::strong_ordering operator<=>(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -1299,7 +1308,7 @@ constexpr std::strong_ordering operator<=>(const SignedInteger lhs, const uint12
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr std::strong_ordering operator<=>(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr std::strong_ordering operator<=>(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_COMPARE
 
@@ -1332,7 +1341,7 @@ constexpr std::strong_ordering operator<=>(const uint128_t lhs, const SignedInte
 // Not Operator
 //=====================================
 
-BOOST_INT128_EXPORT constexpr uint128_t operator~(const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator~(const uint128_t rhs) noexcept
 {
     return {~rhs.high, ~rhs.low};
 }
@@ -1342,7 +1351,7 @@ BOOST_INT128_EXPORT constexpr uint128_t operator~(const uint128_t rhs) noexcept
 //=====================================
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator|(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1359,7 +1368,7 @@ constexpr uint128_t operator|(const uint128_t lhs, const SignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator|(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1376,18 +1385,18 @@ constexpr uint128_t operator|(const SignedInteger lhs, const uint128_t rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator|(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return {lhs.high, lhs.low | static_cast<std::uint64_t>(rhs)};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator|(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return {rhs.high, rhs.low | static_cast<std::uint64_t>(lhs)};
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator|(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     return {lhs.high | rhs.high, lhs.low | rhs.low};
 }
@@ -1396,12 +1405,12 @@ BOOST_INT128_EXPORT constexpr uint128_t operator|(const uint128_t lhs, const uin
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr uint128_t operator|(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs | static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator|(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) | rhs;
 }
@@ -1409,14 +1418,14 @@ BOOST_INT128_EXPORT constexpr uint128_t operator|(const detail::builtin_i128 lhs
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-constexpr uint128_t operator|(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-constexpr uint128_t operator|(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
@@ -1424,12 +1433,12 @@ constexpr uint128_t operator|(const T, const uint128_t) noexcept
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr uint128_t operator|(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs | static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator|(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator|(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) | rhs;
 }
@@ -1437,7 +1446,7 @@ BOOST_INT128_EXPORT constexpr uint128_t operator|(const detail::builtin_u128 lhs
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr uint128_t& uint128_t::operator|=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator|=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_unsigned_integer_v<Integer>, "Sign Conversion Error");
@@ -1446,7 +1455,7 @@ constexpr uint128_t& uint128_t::operator|=(const Integer rhs) noexcept
     *this = *this | rhs;
     return *this;
 }
-constexpr uint128_t& uint128_t::operator|=(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator|=(const uint128_t rhs) noexcept
 {
     *this = *this | rhs;
     return *this;
@@ -1455,7 +1464,7 @@ constexpr uint128_t& uint128_t::operator|=(const uint128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline uint128_t& uint128_t::operator|=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline uint128_t& uint128_t::operator|=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(!std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -1472,7 +1481,7 @@ inline uint128_t& uint128_t::operator|=(const Integer rhs) noexcept
 //=====================================
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator&(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1489,7 +1498,7 @@ constexpr uint128_t operator&(const uint128_t lhs, const SignedInteger rhs) noex
 }
 
 template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator&(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1506,18 +1515,18 @@ constexpr uint128_t operator&(const SignedInteger lhs, const uint128_t rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator&(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return {lhs.high, lhs.low & static_cast<std::uint64_t>(rhs)};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator&(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return {rhs.high, rhs.low & static_cast<std::uint64_t>(lhs)};
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator&(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     return {lhs.high & rhs.high, lhs.low & rhs.low};
 }
@@ -1526,12 +1535,12 @@ BOOST_INT128_EXPORT constexpr uint128_t operator&(const uint128_t lhs, const uin
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr uint128_t operator&(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs & static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator&(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) & rhs;
 }
@@ -1539,14 +1548,14 @@ BOOST_INT128_EXPORT constexpr uint128_t operator&(const detail::builtin_i128 lhs
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-constexpr uint128_t operator&(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-constexpr uint128_t operator&(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
@@ -1554,12 +1563,12 @@ constexpr uint128_t operator&(const T, const uint128_t) noexcept
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr uint128_t operator&(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs & static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator&(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator&(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) & rhs;
 }
@@ -1567,7 +1576,7 @@ BOOST_INT128_EXPORT constexpr uint128_t operator&(const detail::builtin_u128 lhs
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr uint128_t& uint128_t::operator&=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator&=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_unsigned_integer_v<Integer>, "Sign Conversion Error");
@@ -1577,7 +1586,7 @@ constexpr uint128_t& uint128_t::operator&=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr uint128_t& uint128_t::operator&=(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator&=(const uint128_t rhs) noexcept
 {
     *this = *this & rhs;
     return *this;
@@ -1586,7 +1595,7 @@ constexpr uint128_t& uint128_t::operator&=(const uint128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline uint128_t& uint128_t::operator&=(Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline uint128_t& uint128_t::operator&=(Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(!std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -1604,7 +1613,7 @@ inline uint128_t& uint128_t::operator&=(Integer rhs) noexcept
 //=====================================
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator^(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1621,7 +1630,7 @@ constexpr uint128_t operator^(const uint128_t lhs, const SignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator^(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -1638,18 +1647,18 @@ constexpr uint128_t operator^(const SignedInteger lhs, const uint128_t rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator^(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return {lhs.high, lhs.low ^ static_cast<std::uint64_t>(rhs)};
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator^(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return {rhs.high, rhs.low ^ static_cast<std::uint64_t>(lhs)};
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator^(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     return {lhs.high ^ rhs.high, lhs.low ^ rhs.low};
 }
@@ -1658,12 +1667,12 @@ BOOST_INT128_EXPORT constexpr uint128_t operator^(const uint128_t lhs, const uin
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr uint128_t operator^(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs ^ static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator^(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) ^ rhs;
 }
@@ -1671,14 +1680,14 @@ BOOST_INT128_EXPORT constexpr uint128_t operator^(const detail::builtin_i128 lhs
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-constexpr uint128_t operator^(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-constexpr uint128_t operator^(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
@@ -1686,12 +1695,12 @@ constexpr uint128_t operator^(const T, const uint128_t) noexcept
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr uint128_t operator^(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs ^ static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator^(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator^(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) ^ rhs;
 }
@@ -1699,7 +1708,7 @@ BOOST_INT128_EXPORT constexpr uint128_t operator^(const detail::builtin_u128 lhs
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr uint128_t& uint128_t::operator^=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator^=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_unsigned_integer_v<Integer>, "Sign Conversion Error");
@@ -1709,7 +1718,7 @@ constexpr uint128_t& uint128_t::operator^=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr uint128_t& uint128_t::operator^=(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator^=(const uint128_t rhs) noexcept
 {
     *this = *this ^ rhs;
     return *this;
@@ -1718,7 +1727,7 @@ constexpr uint128_t& uint128_t::operator^=(const uint128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline uint128_t& uint128_t::operator^=(Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline uint128_t& uint128_t::operator^=(Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(!std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -1737,13 +1746,23 @@ inline uint128_t& uint128_t::operator^=(Integer rhs) noexcept
 namespace detail {
 
 template <typename Integer>
-constexpr uint128_t default_ls_impl(const uint128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t default_ls_impl(const uint128_t lhs, const Integer rhs) noexcept
 {
     static_assert(std::is_integral<Integer>::value, "Needs to be a builtin type");
 
-    if (rhs < 0 || rhs >= 128)
+    BOOST_INT128_IF_CONSTEXPR (std::numeric_limits<Integer>::is_signed)
     {
-        return {0, 0};
+        if (rhs < 0 || rhs >= 128)
+        {
+            return {0, 0};
+        }
+    }
+    else
+    {
+        if (rhs >= 128)
+        {
+            return {0, 0};
+        }
     }
 
     if (rhs == 0)
@@ -1768,12 +1787,23 @@ constexpr uint128_t default_ls_impl(const uint128_t lhs, const Integer rhs) noex
 }
 
 template <typename T>
-uint128_t intrinsic_ls_impl(const uint128_t lhs, const T rhs) noexcept
+BOOST_INT128_HOST_DEVICE uint128_t intrinsic_ls_impl(const uint128_t lhs, const T rhs) noexcept
 {
-    if (BOOST_INT128_UNLIKELY(rhs >= 128 || rhs < 0))
+    BOOST_INT128_IF_CONSTEXPR (std::numeric_limits<T>::is_signed)
     {
-        return {0, 0};
+        if (BOOST_INT128_UNLIKELY(rhs >= 128 || rhs < 0))
+        {
+            return {0, 0};
+        }
+    }
+    else
+    {
+        if (BOOST_INT128_UNLIKELY(rhs >= 128))
+        {
+            return {0, 0};
+        }
     }
+
     if (BOOST_INT128_UNLIKELY(rhs == 0))
     {
         return lhs;
@@ -1829,7 +1859,7 @@ uint128_t intrinsic_ls_impl(const uint128_t lhs, const T rhs) noexcept
 } // namespace detail
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_INTEGER_CONCEPT>
-constexpr uint128_t operator<<(const uint128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator<<(const uint128_t lhs, const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_NO_CONSTEVAL_DETECTION
 
@@ -1851,11 +1881,11 @@ constexpr uint128_t operator<<(const uint128_t lhs, const Integer rhs) noexcept
 
 // A number of different overloads to ensure that we return the same type as the builtins would
 
-BOOST_INT128_EXPORT constexpr uint128_t operator<<(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator<<(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     if (rhs.high > UINT64_C(0) || rhs.low >= UINT64_C(128))
     {
-        return 0;
+        return uint128_t{0};
     }
 
     return lhs << rhs.low;
@@ -1863,7 +1893,7 @@ BOOST_INT128_EXPORT constexpr uint128_t operator<<(const uint128_t lhs, const ui
 
 #ifdef BOOST_INT128_HAS_INT128
 
-BOOST_INT128_EXPORT constexpr detail::builtin_u128 operator<<(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr detail::builtin_u128 operator<<(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(detail::builtin_u128 ) * 8};
 
@@ -1875,7 +1905,7 @@ BOOST_INT128_EXPORT constexpr detail::builtin_u128 operator<<(const detail::buil
     return lhs << rhs.low;
 }
 
-BOOST_INT128_EXPORT constexpr detail::builtin_i128 operator<<(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr detail::builtin_i128 operator<<(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(detail::builtin_u128) * 8};
 
@@ -1890,7 +1920,7 @@ BOOST_INT128_EXPORT constexpr detail::builtin_i128 operator<<(const detail::buil
 #endif
 
 BOOST_INT128_EXPORT template <typename SignedInteger, std::enable_if_t<detail::is_signed_integer_v<SignedInteger> && (sizeof(SignedInteger) * 8 <= 16), bool> = true>
-constexpr int operator<<(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int operator<<(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(SignedInteger) * 8};
 
@@ -1903,7 +1933,7 @@ constexpr int operator<<(const SignedInteger lhs, const uint128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <typename UnsignedInteger, std::enable_if_t<detail::is_unsigned_integer_v<UnsignedInteger> && (sizeof(UnsignedInteger) * 8 <= 16), bool> = true>
-constexpr unsigned int operator<<(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr unsigned int operator<<(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     constexpr auto bit_width {sizeof(UnsignedInteger) * 8};
 
@@ -1916,13 +1946,13 @@ constexpr unsigned int operator<<(const UnsignedInteger lhs, const uint128_t rhs
 }
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr uint128_t& uint128_t::operator<<=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator<<=(const Integer rhs) noexcept
 {
     *this = *this << rhs;
     return *this;
 }
 
-constexpr uint128_t& uint128_t::operator<<=(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator<<=(const uint128_t rhs) noexcept
 {
     *this = *this << rhs;
     return *this;
@@ -1931,7 +1961,7 @@ constexpr uint128_t& uint128_t::operator<<=(const uint128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline uint128_t& uint128_t::operator<<=(Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline uint128_t& uint128_t::operator<<=(Integer rhs) noexcept
 {
     *this = *this << rhs;
     return *this;
@@ -1946,11 +1976,21 @@ inline uint128_t& uint128_t::operator<<=(Integer rhs) noexcept
 namespace detail {
 
 template <typename Integer>
-constexpr uint128_t default_rs_impl(const uint128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t default_rs_impl(const uint128_t lhs, const Integer rhs) noexcept
 {
-    if (rhs < 0 || rhs >= 128)
+    BOOST_INT128_IF_CONSTEXPR (std::numeric_limits<Integer>::is_signed)
     {
-        return {0, 0};
+        if (rhs < 0 || rhs >= 128)
+        {
+            return {0, 0};
+        }
+    }
+    else
+    {
+        if (rhs >= 128)
+        {
+            return {0, 0};
+        }
     }
 
     if (rhs == 0)
@@ -1975,12 +2015,23 @@ constexpr uint128_t default_rs_impl(const uint128_t lhs, const Integer rhs) noex
 }
 
 template <typename Integer>
-uint128_t intrinsic_rs_impl(const uint128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE uint128_t intrinsic_rs_impl(const uint128_t lhs, const Integer rhs) noexcept
 {
-    if (BOOST_INT128_UNLIKELY(rhs >= 128 || rhs < 0))
+    BOOST_INT128_IF_CONSTEXPR (std::numeric_limits<Integer>::is_signed)
     {
-        return {0, 0};
+        if (BOOST_INT128_UNLIKELY(rhs >= 128 || rhs < 0))
+        {
+            return {0, 0};
+        }
+    }
+    else
+    {
+        if (BOOST_INT128_UNLIKELY(rhs >= 128))
+        {
+            return {0, 0};
+        }
     }
+
     if (BOOST_INT128_UNLIKELY(rhs == 0))
     {
         return lhs;
@@ -2033,7 +2084,7 @@ uint128_t intrinsic_rs_impl(const uint128_t lhs, const Integer rhs) noexcept
 } // namespace detail
 
 BOOST_INT128_EXPORT template <typename Integer, std::enable_if_t<std::is_integral<Integer>::value, bool> = true>
-constexpr uint128_t operator>>(const uint128_t lhs, const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator>>(const uint128_t lhs, const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_NO_CONSTEVAL_DETECTION
 
@@ -2053,11 +2104,11 @@ constexpr uint128_t operator>>(const uint128_t lhs, const Integer rhs) noexcept
     #endif
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator>>(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator>>(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     if (rhs.high > UINT64_C(0) || rhs.low >= UINT64_C(128))
     {
-        return 0;
+        return uint128_t{0};
     }
 
     return lhs >> rhs.low;
@@ -2065,7 +2116,7 @@ BOOST_INT128_EXPORT constexpr uint128_t operator>>(const uint128_t lhs, const ui
 
 #ifdef BOOST_INT128_HAS_INT128
 
-BOOST_INT128_EXPORT constexpr detail::builtin_u128 operator>>(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr detail::builtin_u128 operator>>(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     constexpr auto bit_width = sizeof(detail::builtin_u128) * 8;
 
@@ -2077,7 +2128,7 @@ BOOST_INT128_EXPORT constexpr detail::builtin_u128 operator>>(const detail::buil
     return lhs >> rhs.low;
 }
 
-BOOST_INT128_EXPORT constexpr detail::builtin_i128 operator>>(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr detail::builtin_i128 operator>>(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     constexpr auto bit_width = sizeof(detail::builtin_i128) * 8;
 
@@ -2092,7 +2143,7 @@ BOOST_INT128_EXPORT constexpr detail::builtin_i128 operator>>(const detail::buil
 #endif
 
 BOOST_INT128_EXPORT template <typename SignedInteger, std::enable_if_t<detail::is_signed_integer_v<SignedInteger> && (sizeof(SignedInteger) * 8 <= 16), bool> = true>
-constexpr int operator>>(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int operator>>(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     constexpr auto bit_width = sizeof(SignedInteger) * 8;
 
@@ -2105,7 +2156,7 @@ constexpr int operator>>(const SignedInteger lhs, const uint128_t rhs) noexcept
 }
 
 BOOST_INT128_EXPORT template <typename UnsignedInteger, std::enable_if_t<detail::is_unsigned_integer_v<UnsignedInteger> && (sizeof(UnsignedInteger) * 8 <= 16), bool> = true>
-constexpr unsigned operator>>(UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr unsigned operator>>(UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     constexpr auto bit_width = sizeof(UnsignedInteger) * 8;
 
@@ -2118,13 +2169,13 @@ constexpr unsigned operator>>(UnsignedInteger lhs, const uint128_t rhs) noexcept
 }
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr uint128_t& uint128_t::operator>>=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator>>=(const Integer rhs) noexcept
 {
     *this = *this >> rhs;
     return *this;
 }
 
-constexpr uint128_t& uint128_t::operator>>=(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator>>=(const uint128_t rhs) noexcept
 {
     *this = *this >> rhs;
     return *this;
@@ -2133,7 +2184,7 @@ constexpr uint128_t& uint128_t::operator>>=(const uint128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline uint128_t& uint128_t::operator>>=(Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline uint128_t& uint128_t::operator>>=(Integer rhs) noexcept
 {
     *this = *this >> rhs;
     return *this;
@@ -2145,7 +2196,7 @@ inline uint128_t& uint128_t::operator>>=(Integer rhs) noexcept
 // Increment Operator
 //=====================================
 
-constexpr uint128_t& uint128_t::operator++() noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator++() noexcept
 {
     if (++low == UINT64_C(0))
     {
@@ -2155,7 +2206,7 @@ constexpr uint128_t& uint128_t::operator++() noexcept
     return *this;
 }
 
-constexpr uint128_t uint128_t::operator++(int) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t uint128_t::operator++(int) noexcept
 {
     const auto temp {*this};
     ++(*this);
@@ -2166,7 +2217,7 @@ constexpr uint128_t uint128_t::operator++(int) noexcept
 // Decrement Operator
 //=====================================
 
-constexpr uint128_t& uint128_t::operator--() noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator--() noexcept
 {
     if (--low == UINT64_MAX)
     {
@@ -2176,7 +2227,7 @@ constexpr uint128_t& uint128_t::operator--() noexcept
     return *this;
 }
 
-constexpr uint128_t uint128_t::operator--(int) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t uint128_t::operator--(int) noexcept
 {
     const auto temp {*this};
     --(*this);
@@ -2189,7 +2240,7 @@ constexpr uint128_t uint128_t::operator--(int) noexcept
 
 namespace impl {
 
-BOOST_INT128_FORCE_INLINE constexpr uint128_t default_add(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr uint128_t default_add(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     #if defined(BOOST_INT128_HAS_BUILTIN_ADD_OVERFLOW) && (defined(__i386__) || (defined(__aarch64__) && !defined(__APPLE__)) || defined(__arm__) || (defined(__s390__) || defined(__s390x__)))
 
@@ -2212,7 +2263,7 @@ BOOST_INT128_FORCE_INLINE constexpr uint128_t default_add(const uint128_t lhs, c
     #endif
 }
 
-BOOST_INT128_FORCE_INLINE constexpr uint128_t default_add(const uint128_t lhs, const std::uint64_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr uint128_t default_add(const uint128_t lhs, const std::uint64_t rhs) noexcept
 {
     #if defined(BOOST_INT128_HAS_BUILTIN_ADD_OVERFLOW) && (defined(__i386__) || (defined(__aarch64__) && !defined(__APPLE__)) || defined(__arm__) || (defined(__s390__) || defined(__s390x__)))
 
@@ -2235,7 +2286,7 @@ BOOST_INT128_FORCE_INLINE constexpr uint128_t default_add(const uint128_t lhs, c
     #endif
 }
 
-BOOST_INT128_FORCE_INLINE constexpr uint128_t default_sub(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr uint128_t default_sub(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     #if defined(BOOST_INT128_HAS_BUILTIN_SUB_OVERFLOW) && (defined(__i386__) || defined(__arm__) || (defined(__s390__) || defined(__s390x__)))
 
@@ -2263,7 +2314,7 @@ BOOST_INT128_FORCE_INLINE constexpr uint128_t default_sub(const uint128_t lhs, c
     #endif
 }
 
-BOOST_INT128_FORCE_INLINE constexpr uint128_t default_sub(const uint128_t lhs, const std::uint64_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr uint128_t default_sub(const uint128_t lhs, const std::uint64_t rhs) noexcept
 {
     #if defined(BOOST_INT128_HAS_BUILTIN_SUB_OVERFLOW) && (defined(__i386__) || (defined(__aarch64__) && !defined(__APPLE__)) || defined(__arm__) || (defined(__s390__) || defined(__s390x__)))
 
@@ -2295,7 +2346,7 @@ BOOST_INT128_FORCE_INLINE constexpr uint128_t default_sub(const uint128_t lhs, c
 #endif
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator+(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2313,7 +2364,7 @@ constexpr uint128_t operator+(const uint128_t lhs, const SignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator+(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2335,18 +2386,18 @@ constexpr uint128_t operator+(const SignedInteger lhs, const uint128_t rhs) noex
 #endif
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator+(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return impl::default_add(lhs, static_cast<std::uint64_t>(rhs));
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator+(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return impl::default_add(rhs, static_cast<std::uint64_t>(lhs));
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator+(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator+(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     return impl::default_add(lhs, rhs);
 }
@@ -2355,12 +2406,12 @@ BOOST_INT128_EXPORT constexpr uint128_t operator+(const uint128_t lhs, const uin
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return impl::default_add(lhs, static_cast<uint128_t>(rhs));
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return impl::default_add(static_cast<uint128_t>(lhs), rhs);
 }
@@ -2368,14 +2419,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const det
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
@@ -2383,12 +2434,12 @@ BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const T, const uint128_t) noe
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return impl::default_add(lhs, static_cast<uint128_t>(rhs));
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return impl::default_add(static_cast<uint128_t>(lhs), rhs);
 }
@@ -2396,7 +2447,7 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator+(const det
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr uint128_t& uint128_t::operator+=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator+=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_unsigned_integer_v<Integer>, "Sign Conversion Error");
@@ -2406,7 +2457,7 @@ constexpr uint128_t& uint128_t::operator+=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr uint128_t& uint128_t::operator+=(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator+=(const uint128_t rhs) noexcept
 {
     *this = *this + rhs;
     return *this;
@@ -2415,7 +2466,7 @@ constexpr uint128_t& uint128_t::operator+=(const uint128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline uint128_t& uint128_t::operator+=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline uint128_t& uint128_t::operator+=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(!std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -2438,7 +2489,7 @@ inline uint128_t& uint128_t::operator+=(const Integer rhs) noexcept
 #endif
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator-(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2456,7 +2507,7 @@ constexpr uint128_t operator-(const uint128_t lhs, const SignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator-(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2478,18 +2529,18 @@ constexpr uint128_t operator-(const SignedInteger lhs, const uint128_t rhs) noex
 #endif
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator-(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return impl::default_sub(lhs, static_cast<std::uint64_t>(rhs));
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator-(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return impl::default_add(-rhs, static_cast<std::uint64_t>(lhs));
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator-(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator-(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     return impl::default_sub(lhs, rhs);
 }
@@ -2498,12 +2549,12 @@ BOOST_INT128_EXPORT constexpr uint128_t operator-(const uint128_t lhs, const uin
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs - static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) - rhs;
 }
@@ -2511,14 +2562,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const det
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
@@ -2526,12 +2577,12 @@ BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const T, const uint128_t) noe
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs - static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) - rhs;
 }
@@ -2539,7 +2590,7 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator-(const det
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr uint128_t& uint128_t::operator-=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator-=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_unsigned_integer_v<Integer>, "Sign Conversion Error");
@@ -2549,7 +2600,7 @@ constexpr uint128_t& uint128_t::operator-=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr uint128_t& uint128_t::operator-=(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator-=(const uint128_t rhs) noexcept
 {
     *this = *this - rhs;
     return *this;
@@ -2558,7 +2609,7 @@ constexpr uint128_t& uint128_t::operator-=(const uint128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline uint128_t& uint128_t::operator-=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline uint128_t& uint128_t::operator-=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(!std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -2583,7 +2634,7 @@ namespace detail {
 
 #if defined(_M_AMD64) && !defined(__GNUC__)
 
-BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     uint128_t result {};
     result.low = _umul128(lhs.low, rhs.low, &result.high);
@@ -2593,7 +2644,7 @@ BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const uint128_
     return result;
 }
 
-BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uint64_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uint64_t rhs) noexcept
 {
     uint128_t result {};
     result.low = _umul128(lhs.low, rhs, &result.high);
@@ -2602,7 +2653,7 @@ BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uin
     return result;
 }
 
-BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uint32_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uint32_t rhs) noexcept
 {
     uint128_t result {};
     result.low = _umul128(lhs.low, static_cast<std::uint64_t>(rhs), &result.high);
@@ -2613,7 +2664,7 @@ BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uin
 
 #elif defined(_M_ARM64)
 
-BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     const auto low_low{lhs.low * rhs.low};
     const auto high_low_low{__umulh(lhs.low, rhs.low)};
@@ -2626,7 +2677,7 @@ BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const uint128_
     return {high, low_low};
 }
 
-BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uint64_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uint64_t rhs) noexcept
 {
     const auto low{lhs.low * rhs};
     const auto high{__umulh(lhs.low, rhs) + (lhs.high * rhs)};
@@ -2634,7 +2685,7 @@ BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uin
     return {high, low};
 }
 
-BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uint32_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uint32_t rhs) noexcept
 {
     const auto low{lhs.low * rhs};
     const auto high{__umulh(lhs.low, static_cast<std::uint64_t>(rhs)) + (lhs.high * rhs)};
@@ -2645,7 +2696,7 @@ BOOST_INT128_FORCE_INLINE uint128_t msvc_mul(const uint128_t lhs, const std::uin
 #endif // MSVC implementations
 
 template <typename UnsignedInteger>
-BOOST_INT128_FORCE_INLINE constexpr uint128_t default_mul(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr uint128_t default_mul(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     #if (defined(__aarch64__) || defined(__x86_64__) || defined(__PPC__) || defined(__powerpc__)) && defined(__GNUC__) && defined(BOOST_INT128_HAS_INT128)
 
@@ -2722,7 +2773,7 @@ BOOST_INT128_FORCE_INLINE constexpr uint128_t default_mul(const uint128_t lhs, c
 #endif
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator*(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2744,7 +2795,7 @@ constexpr uint128_t operator*(const uint128_t lhs, const SignedInteger rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator*(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2766,7 +2817,7 @@ constexpr uint128_t operator*(const SignedInteger lhs, const uint128_t rhs) noex
 }
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator*(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     return detail::default_mul(lhs, static_cast<std::uint64_t>(rhs));
 }
@@ -2776,12 +2827,12 @@ constexpr uint128_t operator*(const uint128_t lhs, const UnsignedInteger rhs) no
 #endif
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator*(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     return detail::default_mul(rhs, static_cast<std::uint64_t>(lhs));
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator*(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     return detail::default_mul(lhs, rhs);
 }
@@ -2790,7 +2841,7 @@ BOOST_INT128_EXPORT constexpr uint128_t operator*(const uint128_t lhs, const uin
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr uint128_t operator*(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     const auto abs_rhs {rhs < 0 ? -static_cast<uint128_t>(rhs) : static_cast<uint128_t>(rhs)};
     const auto res {lhs * abs_rhs};
@@ -2798,7 +2849,7 @@ BOOST_INT128_EXPORT constexpr uint128_t operator*(const uint128_t lhs, const det
     return rhs < 0 ? -res : res;
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator*(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     const auto abs_lhs {lhs < 0 ? -static_cast<uint128_t>(lhs) : static_cast<uint128_t>(lhs)};
     const auto res {abs_lhs * rhs};
@@ -2809,14 +2860,14 @@ BOOST_INT128_EXPORT constexpr uint128_t operator*(const detail::builtin_i128 lhs
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-constexpr uint128_t operator*(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-constexpr uint128_t operator*(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
@@ -2824,12 +2875,12 @@ constexpr uint128_t operator*(const T, const uint128_t) noexcept
 
 #endif // BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT constexpr uint128_t operator*(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs * static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator*(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator*(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) * rhs;
 }
@@ -2837,7 +2888,7 @@ BOOST_INT128_EXPORT constexpr uint128_t operator*(const detail::builtin_u128 lhs
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr uint128_t& uint128_t::operator*=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator*=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_unsigned_integer_v<Integer>, "Sign Conversion Error");
@@ -2847,7 +2898,7 @@ constexpr uint128_t& uint128_t::operator*=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr uint128_t& uint128_t::operator*=(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator*=(const uint128_t rhs) noexcept
 {
     *this = *this * rhs;
     return *this;
@@ -2856,7 +2907,7 @@ constexpr uint128_t& uint128_t::operator*=(const uint128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline uint128_t& uint128_t::operator*=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline uint128_t& uint128_t::operator*=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(!std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -2874,21 +2925,21 @@ inline uint128_t& uint128_t::operator*=(const Integer rhs) noexcept
 
 // For div we need forward declarations since we mix and match the arguments
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator/(uint128_t lhs, SignedInteger rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(uint128_t lhs, SignedInteger rhs) noexcept;
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator/(SignedInteger lhs, uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(SignedInteger lhs, uint128_t rhs) noexcept;
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator/(uint128_t lhs, UnsignedInteger rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(uint128_t lhs, UnsignedInteger rhs) noexcept;
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator/(UnsignedInteger lhs, uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(UnsignedInteger lhs, uint128_t rhs) noexcept;
 
-BOOST_INT128_EXPORT constexpr uint128_t operator/(uint128_t lhs, uint128_t rhs) noexcept;
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(uint128_t lhs, uint128_t rhs) noexcept;
 
 template <BOOST_INT128_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator/(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2906,7 +2957,7 @@ constexpr uint128_t operator/(const uint128_t lhs, const SignedInteger rhs) noex
 }
 
 template <BOOST_INT128_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator/(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -2924,7 +2975,7 @@ constexpr uint128_t operator/(const SignedInteger lhs, const uint128_t rhs) noex
 }
 
 template <BOOST_INT128_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator/(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     using eval_type = detail::evaluation_type_t<UnsignedInteger>;
 
@@ -2946,7 +2997,7 @@ constexpr uint128_t operator/(const uint128_t lhs, const UnsignedInteger rhs) no
 }
 
 template <BOOST_INT128_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator/(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     using eval_type = detail::evaluation_type_t<UnsignedInteger>;
 
@@ -2963,7 +3014,7 @@ constexpr uint128_t operator/(const UnsignedInteger lhs, const uint128_t rhs) no
     return {0, static_cast<eval_type>(lhs) / rhs.low};
 }
 
-constexpr uint128_t operator/(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     if (BOOST_INT128_UNLIKELY(rhs == 0U))
     {
@@ -3004,24 +3055,24 @@ constexpr uint128_t operator/(const uint128_t lhs, const uint128_t rhs) noexcept
 
 #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs / static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) / rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs / static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) / rhs;
 }
@@ -3029,14 +3080,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const det
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
@@ -3047,7 +3098,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator/(const T, const uint128_t) noe
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr uint128_t& uint128_t::operator/=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator/=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_unsigned_integer_v<Integer>, "Sign Conversion Error");
@@ -3057,7 +3108,7 @@ constexpr uint128_t& uint128_t::operator/=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr uint128_t& uint128_t::operator/=(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator/=(const uint128_t rhs) noexcept
 {
     *this = *this / rhs;
     return *this;
@@ -3066,7 +3117,7 @@ constexpr uint128_t& uint128_t::operator/=(const uint128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline uint128_t& uint128_t::operator/=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline uint128_t& uint128_t::operator/=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(!std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -3084,21 +3135,21 @@ inline uint128_t& uint128_t::operator/=(const Integer rhs) noexcept
 
 // For div we need forward declarations since we mix and match the arguments
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator%(uint128_t lhs, SignedInteger rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(uint128_t lhs, SignedInteger rhs) noexcept;
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator%(SignedInteger lhs, uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(SignedInteger lhs, uint128_t rhs) noexcept;
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator%(uint128_t lhs, UnsignedInteger rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(uint128_t lhs, UnsignedInteger rhs) noexcept;
 
 BOOST_INT128_EXPORT template <BOOST_INT128_DEFAULTED_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator%(UnsignedInteger lhs, uint128_t rhs) noexcept;
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(UnsignedInteger lhs, uint128_t rhs) noexcept;
 
-BOOST_INT128_EXPORT constexpr uint128_t operator%(uint128_t lhs, uint128_t rhs) noexcept;
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(uint128_t lhs, uint128_t rhs) noexcept;
 
 template <BOOST_INT128_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator%(const uint128_t lhs, const SignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const uint128_t lhs, const SignedInteger rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -3116,7 +3167,7 @@ constexpr uint128_t operator%(const uint128_t lhs, const SignedInteger rhs) noex
 }
 
 template <BOOST_INT128_SIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator%(const SignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const SignedInteger lhs, const uint128_t rhs) noexcept
 {
     #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
@@ -3134,7 +3185,7 @@ constexpr uint128_t operator%(const SignedInteger lhs, const uint128_t rhs) noex
 }
 
 template <BOOST_INT128_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator%(const uint128_t lhs, const UnsignedInteger rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const uint128_t lhs, const UnsignedInteger rhs) noexcept
 {
     using eval_type = detail::evaluation_type_t<UnsignedInteger>;
 
@@ -3159,7 +3210,7 @@ constexpr uint128_t operator%(const uint128_t lhs, const UnsignedInteger rhs) no
 }
 
 template <BOOST_INT128_UNSIGNED_INTEGER_CONCEPT>
-constexpr uint128_t operator%(const UnsignedInteger lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const UnsignedInteger lhs, const uint128_t rhs) noexcept
 {
     using eval_type = detail::evaluation_type_t<UnsignedInteger>;
 
@@ -3175,7 +3226,7 @@ constexpr uint128_t operator%(const UnsignedInteger lhs, const uint128_t rhs) no
     return {0, static_cast<eval_type>(lhs) % rhs.low};
 }
 
-constexpr uint128_t operator%(const uint128_t lhs, const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const uint128_t lhs, const uint128_t rhs) noexcept
 {
     if (BOOST_INT128_UNLIKELY(rhs == 0U))
     {
@@ -3218,24 +3269,24 @@ constexpr uint128_t operator%(const uint128_t lhs, const uint128_t rhs) noexcept
 
 #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INT128)
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const uint128_t lhs, const detail::builtin_u128 rhs) noexcept
 {
     return lhs % static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const detail::builtin_u128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) % rhs;
 }
 
 #ifdef BOOST_INT128_ALLOW_SIGN_CONVERSION
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const uint128_t lhs, const detail::builtin_i128 rhs) noexcept
 {
     return lhs % static_cast<uint128_t>(rhs);
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const detail::builtin_i128 lhs, const uint128_t rhs) noexcept
 {
     return static_cast<uint128_t>(lhs) % rhs;
 }
@@ -3243,14 +3294,14 @@ BOOST_INT128_EXPORT BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const det
 #else
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const uint128_t, const T) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const uint128_t, const T) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
 }
 
 BOOST_INT128_EXPORT template <typename T, std::enable_if_t<std::is_same<T, detail::builtin_i128>::value, bool> = true>
-BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const T, const uint128_t) noexcept
+BOOST_INT128_HOST_DEVICE BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const T, const uint128_t) noexcept
 {
     static_assert(detail::is_unsigned_integer_v<T>, "Sign Conversion Error");
     return {0, 0};
@@ -3261,7 +3312,7 @@ BOOST_INT128_BUILTIN_CONSTEXPR uint128_t operator%(const T, const uint128_t) noe
 #endif // BOOST_INT128_HAS_INT128
 
 template <BOOST_INT128_INTEGER_CONCEPT>
-constexpr uint128_t& uint128_t::operator%=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator%=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(detail::is_unsigned_integer_v<Integer>, "Sign Conversion Error");
@@ -3271,7 +3322,7 @@ constexpr uint128_t& uint128_t::operator%=(const Integer rhs) noexcept
     return *this;
 }
 
-constexpr uint128_t& uint128_t::operator%=(const uint128_t rhs) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t& uint128_t::operator%=(const uint128_t rhs) noexcept
 {
     *this = *this % rhs;
     return *this;
@@ -3280,7 +3331,7 @@ constexpr uint128_t& uint128_t::operator%=(const uint128_t rhs) noexcept
 #ifdef BOOST_INT128_HAS_MSVC_INT128
 
 template <BOOST_INT128_128BIT_INTEGER_CONCEPT>
-inline uint128_t& uint128_t::operator%=(const Integer rhs) noexcept
+BOOST_INT128_HOST_DEVICE inline uint128_t& uint128_t::operator%=(const Integer rhs) noexcept
 {
     #ifndef BOOST_INT128_ALLOW_SIGN_CONVERSION
     static_assert(!std::numeric_limits<Integer>::is_signed, "Sign Conversion Error");
@@ -3342,15 +3393,15 @@ class numeric_limits_impl_u128
     static constexpr bool tinyness_before = false;
 
     // Member functions
-    static constexpr auto (min)        () -> boost::int128::uint128_t { return {0, 0}; }
-    static constexpr auto lowest       () -> boost::int128::uint128_t { return {0, 0}; }
-    static constexpr auto (max)        () -> boost::int128::uint128_t { return {UINT64_MAX, UINT64_MAX}; }
-    static constexpr auto epsilon      () -> boost::int128::uint128_t { return {0, 0}; }
-    static constexpr auto round_error  () -> boost::int128::uint128_t { return {0, 0}; }
-    static constexpr auto infinity     () -> boost::int128::uint128_t { return {0, 0}; }
-    static constexpr auto quiet_NaN    () -> boost::int128::uint128_t { return {0, 0}; }
-    static constexpr auto signaling_NaN() -> boost::int128::uint128_t { return {0, 0}; }
-    static constexpr auto denorm_min   () -> boost::int128::uint128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto (min)        () -> boost::int128::uint128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto lowest       () -> boost::int128::uint128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto (max)        () -> boost::int128::uint128_t { return {UINT64_MAX, UINT64_MAX}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto epsilon      () -> boost::int128::uint128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto round_error  () -> boost::int128::uint128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto infinity     () -> boost::int128::uint128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto quiet_NaN    () -> boost::int128::uint128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto signaling_NaN() -> boost::int128::uint128_t { return {0, 0}; }
+    BOOST_INT128_HOST_DEVICE static constexpr auto denorm_min   () -> boost::int128::uint128_t { return {0, 0}; }
 };
 
 #if !defined(__cpp_inline_variables) || __cpp_inline_variables < 201606L
diff --git a/include/boost/int128/detail/utilities.hpp b/include/boost/int128/detail/utilities.hpp
index 019d76c0..cc3ed445 100644
--- a/include/boost/int128/detail/utilities.hpp
+++ b/include/boost/int128/detail/utilities.hpp
@@ -5,6 +5,8 @@
 #ifndef BOOST_INT128_DETAIL_UTILITIES_HPP
 #define BOOST_INT128_DETAIL_UTILITIES_HPP
 
+#include <boost/int128/detail/config.hpp>
+
 #ifndef BOOST_INT128_BUILD_MODULE
 
 #include <cstddef>
@@ -16,7 +18,7 @@ namespace int128 {
 namespace detail {
 
 template <typename T>
-constexpr std::size_t strlen(const T* str) noexcept
+BOOST_INT128_HOST_DEVICE constexpr std::size_t strlen(const T* str) noexcept
 {
     std::size_t i {};
     while (*str != '\0')
diff --git a/include/boost/int128/fmt_format.hpp b/include/boost/int128/fmt_format.hpp
index 6df48626..e328e638 100644
--- a/include/boost/int128/fmt_format.hpp
+++ b/include/boost/int128/fmt_format.hpp
@@ -280,7 +280,7 @@ struct formatter
         }
         else
         {
-            abs_v = v;
+            abs_v = static_cast<uint128_t>(v);
         }
 
         const auto end = detail::mini_to_chars(buffer, abs_v, base, is_upper);
diff --git a/include/boost/int128/format.hpp b/include/boost/int128/format.hpp
index d6e235ad..16289415 100644
--- a/include/boost/int128/format.hpp
+++ b/include/boost/int128/format.hpp
@@ -269,7 +269,7 @@ struct formatter<T>
         }
         else
         {
-            abs_v = v;
+            abs_v = static_cast<boost::int128::uint128_t>(v);
         }
 
         const auto end = boost::int128::detail::mini_to_chars(buffer, abs_v, base, is_upper);
diff --git a/include/boost/int128/iostream.hpp b/include/boost/int128/iostream.hpp
index 9d87d0f8..d6c4c7c0 100644
--- a/include/boost/int128/iostream.hpp
+++ b/include/boost/int128/iostream.hpp
@@ -124,14 +124,17 @@ auto operator<<(std::basic_ostream<charT, traits>& os, const LibIntegerType& v)
 
     auto first {detail::mini_to_chars(buffer, v, base, uppercase)};
 
-    if (base == 8)
+    if (flags & std::ios_base::showbase)
     {
-        *--first = '0';
-    }
-    else if (base == 16)
-    {
-        *--first = uppercase ? 'X' : 'x';
-        *--first = '0';
+        if (base == 8)
+        {
+            *--first = '0';
+        }
+        else if (base == 16)
+        {
+            *--first = uppercase ? 'X' : 'x';
+            *--first = '0';
+        }
     }
 
     BOOST_INT128_IF_CONSTEXPR (!std::is_same<charT, char>::value)
diff --git a/include/boost/int128/literals.hpp b/include/boost/int128/literals.hpp
index a8c99e5e..9497f20b 100644
--- a/include/boost/int128/literals.hpp
+++ b/include/boost/int128/literals.hpp
@@ -15,76 +15,76 @@ namespace boost {
 namespace int128 {
 namespace literals {
 
-BOOST_INT128_EXPORT constexpr uint128_t operator ""_u128(const char* str) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_u128(const char* str) noexcept
 {
     uint128_t result {};
     detail::from_chars(str, str + detail::strlen(str), result);
     return result;
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator ""_U128(const char* str) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_U128(const char* str) noexcept
 {
     uint128_t result {};
     detail::from_chars(str, str + detail::strlen(str), result);
     return result;
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator ""_u128(const char* str, std::size_t len) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_u128(const char* str, std::size_t len) noexcept
 {
     uint128_t result {};
     detail::from_chars(str, str + len, result);
     return result;
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator ""_U128(const char* str, std::size_t len) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_U128(const char* str, std::size_t len) noexcept
 {
     uint128_t result {};
     detail::from_chars(str, str + len, result);
     return result;
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator ""_u128(unsigned long long v) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_u128(unsigned long long v) noexcept
 {
     return uint128_t{v};
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t operator ""_U128(unsigned long long v) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t operator ""_U128(unsigned long long v) noexcept
 {
     return uint128_t{v};
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator ""_i128(const char* str) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_i128(const char* str) noexcept
 {
     int128_t result {};
     detail::from_chars(str, str + detail::strlen(str), result);
     return result;
 }
 
-constexpr int128_t operator ""_I128(const char* str) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_I128(const char* str) noexcept
 {
     int128_t result {};
     detail::from_chars(str, str + detail::strlen(str), result);
     return result;
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator ""_i128(unsigned long long v) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_i128(unsigned long long v) noexcept
 {
     return int128_t{v};
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator ""_I128(unsigned long long v) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_I128(unsigned long long v) noexcept
 {
     return int128_t{v};
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator ""_i128(const char* str, std::size_t len) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_i128(const char* str, std::size_t len) noexcept
 {
     int128_t result {};
     detail::from_chars(str, str + len, result);
     return result;
 }
 
-BOOST_INT128_EXPORT constexpr int128_t operator ""_I128(const char* str, std::size_t len) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator ""_I128(const char* str, std::size_t len) noexcept
 {
     int128_t result {};
     detail::from_chars(str, str + len, result);
diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp
index fa22f7c5..0699cd8b 100644
--- a/include/boost/int128/numeric.hpp
+++ b/include/boost/int128/numeric.hpp
@@ -52,7 +52,7 @@ BOOST_INT128_INLINE_CONSTEXPR bool is_reduced_integer_v {reduced_integers<Intege
 
 } // namespace detail
 
-BOOST_INT128_EXPORT constexpr uint128_t add_sat(const uint128_t x, const uint128_t y) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t add_sat(const uint128_t x, const uint128_t y) noexcept
 {
     const auto z {x + y};
 
@@ -64,7 +64,7 @@ BOOST_INT128_EXPORT constexpr uint128_t add_sat(const uint128_t x, const uint128
     return z;
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t sub_sat(const uint128_t x, const uint128_t y) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t sub_sat(const uint128_t x, const uint128_t y) noexcept
 {
     const auto z {x - y};
 
@@ -76,72 +76,67 @@ BOOST_INT128_EXPORT constexpr uint128_t sub_sat(const uint128_t x, const uint128
     return z;
 }
 
-BOOST_INT128_EXPORT constexpr int128_t add_sat(int128_t x, int128_t y) noexcept;
-BOOST_INT128_EXPORT constexpr int128_t sub_sat(int128_t x, int128_t y) noexcept;
-
 #ifdef _MSC_VER
 #  pragma warning(push)
 #  pragma warning(disable : 4307) // Addition Overflow
 #  pragma warning(disable : 4146) // Unary minus applied to unsigned type
 #endif
 
-constexpr int128_t add_sat(const int128_t x, const int128_t y) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(const int128_t x, const int128_t y) noexcept
 {
-    if (x >= 0 && y >= 0)
-    {
-        constexpr auto max_value {static_cast<uint128_t>((std::numeric_limits<int128_t>::max)())};
-        const auto big_x {static_cast<uint128_t>(x)};
-        const auto big_y {static_cast<uint128_t>(y)};
-        const auto big_res {big_x + big_y};
+    // Detect overflow BEFORE the addition to avoid signed overflow UB.
+    // When both are non-negative: overflow iff x > max - y (subtraction safe: max - non_negative >= 0)
+    // When both are negative: overflow iff x < min - y (subtraction safe: min - negative > min)
+    // Mixed signs: overflow is impossible.
 
-        return big_res > max_value ? (std::numeric_limits<int128_t>::max)() : static_cast<int128_t>(big_res);
-    }
-    else if ((x < 0 && y > 0) || (x > 0 && y < 0))
+    if (x.high >= 0 && y.high >= 0)
     {
-        return x + y;
+        if (x > (std::numeric_limits<int128_t>::max)() - y)
+        {
+            return (std::numeric_limits<int128_t>::max)();
+        }
     }
-    else
+    else if (x.high < 0 && y.high < 0)
     {
-        // x < 0 and y < 0
-        // Nearly the same technique as the positive values case
-        constexpr auto max_value {-static_cast<uint128_t>((std::numeric_limits<int128_t>::min)())};
-        const auto big_x {static_cast<uint128_t>(abs(x))};
-        const auto big_y {static_cast<uint128_t>(abs(y))};
-        const auto big_res {big_x + big_y};
-
-        return big_res > max_value ? (std::numeric_limits<int128_t>::min)() : -static_cast<int128_t>(big_res);
+        if (x < (std::numeric_limits<int128_t>::min)() - y)
+        {
+            return (std::numeric_limits<int128_t>::min)();
+        }
     }
+
+    return x + y;
 }
 
-constexpr int128_t sub_sat(const int128_t x, const int128_t y) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(const int128_t x, const int128_t y) noexcept
 {
-    if (x <= 0 && y >= 0)
-    {
-        // Underflow case
-        const auto res {x - y};
-        return res > x ? (std::numeric_limits<int128_t>::min)() : res;
-    }
-    else if (x > 0 && y < 0)
-    {
-        // Overflow Case
-        constexpr auto max_val {static_cast<uint128_t>((std::numeric_limits<int128_t>::max)())};
-        const auto big_x {static_cast<uint128_t>(x)};
-        const auto big_y {-static_cast<uint128_t>(y)};
-        const auto res {big_x + big_y};
+    // Detect overflow BEFORE the subtraction to avoid signed overflow UB.
+    // Positive overflow: x >= 0 and y < 0 and x > max + y (safe: max + negative < max)
+    // Negative overflow: x < 0 and y >= 0 and x < min + y (safe: min + non_negative > min)
+    // Same signs: overflow is impossible.
 
-        return (res > max_val || res < big_x) ? (std::numeric_limits<int128_t>::max)() : static_cast<int128_t>(res);
+    if (x.high >= 0 && y.high < 0)
+    {
+        if (x > (std::numeric_limits<int128_t>::max)() + y)
+        {
+            return (std::numeric_limits<int128_t>::max)();
+        }
     }
-    else
+    else if (x.high < 0 && y.high >= 0)
     {
-        return x - y;
+        if (x < (std::numeric_limits<int128_t>::min)() + y)
+        {
+            return (std::numeric_limits<int128_t>::min)();
+        }
     }
+
+    return x - y;
 }
 
 #ifdef _MSC_VER
 #  pragma warning(pop)
 #endif
 
-BOOST_INT128_EXPORT constexpr uint128_t mul_sat(const uint128_t x, const uint128_t y) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t mul_sat(const uint128_t x, const uint128_t y) noexcept
 {
     const auto x_bits {bit_width(x)};
     const auto y_bits {bit_width(y)};
@@ -154,7 +149,7 @@ BOOST_INT128_EXPORT constexpr uint128_t mul_sat(const uint128_t x, const uint128
     return x * y;
 }
 
-BOOST_INT128_EXPORT constexpr int128_t mul_sat(const int128_t& x, const int128_t& y) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t mul_sat(const int128_t& x, const int128_t& y) noexcept
 {
     const auto x_bits {bit_width(static_cast<uint128_t>(abs(x)))};
     const auto y_bits {bit_width(static_cast<uint128_t>(abs(y)))};
@@ -175,12 +170,12 @@ BOOST_INT128_EXPORT constexpr int128_t mul_sat(const int128_t& x, const int128_t
     return res;
 }
 
-BOOST_INT128_EXPORT constexpr uint128_t div_sat(const uint128_t x, const uint128_t y) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t div_sat(const uint128_t x, const uint128_t y) noexcept
 {
     return x / y;
 }
 
-BOOST_INT128_EXPORT constexpr int128_t div_sat(const int128_t x, const int128_t y) noexcept
+BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t div_sat(const int128_t x, const int128_t y) noexcept
 {
     if (BOOST_INT128_UNLIKELY(x == (std::numeric_limits<int128_t>::min)() && y == -1))
     {
@@ -197,7 +192,7 @@ BOOST_INT128_EXPORT constexpr int128_t div_sat(const int128_t x, const int128_t
 #endif
 
 BOOST_INT128_EXPORT template <typename TargetType, std::enable_if_t<detail::is_reduced_integer_v<TargetType>, bool> = true>
-constexpr TargetType saturate_cast(const uint128_t value) noexcept
+BOOST_INT128_HOST_DEVICE constexpr TargetType saturate_cast(const uint128_t value) noexcept
 {
     BOOST_INT128_IF_CONSTEXPR (std::is_same<uint128_t, TargetType>::value)
     {
@@ -219,7 +214,7 @@ constexpr TargetType saturate_cast(const uint128_t value) noexcept
 #endif
 
 BOOST_INT128_EXPORT template <typename TargetType, std::enable_if_t<detail::is_reduced_integer_v<TargetType>, bool> = true>
-constexpr TargetType saturate_cast(const int128_t value) noexcept
+BOOST_INT128_HOST_DEVICE constexpr TargetType saturate_cast(const int128_t value) noexcept
 {
     BOOST_INT128_IF_CONSTEXPR (std::is_same<int128_t, TargetType>::value)
     {
@@ -251,7 +246,7 @@ constexpr TargetType saturate_cast(const int128_t value) noexcept
 
 namespace detail {
 
-constexpr std::uint64_t gcd64(std::uint64_t x, std::uint64_t y) noexcept
+BOOST_INT128_HOST_DEVICE constexpr std::uint64_t gcd64(std::uint64_t x, std::uint64_t y) noexcept
 {
     if (x == 0)
     {
@@ -283,7 +278,7 @@ constexpr std::uint64_t gcd64(std::uint64_t x, std::uint64_t y) noexcept
 
 } // namespace detail
 
-constexpr uint128_t gcd(uint128_t a, uint128_t b) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t gcd(uint128_t a, uint128_t b) noexcept
 {
     // Base case
     if (a == 0U)
@@ -320,7 +315,7 @@ constexpr uint128_t gcd(uint128_t a, uint128_t b) noexcept
     return uint128_t{0, g} << shift;
 }
 
-constexpr int128_t gcd(const int128_t a, const int128_t b) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t gcd(const int128_t a, const int128_t b) noexcept
 {
     return static_cast<int128_t>(gcd(static_cast<uint128_t>(abs(a)), static_cast<uint128_t>(abs(b))));
 }
@@ -330,7 +325,7 @@ constexpr int128_t gcd(const int128_t a, const int128_t b) noexcept
 // but very slow impl that we know works.
 #if !(defined(_M_IX86) && !defined(_NDEBUG))
 
-constexpr uint128_t lcm(const uint128_t a, const uint128_t b) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t lcm(const uint128_t a, const uint128_t b) noexcept
 {
     if (a == 0U || b == 0U)
     {
@@ -346,11 +341,11 @@ constexpr uint128_t lcm(const uint128_t a, const uint128_t b) noexcept
 
 #else
 
-constexpr uint128_t lcm(uint128_t a, uint128_t b) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t lcm(uint128_t a, uint128_t b) noexcept
 {
     if (a == 0U || b == 0U)
     {
-        return 0;
+        return uint128_t{0};
     }
 
 
@@ -380,12 +375,12 @@ constexpr uint128_t lcm(uint128_t a, uint128_t b) noexcept
 
 #endif
 
-constexpr int128_t lcm(const int128_t a, const int128_t b) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t lcm(const int128_t a, const int128_t b) noexcept
 {
     return static_cast<int128_t>(lcm(static_cast<uint128_t>(abs(a)), static_cast<uint128_t>(abs(b))));
 }
 
-constexpr uint128_t midpoint(const uint128_t a, const uint128_t b) noexcept
+BOOST_INT128_HOST_DEVICE constexpr uint128_t midpoint(const uint128_t a, const uint128_t b) noexcept
 {
     // Bit manipulation formula works for unsigned integers
     auto mid {(a & b) + ((a ^ b) >> 1)};
@@ -399,16 +394,22 @@ constexpr uint128_t midpoint(const uint128_t a, const uint128_t b) noexcept
     return mid;
 }
 
-constexpr int128_t midpoint(const int128_t a, const int128_t b) noexcept
+BOOST_INT128_HOST_DEVICE constexpr int128_t midpoint(const int128_t a, const int128_t b) noexcept
 {
     // For signed integers, we use a + (b - a) / 2 or a - (a - b) / 2
     // The subtraction is done in unsigned arithmetic to handle overflow correctly
     // Integer division automatically rounds toward the first argument
+    //
+    // Use direct field access for both the uint128 construction and the
+    // comparison to avoid NVCC host compiler issues with operator<= and
+    // static_cast on int128_t for large-magnitude values
+
+    const uint128_t ua {static_cast<std::uint64_t>(a.high), a.low};
+    const uint128_t ub {static_cast<std::uint64_t>(b.high), b.low};
 
-    const auto ua {static_cast<uint128_t>(a)};
-    const auto ub {static_cast<uint128_t>(b)};
+    const bool a_le_b {a.high == b.high ? a.low <= b.low : a.high < b.high};
 
-    if (a <= b)
+    if (a_le_b)
     {
         // diff = b - a (computed in unsigned, handles wrap-around correctly)
         const auto diff {ub - ua};
diff --git a/include/boost/int128/random.hpp b/include/boost/int128/random.hpp
new file mode 100644
index 00000000..82e2975b
--- /dev/null
+++ b/include/boost/int128/random.hpp
@@ -0,0 +1,96 @@
+// Copyright 2026 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+#ifndef BOOST_INT128_RANDOM_HPP
+#define BOOST_INT128_RANDOM_HPP
+
+#include <boost/int128/int128.hpp>
+
+namespace boost {
+namespace random {
+namespace traits {
+
+template <class T, bool intrinsic>
+struct make_unsigned_imp;
+
+template <>
+struct make_unsigned_imp<int128::uint128_t, false>
+{
+    using type = int128::uint128_t;
+};
+
+template <>
+struct make_unsigned_imp<int128::int128_t, false>
+{
+    using type = int128::uint128_t;
+};
+
+template <class T>
+struct make_unsigned;
+
+template <>
+struct make_unsigned<int128::uint128_t>
+{
+    using type = int128::uint128_t;
+};
+
+template <>
+struct make_unsigned<int128::int128_t>
+{
+    using type = int128::uint128_t; // unsigned counterpart, matching make_unsigned_imp<int128_t, false>
+};
+
+template <class T, bool intrinsic>
+struct make_unsigned_or_unbounded_imp;
+
+template <>
+struct make_unsigned_or_unbounded_imp<int128::uint128_t, false>
+{
+    using type = int128::uint128_t;
+};
+
+template <>
+struct make_unsigned_or_unbounded_imp<int128::int128_t, false>
+{
+    using type = int128::uint128_t;
+};
+
+template <class T>
+struct make_unsigned_or_unbounded;
+
+template <>
+struct make_unsigned_or_unbounded<int128::uint128_t>
+{
+    using type = int128::uint128_t;
+};
+
+template <>
+struct make_unsigned_or_unbounded<int128::int128_t>
+{
+    using type = int128::uint128_t;
+};
+
+template <class T>
+struct is_integral;
+
+template <>
+struct is_integral<int128::uint128_t> : std::true_type {};
+
+template <>
+struct is_integral<int128::int128_t> : std::true_type {};
+
+template <class T>
+struct is_signed;
+
+template <>
+struct is_signed<int128::uint128_t> : std::false_type {};
+
+template <>
+struct is_signed<int128::int128_t> : std::true_type {};
+
+} // namespace traits
+} // namespace random
+} // namespace boost
+
+#endif // BOOST_INT128_RANDOM_HPP
diff --git a/module/Jamfile b/module/Jamfile
index 68463c18..0cdbc520 100644
--- a/module/Jamfile
+++ b/module/Jamfile
@@ -27,3 +27,12 @@ project
 obj int128 : int128.cxx : <toolset>msvc:<cxxflags>-interface ;
 
 run quick_test.cpp int128 : : : <dependency>int128 ;
+run ../test/test_bit.cpp int128 : : : <dependency>int128 ;
+run ../test/test_div.cpp int128 : : : <dependency>int128 ;
+run ../test/test_gcd_lcm.cpp int128 : : : <dependency>int128 ;
+run ../test/test_i128.cpp int128 : : : <dependency>int128 ;
+run ../test/test_i128_no_sign_conv.cpp int128 : : : <dependency>int128 ;
+run ../test/test_limits_i128.cpp int128 : : : <dependency>int128 ;
+run ../test/test_limits_u128.cpp int128 : : : <dependency>int128 ;
+run ../test/test_literals.cpp int128 : : : <dependency>int128 ;
+run ../test/test_midpoint.cpp int128 : : : <dependency>int128 ;
diff --git a/ports/int128/portfile.cmake b/ports/int128/portfile.cmake
index ca3cd128..5c61b3b7 100644
--- a/ports/int128/portfile.cmake
+++ b/ports/int128/portfile.cmake
@@ -7,8 +7,8 @@
 vcpkg_from_github(
         OUT_SOURCE_PATH SOURCE_PATH
         REPO cppalliance/int128
-        REF v1.6.0
-        SHA512 cc5a5e065643ee5b790f35441b34cea291bd299f5460278bb3402005c9cdd5a9a552b28e64816cc0ed480662c53959eb7cd5d1774d7e2480fffc65532353dc5c
+        REF v1.6.1
+        SHA512 84978c581edd5d4b2b9ecaf9229b552dae4c686387d7e3149a2fd7ff4736a045ba6712611f6560ed58084f0f0cd2a5fc4d32759bb0d4166529f50bc066dc59e0
         HEAD_REF master
 )
 
diff --git a/ports/int128/vcpkg.json b/ports/int128/vcpkg.json
index 5863488b..6548a469 100644
--- a/ports/int128/vcpkg.json
+++ b/ports/int128/vcpkg.json
@@ -1,6 +1,6 @@
 {
   "name": "int128",
-  "version": "1.6.0",
+  "version": "1.6.1",
   "description": "Portable and performant 128-bit integers",
   "homepage": "https://github.com/cppalliance/int128",
   "license": "BSL-1.0",
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 637982d4..e601babb 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -7,6 +7,24 @@ include(BoostTestJamfile OPTIONAL RESULT_VARIABLE HAVE_BOOST_TEST)
 
 if(HAVE_BOOST_TEST)
 
-    boost_test_jamfile(FILE Jamfile LINK_LIBRARIES Boost::int128 Boost::core Boost::random Boost::multiprecision Boost::mp11 Boost::charconv)
+    enable_testing()
+
+    if(BOOST_INT128_ENABLE_CUDA)
+
+        message(STATUS "Building Boost.int128 with CUDA")
+
+        enable_language(CUDA)
+        find_package(CUDAToolkit REQUIRED)
+        set(CMAKE_CUDA_EXTENSIONS OFF)
+
+        enable_testing()
+
+        boost_test_jamfile(FILE cuda_jamfile LINK_LIBRARIES Boost::int128 Boost::random Boost::charconv CUDA::cudart COMPILE_DEFINITIONS BOOST_INT128_ENABLE_CUDA=1  )
+        
+    else()
+
+        boost_test_jamfile(FILE Jamfile LINK_LIBRARIES Boost::int128 Boost::core Boost::random Boost::multiprecision Boost::mp11 Boost::charconv)
+
+    endif()
 
 endif()
diff --git a/test/Jamfile b/test/Jamfile
index 3f7de536..4c18efb3 100644
--- a/test/Jamfile
+++ b/test/Jamfile
@@ -69,6 +69,8 @@ run test_stream.cpp ;
 
 compile-fail test_mixed_type_ops.cpp ;
 compile-fail test_mixed_arithmetic.cpp ;
+run test_mixed_type_sign_compare.cpp ;
+run test_mixed_type_sign_conversion.cpp ;
 
 run test_consteval_funcs.cpp ;
 run test_sign_compare.cpp ;
@@ -110,6 +112,7 @@ run github_issue_207.cpp ;
 run github_issue_210.cpp ;
 run github_issue_221.cpp ;
 run github_issue_272.cpp ;
+run github_issue_377.cpp ;
 
 # Compilation of individual headers
 compile compile_tests/int128_master_header_compile.cpp ;
@@ -125,3 +128,4 @@ compile compile_tests/limits_compile.cpp ;
 compile compile_tests/literals_compile.cpp ;
 compile compile_tests/numeric_compile.cpp ;
 compile compile_tests/string_compile.cpp ;
+compile compile_tests/random_compile.cpp ;
diff --git a/test/compile_tests/random_compile.cpp b/test/compile_tests/random_compile.cpp
new file mode 100644
index 00000000..42d23f37
--- /dev/null
+++ b/test/compile_tests/random_compile.cpp
@@ -0,0 +1,10 @@
+// Copyright 2025 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+#include <boost/int128/random.hpp>
+
+int main()
+{
+    return 0;
+}
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
new file mode 100644
index 00000000..e7279d01
--- /dev/null
+++ b/test/cuda_jamfile
@@ -0,0 +1,96 @@
+# Copyright 2024 Matt Borland
+# Distributed under the Boost Software License, Version 1.0.
+# https://www.boost.org/LICENSE_1_0.txt
+
+import testing ;
+import ../../config/checks/config : requires ;
+
+project : requirements
+    [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
+    ;
+
+run test_unsigned_add.cu ;
+run test_signed_add.cu ;
+run test_unsigned_sub.cu ;
+run test_signed_sub.cu ;
+run test_unsigned_mul.cu ;
+run test_signed_mul.cu ;
+run test_unsigned_div.cu ;
+run test_signed_div.cu ;
+run test_unsigned_mod.cu ;
+run test_signed_mod.cu ;
+
+run test_unsigned_or.cu ;
+run test_signed_or.cu ;
+run test_unsigned_and.cu ;
+run test_signed_and.cu ;
+run test_unsigned_xor.cu ;
+run test_signed_xor.cu ;
+run test_unsigned_not.cu ;
+run test_signed_not.cu ;
+run test_unsigned_left_shift.cu ;
+run test_signed_left_shift.cu ;
+run test_unsigned_right_shift.cu ;
+run test_signed_right_shift.cu ;
+
+run test_has_single_bit.cu ;
+run test_countl_zero.cu ;
+run test_countl_one.cu ;
+run test_bit_width.cu ;
+run test_bit_ceil.cu ;
+run test_bit_floor.cu ;
+run test_countr_zero.cu ;
+run test_countr_one.cu ;
+run test_rotl.cu ;
+run test_rotr.cu ;
+run test_popcount.cu ;
+run test_byteswap.cu ;
+
+run test_unsigned_eq.cu ;
+run test_signed_eq.cu ;
+run test_unsigned_ne.cu ;
+run test_signed_ne.cu ;
+run test_unsigned_lt.cu ;
+run test_signed_lt.cu ;
+run test_unsigned_le.cu ;
+run test_signed_le.cu ;
+run test_unsigned_gt.cu ;
+run test_signed_gt.cu ;
+run test_unsigned_ge.cu ;
+run test_signed_ge.cu ;
+
+run test_unsigned_cstdlib_div.cu ;
+run test_signed_cstdlib_div.cu ;
+
+run test_signed_to_unsigned_conversion.cu ;
+run test_unsigned_to_signed_conversion.cu ;
+
+run test_unsigned_add_sat.cu ;
+run test_signed_add_sat.cu ;
+run test_unsigned_sub_sat.cu ;
+run test_signed_sub_sat.cu ;
+run test_unsigned_mul_sat.cu ;
+run test_signed_mul_sat.cu ;
+run test_unsigned_div_sat.cu ;
+run test_signed_div_sat.cu ;
+run test_unsigned_gcd.cu ;
+run test_signed_gcd.cu ;
+run test_unsigned_lcm.cu ;
+run test_signed_lcm.cu ;
+run test_unsigned_midpoint.cu ;
+run test_signed_midpoint.cu ;
+
+run test_unsigned_to_chars.cu ;
+run test_signed_to_chars.cu ;
+run test_unsigned_from_chars.cu ;
+run test_signed_from_chars.cu ;
+
+run test_unsigned_to_chars_bases.cu ;
+run test_signed_to_chars_bases.cu ;
+run test_unsigned_from_chars_bases.cu ;
+run test_signed_from_chars_bases.cu ;
+
+run test_unsigned_literals.cu ;
+run test_signed_literals.cu ;
+
+run ../examples/cuda.cu ;
diff --git a/test/cuda_managed_ptr.hpp b/test/cuda_managed_ptr.hpp
new file mode 100644
index 00000000..c9b1ab60
--- /dev/null
+++ b/test/cuda_managed_ptr.hpp
@@ -0,0 +1,142 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef BOOST_INT128_CUDA_MANAGED_PTR_HPP
+#define BOOST_INT128_CUDA_MANAGED_PTR_HPP
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+#include <cuda_runtime.h>
+#include <cstddef>
+#include <iostream>
+#include <stdexcept>
+
+class managed_holder_base
+{
+protected:
+   static int count;
+   managed_holder_base() { ++count; }
+   ~managed_holder_base()
+   {
+      if(0 == --count)
+         cudaDeviceSynchronize();
+   }
+};
+
+int managed_holder_base::count = 0;
+
+//
+// Reset the device and exit:
+// cudaDeviceReset causes the driver to clean up all state. While
+// not mandatory in normal operation, it is good practice.  It is also
+// needed to ensure correct operation when the application is being
+// profiled. Calling cudaDeviceReset causes all profile data to be
+// flushed before the application exits.
+//
+// We have a global instance of this class, plus instances for each
+// managed pointer.  Last one out the door switches the lights off.
+//
+class cudaResetter
+{
+   static int count;
+public:
+   cudaResetter() { ++count;  }
+   ~cudaResetter()
+   {
+      if(--count == 0)
+      {
+         cudaError_t err = cudaDeviceReset();
+         if(err != cudaSuccess)
+         {
+            std::cerr << "Failed to deinitialize the device! error=" << cudaGetErrorString(err) << std::endl;
+         }
+      }
+   }
+};
+
+int cudaResetter::count = 0;
+
+cudaResetter global_resetter;
+
+// Owning, move-only handle for storage sized for n objects of type T, allocated
+// with cudaMallocManaged and released with cudaFree. get() returns a holder that
+// converts to a raw pointer; destroying the last live holder triggers
+// cudaDeviceSynchronize() (see managed_holder_base).
+template <class T>
+class cuda_managed_ptr
+{
+   T* data;
+   static const cudaResetter resetter;
+   cuda_managed_ptr(const cuda_managed_ptr&) = delete;
+   cuda_managed_ptr& operator=(cuda_managed_ptr const&) = delete;
+   void free()
+   {
+      if(data)
+      {
+         cudaDeviceSynchronize();
+         cudaError_t err = cudaFree(data);
+         if(err != cudaSuccess)
+         {
+            std::cerr << "Failed to free managed memory! error=" << cudaGetErrorString(err) << std::endl;
+         }
+      }
+   }
+public:
+   cuda_managed_ptr() : data(0) {}
+   cuda_managed_ptr(std::size_t n)
+   {
+      cudaError_t err = cudaSuccess;
+      void *ptr;
+      err = cudaMallocManaged(&ptr, n * sizeof(T));
+      if(err != cudaSuccess)
+         throw std::runtime_error(cudaGetErrorString(err));
+      cudaDeviceSynchronize();
+      data = static_cast<T*>(ptr);
+   }
+   cuda_managed_ptr(cuda_managed_ptr&& o) : data(o.data) { o.data = 0; }
+   cuda_managed_ptr& operator=(cuda_managed_ptr&& o)
+   {
+      free();
+      data = o.data;
+      o.data = 0;
+      return *this;
+   }
+   ~cuda_managed_ptr()
+   {
+      free();
+   }
+
+   class managed_holder : managed_holder_base
+   {
+      T* pdata;
+   public:
+      managed_holder(T* p) : managed_holder_base(), pdata(p) {}
+      managed_holder(const managed_holder& o) : managed_holder_base(), pdata(o.pdata) {}
+      operator T* () { return pdata; }
+      T& operator[] (std::size_t n) { return pdata[n]; }
+   };
+   class const_managed_holder : managed_holder_base
+   {
+      const T* pdata;
+   public:
+      const_managed_holder(T* p) : managed_holder_base(), pdata(p) {}
+      // Real copy constructor: keeps the managed_holder_base live-holder count balanced on copies.
+      const_managed_holder(const const_managed_holder& o) : managed_holder_base(), pdata(o.pdata) {}
+      operator const T* () { return pdata; }
+      const T& operator[] (std::size_t n) { return pdata[n]; }
+   };
+
+   managed_holder get() { return managed_holder(data); }
+   const_managed_holder get()const { return data; }
+   T& operator[](std::size_t n) { return data[n]; }
+   const T& operator[](std::size_t n)const { return data[n]; }
+};
+
+template <class T>
+cudaResetter const cuda_managed_ptr<T>::resetter;
+
+#endif // BOOST_INT128_CUDA_MANAGED_PTR_HPP
diff --git a/test/fuzzing/test_fuzzing_add_versus_wide_int.cpp b/test/fuzzing/test_fuzzing_add_versus_wide_int.cpp
new file mode 100644
index 00000000..216ec8e6
--- /dev/null
+++ b/test/fuzzing/test_fuzzing_add_versus_wide_int.cpp
@@ -0,0 +1,141 @@
+///////////////////////////////////////////////////////////////////////////////
+//  Copyright Matt Borland 2025.
+//  Copyright Christopher Kormanyos 2024 - 2025.
+//  Distributed under the Boost Software License,
+//  Version 1.0. (See accompanying file LICENSE_1_0.txt
+//  or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+
+// cd /mnt/c/Users/ckorm/Documents/Ks/PC_Software/NumericalPrograms/ExtendedNumberTypes/wide_integer
+// clang++ -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I. -I/mnt/c/ChrisGitRepos/cppalliance/int128/include -I../NumericalPrograms/ExtendedNumberTypes/wide_integer /mnt/c/ChrisGitRepos/cppalliance/int128/test/fuzzing/test_fuzzing_add_versus_wide_int.cpp -o test_fuzzing_add_versus_wide_int
+// ./test_fuzzing_add_versus_wide_int -max_total_time=1200 -max_len=32
+
+#include <math/wide_integer/uintwide_t.h>
+
+#include <boost/int128.hpp>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <utility>
+
+extern "C"
+int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
+
+namespace fuzzing
+{
+  template <typename CntrlUintType,
+            typename LocalUintType>
+  auto eval_op(const CntrlUintType& a_cntrl,
+               const CntrlUintType& b_cntrl,
+               const LocalUintType& a_local,
+               const LocalUintType& b_local) -> bool;
+}
+
+template <typename CntrlUintType,
+          typename LocalUintType>
+auto fuzzing::eval_op(const CntrlUintType& a_cntrl,
+                      const CntrlUintType& b_cntrl,
+                      const LocalUintType& a_local,
+                      const LocalUintType& b_local) -> bool
+{
+  using cntrl_uint_type = CntrlUintType;
+  using local_uint_type = LocalUintType;
+
+  static_assert
+  (
+       (std::numeric_limits<cntrl_uint_type>::digits == std::numeric_limits<local_uint_type>::digits)
+    && (std::numeric_limits<cntrl_uint_type>::digits == int { INT32_C(128) }),
+    "Error: the control and local types must both have 128 binary digits"
+  );
+
+  const local_uint_type result_local { local_uint_type(a_local) += b_local };
+  const cntrl_uint_type result_cntrl { cntrl_uint_type(a_cntrl) += b_cntrl };
+
+  const std::uint64_t result_local_lo = static_cast<std::uint64_t>(result_local);
+  const std::uint64_t result_local_hi = static_cast<std::uint64_t>(result_local >> unsigned { UINT8_C(64) });
+
+  const std::uint64_t result_cntrl_lo = static_cast<std::uint64_t>(result_cntrl);
+  const std::uint64_t result_cntrl_hi = static_cast<std::uint64_t>(result_cntrl >> unsigned { UINT8_C(64) });
+
+  // Verify that both the local (test) type as well as control type
+  // obtain the same numerical result.
+
+  const bool
+    result_is_ok
+    {
+         (result_local_lo == result_cntrl_lo)
+      && (result_local_hi == result_cntrl_hi)
+    };
+
+  if(!result_is_ok)
+  {
+    std::cout << "Error: lhs: " << a_local << ", rhs: " << b_local << ", result obtained: " << result_local << std::endl;
+  }
+
+  return result_is_ok;
+}
+
+// The fuzzing entry point.
+extern "C"
+int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
+{
+  constexpr std::size_t max_size { UINT8_C(32) };
+  constexpr std::size_t min_size { UINT8_C(17) };
+
+  bool result_is_ok { true };
+
+  if(((size >= min_size) && (size <= max_size)) && (data != nullptr))
+  {
+    using local_data_array_type = std::array<std::uint8_t, max_size>;
+
+    local_data_array_type tmp_data { };
+
+    tmp_data.fill(UINT8_C(0));
+
+    static_cast<void>(std::copy(data, data + size, tmp_data.begin()));
+
+    const std::uint64_t a_lo64 { *reinterpret_cast<const std::uint64_t*>(tmp_data.data() + std::size_t { UINT8_C(0) }) };
+    const std::uint64_t a_hi64 { *reinterpret_cast<const std::uint64_t*>(tmp_data.data() + std::size_t { UINT8_C(8) }) };
+    const std::uint64_t b_lo64 { *reinterpret_cast<const std::uint64_t*>(tmp_data.data() + std::size_t { UINT8_C(16) }) };
+    const std::uint64_t b_hi64 { *reinterpret_cast<const std::uint64_t*>(tmp_data.data() + std::size_t { UINT8_C(24) }) };
+
+    // Import data into the uint values.
+    using local_uint_type = ::boost::int128::uint128_t;
+
+    #if defined(WIDE_INTEGER_NAMESPACE)
+    using cntrl_uint_type = ::WIDE_INTEGER_NAMESPACE::math::wide_integer::uint128_t;
+    #else
+    using cntrl_uint_type = ::math::wide_integer::uint128_t;
+    #endif
+
+    cntrl_uint_type a_cntrl { a_hi64 }; a_cntrl <<= unsigned { UINT8_C(64) }; a_cntrl |= a_lo64;
+    cntrl_uint_type b_cntrl { b_hi64 }; b_cntrl <<= unsigned { UINT8_C(64) }; b_cntrl |= b_lo64;
+
+    local_uint_type a_local { a_hi64 }; a_local <<= unsigned { UINT8_C(64) }; a_local |= a_lo64;
+    local_uint_type b_local { b_hi64 }; b_local <<= unsigned { UINT8_C(64) }; b_local |= b_lo64;
+
+    if(a_local < b_local)
+    {
+      std::swap(a_local, b_local);
+      std::swap(a_cntrl, b_cntrl);
+    }
+
+    if(b_local != 0U)
+    {
+      const bool result_op_is_ok { fuzzing::eval_op(a_cntrl, b_cntrl, a_local, b_local) };
+
+      if(!result_op_is_ok)
+      {
+        assert(result_op_is_ok);
+      }
+
+      result_is_ok = (result_op_is_ok && result_is_ok);
+    }
+  }
+
+  return (result_is_ok ? 0 : -1);
+}
diff --git a/test/fuzzing/test_fuzzing_mul_versus_wide_int.cpp b/test/fuzzing/test_fuzzing_mul_versus_wide_int.cpp
new file mode 100644
index 00000000..2d7da2dc
--- /dev/null
+++ b/test/fuzzing/test_fuzzing_mul_versus_wide_int.cpp
@@ -0,0 +1,141 @@
+///////////////////////////////////////////////////////////////////////////////
+//  Copyright Matt Borland 2025.
+//  Copyright Christopher Kormanyos 2024 - 2025.
+//  Distributed under the Boost Software License,
+//  Version 1.0. (See accompanying file LICENSE_1_0.txt
+//  or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+
+// cd /mnt/c/Users/ckorm/Documents/Ks/PC_Software/NumericalPrograms/ExtendedNumberTypes/wide_integer
+// clang++ -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I. -I/mnt/c/ChrisGitRepos/cppalliance/int128/include -I../NumericalPrograms/ExtendedNumberTypes/wide_integer /mnt/c/ChrisGitRepos/cppalliance/int128/test/fuzzing/test_fuzzing_mul_versus_wide_int.cpp -o test_fuzzing_mul_versus_wide_int
+// ./test_fuzzing_mul_versus_wide_int -max_total_time=1200 -max_len=32
+
+#include <math/wide_integer/uintwide_t.h>
+
+#include <boost/int128.hpp>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <utility>
+
+extern "C"
+int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
+
+namespace fuzzing
+{
+  template <typename CntrlUintType,
+            typename LocalUintType>
+  auto eval_op(const CntrlUintType& a_cntrl,
+               const CntrlUintType& b_cntrl,
+               const LocalUintType& a_local,
+               const LocalUintType& b_local) -> bool;
+}
+
+template <typename CntrlUintType,
+          typename LocalUintType>
+auto fuzzing::eval_op(const CntrlUintType& a_cntrl,
+                      const CntrlUintType& b_cntrl,
+                      const LocalUintType& a_local,
+                      const LocalUintType& b_local) -> bool
+{
+  using cntrl_uint_type = CntrlUintType;
+  using local_uint_type = LocalUintType;
+
+  static_assert
+  (
+       (std::numeric_limits<cntrl_uint_type>::digits == std::numeric_limits<local_uint_type>::digits)
+    && (std::numeric_limits<cntrl_uint_type>::digits == int { INT32_C(128) }),
+    "Error: the control and local types must both have 128 binary digits"
+  );
+
+  const local_uint_type result_local { local_uint_type(a_local) *= b_local };
+  const cntrl_uint_type result_cntrl { cntrl_uint_type(a_cntrl) *= b_cntrl };
+
+  const std::uint64_t result_local_lo = static_cast<std::uint64_t>(result_local);
+  const std::uint64_t result_local_hi = static_cast<std::uint64_t>(result_local >> unsigned { UINT8_C(64) });
+
+  const std::uint64_t result_cntrl_lo = static_cast<std::uint64_t>(result_cntrl);
+  const std::uint64_t result_cntrl_hi = static_cast<std::uint64_t>(result_cntrl >> unsigned { UINT8_C(64) });
+
+  // Verify that both the local (test) type as well as control type
+  // obtain the same numerical result.
+
+  const bool
+    result_is_ok
+    {
+         (result_local_lo == result_cntrl_lo)
+      && (result_local_hi == result_cntrl_hi)
+    };
+
+  if(!result_is_ok)
+  {
+    std::cout << "Error: lhs: " << a_local << ", rhs: " << b_local << ", result obtained: " << result_local << std::endl;
+  }
+
+  return result_is_ok;
+}
+
+// The fuzzing entry point.
+extern "C"
+int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
+{
+  constexpr std::size_t max_size { UINT8_C(32) };
+  constexpr std::size_t min_size { UINT8_C(17) };
+
+  bool result_is_ok { true };
+
+  if(((size >= min_size) && (size <= max_size)) && (data != nullptr))
+  {
+    using local_data_array_type = std::array<std::uint8_t, max_size>;
+
+    local_data_array_type tmp_data { };
+
+    tmp_data.fill(UINT8_C(0));
+
+    static_cast<void>(std::copy(data, data + size, tmp_data.begin()));
+
+    const std::uint64_t a_lo64 { *reinterpret_cast<const std::uint64_t*>(tmp_data.data() + std::size_t { UINT8_C(0) }) };
+    const std::uint64_t a_hi64 { *reinterpret_cast<const std::uint64_t*>(tmp_data.data() + std::size_t { UINT8_C(8) }) };
+    const std::uint64_t b_lo64 { *reinterpret_cast<const std::uint64_t*>(tmp_data.data() + std::size_t { UINT8_C(16) }) };
+    const std::uint64_t b_hi64 { *reinterpret_cast<const std::uint64_t*>(tmp_data.data() + std::size_t { UINT8_C(24) }) };
+
+    // Import data into the uint values.
+    using local_uint_type = ::boost::int128::uint128_t;
+
+    #if defined(WIDE_INTEGER_NAMESPACE)
+    using cntrl_uint_type = ::WIDE_INTEGER_NAMESPACE::math::wide_integer::uint128_t;
+    #else
+    using cntrl_uint_type = ::math::wide_integer::uint128_t;
+    #endif
+
+    cntrl_uint_type a_cntrl { a_hi64 }; a_cntrl <<= unsigned { UINT8_C(64) }; a_cntrl |= a_lo64;
+    cntrl_uint_type b_cntrl { b_hi64 }; b_cntrl <<= unsigned { UINT8_C(64) }; b_cntrl |= b_lo64;
+
+    local_uint_type a_local { a_hi64 }; a_local <<= unsigned { UINT8_C(64) }; a_local |= a_lo64;
+    local_uint_type b_local { b_hi64 }; b_local <<= unsigned { UINT8_C(64) }; b_local |= b_lo64;
+
+    if(a_local < b_local)
+    {
+      std::swap(a_local, b_local);
+      std::swap(a_cntrl, b_cntrl);
+    }
+
+    if(b_local != 0U)
+    {
+      const bool result_op_is_ok { fuzzing::eval_op(a_cntrl, b_cntrl, a_local, b_local) };
+
+      if(!result_op_is_ok)
+      {
+        assert(result_op_is_ok);
+      }
+
+      result_is_ok = (result_op_is_ok && result_is_ok);
+    }
+  }
+
+  return (result_is_ok ? 0 : -1);
+}
diff --git a/test/fuzzing/test_fuzzing_sub_versus_wide_int.cpp b/test/fuzzing/test_fuzzing_sub_versus_wide_int.cpp
new file mode 100644
index 00000000..e5bff887
--- /dev/null
+++ b/test/fuzzing/test_fuzzing_sub_versus_wide_int.cpp
@@ -0,0 +1,141 @@
+///////////////////////////////////////////////////////////////////////////////
+//  Copyright Matt Borland 2025.
+//  Copyright Christopher Kormanyos 2024 - 2025.
+//  Distributed under the Boost Software License,
+//  Version 1.0. (See accompanying file LICENSE_1_0.txt
+//  or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+
+// cd /mnt/c/Users/ckorm/Documents/Ks/PC_Software/NumericalPrograms/ExtendedNumberTypes/wide_integer
+// clang++ -std=c++20 -g -O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -fsanitize=fuzzer -I. -I/mnt/c/ChrisGitRepos/cppalliance/int128/include -I../NumericalPrograms/ExtendedNumberTypes/wide_integer test/fuzzing/test_fuzzing_sub_versus_wide_int.cpp -o test_fuzzing_sub_versus_wide_int
+// ./test_fuzzing_sub_versus_wide_int -max_total_time=1200 -max_len=32
+
+#include <math/wide_integer/uintwide_t.h>
+
+#include <boost/int128.hpp>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <utility>
+
+extern "C"
+int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
+
+namespace fuzzing
+{
+  template <typename CntrlUintType,
+            typename LocalUintType>
+  auto eval_op(const CntrlUintType& a_cntrl,
+               const CntrlUintType& b_cntrl,
+               const LocalUintType& a_local,
+               const LocalUintType& b_local) -> bool;
+}
+
+// Applies operator-= to copies of the operands in both the control type and
+// the type under test, then compares the two results as 64-bit halves.
+// Returns true when both halves agree; otherwise prints the operands and the
+// result of the type under test to std::cout and returns false.
+template <typename CntrlUintType,
+          typename LocalUintType>
+auto fuzzing::eval_op(const CntrlUintType& a_cntrl,
+                      const CntrlUintType& b_cntrl,
+                      const LocalUintType& a_local,
+                      const LocalUintType& b_local) -> bool
+{
+  // Both types must report exactly 128 binary digits.
+  static_assert
+  (
+       (std::numeric_limits<CntrlUintType>::digits == std::numeric_limits<LocalUintType>::digits)
+    && (std::numeric_limits<CntrlUintType>::digits == int { INT32_C(128) }),
+    "Error: the control and local types must both have 128 binary digits"
+  );
+
+  // The subtraction is done in place on temporaries.
+  const LocalUintType diff_local { LocalUintType(a_local) -= b_local };
+  const CntrlUintType diff_cntrl { CntrlUintType(a_cntrl) -= b_cntrl };
+
+  // Low and high 64-bit halves of each difference.
+  const auto lo_local = static_cast<std::uint64_t>(diff_local);
+  const auto hi_local = static_cast<std::uint64_t>(diff_local >> unsigned { UINT8_C(64) });
+
+  const auto lo_cntrl = static_cast<std::uint64_t>(diff_cntrl);
+  const auto hi_cntrl = static_cast<std::uint64_t>(diff_cntrl >> unsigned { UINT8_C(64) });
+
+  // Equal halves mean that both types produced the same 128-bit value.
+  const bool halves_match = ((lo_local == lo_cntrl) && (hi_local == hi_cntrl));
+
+  if(halves_match)
+  {
+    return true;
+  }
+
+  // Mismatch: show the operands and what the type under test computed.
+  std::cout << "Error: lhs: " << a_local << ", rhs: " << b_local << ", result obtained: " << diff_local << std::endl;
+
+  return false;
+}
+
+// The fuzzing entry point. Inputs of 17 to 32 bytes are zero-padded to 32 bytes
+// and read as four 64-bit words (a_lo, a_hi, b_lo, b_hi) in host byte order.
+// Returns 0 if the check passed or was skipped; on a mismatch the assert fires,
+// or -1 is returned when asserts are compiled out.
+extern "C"
+int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
+{
+  constexpr std::size_t max_size { UINT8_C(32) };
+  constexpr std::size_t min_size { UINT8_C(17) };
+  bool result_is_ok { true };
+
+  if(((size >= min_size) && (size <= max_size)) && (data != nullptr))
+  {
+    using local_data_array_type = std::array<std::uint8_t, max_size>;
+    local_data_array_type tmp_data { }; // Value-initialized: all bytes are zero.
+
+    static_cast<void>(std::copy(data, data + size, tmp_data.begin()));
+
+    // Read a 64-bit word by copying its bytes. Dereferencing a uint64_t pointer
+    // into the byte array would break strict aliasing and may be misaligned.
+    const auto load_u64 = [&tmp_data](const std::size_t offset) -> std::uint64_t
+    {
+      std::uint64_t value { };
+      static_cast<void>(std::copy_n(tmp_data.data() + offset, sizeof(value), reinterpret_cast<unsigned char*>(&value)));
+      return value;
+    };
+
+    const std::uint64_t a_lo64 { load_u64(std::size_t { UINT8_C(0) }) };
+    const std::uint64_t a_hi64 { load_u64(std::size_t { UINT8_C(8) }) };
+    const std::uint64_t b_lo64 { load_u64(std::size_t { UINT8_C(16) }) };
+    const std::uint64_t b_hi64 { load_u64(std::size_t { UINT8_C(24) }) };
+
+    // Import data into the uint values.
+    using local_uint_type = ::boost::int128::uint128_t;
+    #if defined(WIDE_INTEGER_NAMESPACE)
+    using cntrl_uint_type = ::WIDE_INTEGER_NAMESPACE::math::wide_integer::uint128_t;
+    #else
+    using cntrl_uint_type = ::math::wide_integer::uint128_t;
+    #endif
+
+    cntrl_uint_type a_cntrl { a_hi64 }; a_cntrl <<= unsigned { UINT8_C(64) }; a_cntrl |= a_lo64;
+    cntrl_uint_type b_cntrl { b_hi64 }; b_cntrl <<= unsigned { UINT8_C(64) }; b_cntrl |= b_lo64;
+    local_uint_type a_local { a_hi64 }; a_local <<= unsigned { UINT8_C(64) }; a_local |= a_lo64;
+    local_uint_type b_local { b_hi64 }; b_local <<= unsigned { UINT8_C(64) }; b_local |= b_lo64;
+
+    // Order the operands so that a >= b, keeping both representations in step.
+    if(a_local < b_local)
+    {
+      std::swap(a_local, b_local);
+      std::swap(a_cntrl, b_cntrl);
+    }
+
+    if(b_local != 0U)
+    {
+      const bool result_op_is_ok { fuzzing::eval_op(a_cntrl, b_cntrl, a_local, b_local) };
+      assert(result_op_is_ok);
+      result_is_ok = (result_op_is_ok && result_is_ok);
+    }
+  }
+
+  return (result_is_ok ? 0 : -1);
+}
diff --git a/test/github_issue_377.cpp b/test/github_issue_377.cpp
new file mode 100644
index 00000000..44f21817
--- /dev/null
+++ b/test/github_issue_377.cpp
@@ -0,0 +1,150 @@
+// Copyright 2026 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+//
+// See: https://github.com/cppalliance/int128/issues/377
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+#include <boost/int128.hpp>
+#include <boost/core/lightweight_test.hpp>
+#include <limits>
+#include <cstdint>
+
+using namespace boost::int128;
+
+template <typename T>
+void test_div_by_one() // T is the type in which the divisor 1 is expressed
+{
+    constexpr auto min_val {std::numeric_limits<int128_t>::min()};
+    BOOST_TEST_EQ(min_val, min_val / T{1}); // dividing the minimum value by one must return it unchanged
+}
+
+template <typename T>
+void test_other_vals() // T is the type in which the divisors 2, 4 and 16 are expressed
+{
+    constexpr auto min_val {std::numeric_limits<int128_t>::min()};
+    const auto min_div_2 {BOOST_INT128_INT128_C(-85070591730234615865843651857942052864)}; // -2^126
+    const auto min_div_4 {BOOST_INT128_INT128_C(-42535295865117307932921825928971026432)}; // -2^125
+    const auto min_div_16 {BOOST_INT128_INT128_C(-10633823966279326983230456482242756608)}; // -2^123
+    // The minimum value is -2^127, so every quotient below is an exact power of two.
+    BOOST_TEST_EQ(min_div_2, min_val / T{2});
+    BOOST_TEST_EQ(min_div_4, min_val / T{4});
+    BOOST_TEST_EQ(min_div_16, min_val / T{16});
+}
+
+// Regression test for bug 1: operator>>(int128_t, int128_t) was calling << instead of >>
+void test_right_shift_int128_amount()
+{
+    const auto val {int128_t(0, 0xFF00)}; // arguments appear to be (high word, low word) -- see the cross-word case
+    const auto shift_4 {int128_t(0, 4)};
+
+    // Right-shift with int128_t shift amount must match integer shift
+    BOOST_TEST_EQ(val >> shift_4, val >> 4);
+
+    const auto expected_ff0 {int128_t(0, 0xFF0)}; // 0xFF00 >> 4
+    BOOST_TEST_EQ(val >> shift_4, expected_ff0);
+
+    // Test >>= with int128_t rhs
+    auto val2 {val};
+    val2 >>= shift_4;
+    BOOST_TEST_EQ(val2, expected_ff0);
+
+    // Cross-word shift: a shift by 64 must move the first word into the second
+    const auto big_val {int128_t(0x1234, 0)};
+    const auto shift_64 {int128_t(0, 64)};
+    const auto expected_1234 {int128_t(0, 0x1234)};
+    BOOST_TEST_EQ(big_val >> shift_64, expected_1234);
+
+    // Arithmetic right shift preserves sign for negative values
+    constexpr auto min_val {std::numeric_limits<int128_t>::min()};
+    const auto shift_1 {int128_t(0, 1)};
+    BOOST_TEST_EQ(min_val >> shift_1, min_val >> 1);
+    BOOST_TEST((min_val >> shift_1) < 0);
+}
+
+// Regression test for bug 2: UnsignedInteger / int128_t returned {rhs.high, res} instead of proper sign handling
+void test_unsigned_div_negative_int128()
+{
+    const std::uint64_t lhs {10}; // unsigned built-in dividend
+    const auto neg3 {-int128_t(0, 3)};
+    const auto pos3 {int128_t(0, 3)};
+    const auto expected_neg3 {-int128_t(0, 3)};
+    const auto expected_pos3 {int128_t(0, 3)};
+
+    // 10 / -3 = -3 (the quotient truncates toward zero)
+    BOOST_TEST_EQ(lhs / neg3, expected_neg3);
+
+    // 10 / 3 = 3
+    BOOST_TEST_EQ(lhs / pos3, expected_pos3);
+
+    // 7 / -1 = -7
+    const std::uint64_t seven {7};
+    const auto neg1 {-int128_t(0, 1)};
+    const auto expected_neg7 {-int128_t(0, 7)};
+    BOOST_TEST_EQ(seven / neg1, expected_neg7);
+}
+
+// Regression test for bug 3: UnsignedInteger % int128_t used rhs.low instead of
+// abs_rhs.low and applied the wrong sign to the remainder
+void test_unsigned_mod_negative_int128()
+{
+    const std::uint64_t lhs {10}; // unsigned built-in dividend
+    const auto neg3 {-int128_t(0, 3)};
+    const auto pos3 {int128_t(0, 3)};
+    const auto expected_1 {int128_t(0, 1)};
+
+    // 10 % -3 = 1 (remainder has sign of dividend, which is unsigned/positive)
+    BOOST_TEST_EQ(lhs % neg3, expected_1);
+
+    // 10 % 3 = 1
+    BOOST_TEST_EQ(lhs % pos3, expected_1);
+
+    // 12 % -5 = 2
+    const std::uint64_t twelve {12};
+    const auto neg5 {-int128_t(0, 5)};
+    const auto expected_2 {int128_t(0, 2)};
+    BOOST_TEST_EQ(twelve % neg5, expected_2);
+}
+
+// Regression test for bug 4: the early return in operator%(int128_t, int128_t) was wrong
+// when lhs = INT128_MIN, because abs(INT128_MIN) overflows back to INT128_MIN
+void test_min_val_modulo()
+{
+    constexpr auto min_val {std::numeric_limits<int128_t>::min()};
+    const auto zero {int128_t(0, 0)};
+
+    // INT128_MIN % 1 = 0
+    const auto one {int128_t(0, 1)};
+    BOOST_TEST_EQ(min_val % one, zero);
+
+    // INT128_MIN % 2 = 0 (2^127 is even)
+    const auto two {int128_t(0, 2)};
+    BOOST_TEST_EQ(min_val % two, zero);
+
+    // INT128_MIN % 3 = -2 (the remainder takes the sign of the dividend)
+    // -170141183460469231731687303715884105728 = -56713727820156410577229101238628035242 * 3 + (-2)
+    const auto three {int128_t(0, 3)};
+    const auto expected_neg2 {BOOST_INT128_INT128_C(-2)};
+    BOOST_TEST_EQ(min_val % three, expected_neg2);
+
+    // INT128_MIN % INT128_MIN = 0
+    BOOST_TEST_EQ(min_val % min_val, zero);
+}
+
+int main()
+{
+    test_div_by_one<std::int32_t>(); // minimum value divided by one, per divisor type
+    test_div_by_one<std::int64_t>();
+    test_div_by_one<int128_t>();
+
+    test_other_vals<std::int32_t>(); // minimum value divided by 2, 4 and 16, per divisor type
+    test_other_vals<std::int64_t>();
+    test_other_vals<int128_t>();
+
+    test_right_shift_int128_amount(); // bug 1
+    test_unsigned_div_negative_int128(); // bug 2
+    test_unsigned_mod_negative_int128(); // bug 3
+    test_min_val_modulo(); // bug 4
+
+    return boost::report_errors(); // the lightweight_test summary is the process exit code
+}
diff --git a/test/stopwatch.hpp b/test/stopwatch.hpp
new file mode 100644
index 00000000..9f3c60de
--- /dev/null
+++ b/test/stopwatch.hpp
@@ -0,0 +1,39 @@
+//  Copyright John Maddock 2016.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef BOOST_MATH_CUDA_STOPWATCH_HPP
+#define BOOST_MATH_CUDA_STOPWATCH_HPP
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+#include <chrono>
+
+// Minimal interval timer that starts on construction. Clock must provide now(), duration and time_point.
+template <class Clock>
+struct stopwatch
+{
+    typedef typename Clock::duration duration;
+
+    stopwatch() : m_start(Clock::now()) {}
+
+    // Seconds, as a double, since construction or the last reset(). Does not modify the timer.
+    double elapsed() const
+    {
+        const duration t = Clock::now() - m_start;
+        return std::chrono::duration_cast<std::chrono::duration<double>>(t).count();
+    }
+
+    // Restarts the measurement from the current instant.
+    void reset() { m_start = Clock::now(); }
+
+private:
+    typename Clock::time_point m_start;
+};
+
+typedef stopwatch<std::chrono::high_resolution_clock> watch;
+
+#endif
diff --git a/test/test_bit.cpp b/test/test_bit.cpp
index 743c3522..8085c628 100644
--- a/test/test_bit.cpp
+++ b/test/test_bit.cpp
@@ -2,10 +2,19 @@
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
 
+#include <boost/core/lightweight_test.hpp>
+
+#ifndef BOOST_INT128_BUILD_MODULE
+
 #include <boost/int128/int128.hpp>
 #include <boost/int128/bit.hpp>
 #include <boost/int128/iostream.hpp>
-#include <boost/core/lightweight_test.hpp>
+
+#else
+
+import boost.int128;
+
+#endif
 
 void test_has_single_bit()
 {
@@ -42,7 +51,7 @@ void test_countl_zero()
 
 void test_bit_width()
 {
-    BOOST_TEST_EQ(boost::int128::bit_width(0), 0);
+    BOOST_TEST_EQ(boost::int128::bit_width(boost::int128::uint128_t{0}), 0);
 
     boost::int128::uint128_t x {1};
 
diff --git a/test/test_bit_ceil.cu b/test/test_bit_ceil.cu
new file mode 100644
index 00000000..a660eec8
--- /dev/null
+++ b/test/test_bit_ceil.cu
@@ -0,0 +1,92 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// Each thread applies bit_ceil to the input element at its flat 1D global index.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index falls beyond the data have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::bit_ceil(in[idx]);
+}
+
+// Host driver: runs bit_ceil over random inputs on the GPU and verifies each
+// output against the host result. Returns 0 on success, EXIT_FAILURE otherwise.
+int main(void)
+{
+    std::mt19937_64 rng {42}; // Fixed seed: every run uses the same inputs.
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Limit to values where bit_ceil won't overflow (bit_width <= 127)
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, test_type{UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_MAX}};
+    for (int i = 0; i < numElements; ++i) // int index avoids a signed/unsigned comparison
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    const cudaError_t launch_err = cudaGetLastError();    // launch-time failure, if any
+    const cudaError_t sync_err = cudaDeviceSynchronize(); // waits for the kernel; reports a run-time failure
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    const cudaError_t err = (launch_err != cudaSuccess) ? launch_err : sync_err;
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results; // host reference values
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::bit_ceil(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_bit_floor.cu b/test/test_bit_floor.cu
new file mode 100644
index 00000000..171108d4
--- /dev/null
+++ b/test/test_bit_floor.cu
@@ -0,0 +1,91 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// Each thread applies bit_floor to the input element at its flat 1D global index.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index falls beyond the data have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::bit_floor(in[idx]);
+}
+
+// Host driver: runs bit_floor over random inputs on the GPU and verifies each
+// output against the host result. Returns 0 on success, EXIT_FAILURE otherwise.
+int main(void)
+{
+    std::mt19937_64 rng {42}; // Fixed seed: every run uses the same inputs.
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i) // int index avoids a signed/unsigned comparison
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    const cudaError_t launch_err = cudaGetLastError();    // launch-time failure, if any
+    const cudaError_t sync_err = cudaDeviceSynchronize(); // waits for the kernel; reports a run-time failure
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    const cudaError_t err = (launch_err != cudaSuccess) ? launch_err : sync_err;
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results; // host reference values
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::bit_floor(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_bit_width.cu b/test/test_bit_width.cu
new file mode 100644
index 00000000..d6c13c7d
--- /dev/null
+++ b/test/test_bit_width.cu
@@ -0,0 +1,91 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// Each thread stores bit_width of the input element at its flat 1D global index.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index falls beyond the data have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::bit_width(in[idx]);
+}
+
+// Host driver: runs bit_width over random inputs on the GPU and verifies each
+// output against the host result. Returns 0 on success, EXIT_FAILURE otherwise.
+int main(void)
+{
+    std::mt19937_64 rng {42}; // Fixed seed: every run uses the same inputs.
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i) // int index avoids a signed/unsigned comparison
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    const cudaError_t launch_err = cudaGetLastError();    // launch-time failure, if any
+    const cudaError_t sync_err = cudaDeviceSynchronize(); // waits for the kernel; reports a run-time failure
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    const cudaError_t err = (launch_err != cudaSuccess) ? launch_err : sync_err;
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<int> results; // host reference values
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::bit_width(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_byteswap.cu b/test/test_byteswap.cu
new file mode 100644
index 00000000..9cb83f91
--- /dev/null
+++ b/test/test_byteswap.cu
@@ -0,0 +1,91 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// Each thread applies byteswap to the input element at its flat 1D global index.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index falls beyond the data have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::byteswap(in[idx]);
+}
+
+// Host driver: runs byteswap over random inputs on the GPU and verifies each
+// output against the host result. Returns 0 on success, EXIT_FAILURE otherwise.
+int main(void)
+{
+    std::mt19937_64 rng {42}; // Fixed seed: every run uses the same inputs.
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i) // int index avoids a signed/unsigned comparison
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    const cudaError_t launch_err = cudaGetLastError();    // launch-time failure, if any
+    const cudaError_t sync_err = cudaDeviceSynchronize(); // waits for the kernel; reports a run-time failure
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    const cudaError_t err = (launch_err != cudaSuccess) ? launch_err : sync_err;
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results; // host reference values
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::byteswap(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_countl_one.cu b/test/test_countl_one.cu
new file mode 100644
index 00000000..ed76e92f
--- /dev/null
+++ b/test/test_countl_one.cu
@@ -0,0 +1,91 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// Each thread stores countl_one of the input element at its flat 1D global index.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index falls beyond the data have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::countl_one(in[idx]);
+}
+
+// Host driver: runs countl_one over random inputs on the GPU and verifies each
+// output against the host result. Returns 0 on success, EXIT_FAILURE otherwise.
+int main(void)
+{
+    std::mt19937_64 rng {42}; // Fixed seed: every run uses the same inputs.
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i) // int index avoids a signed/unsigned comparison
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    const cudaError_t launch_err = cudaGetLastError();    // launch-time failure, if any
+    const cudaError_t sync_err = cudaDeviceSynchronize(); // waits for the kernel; reports a run-time failure
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    const cudaError_t err = (launch_err != cudaSuccess) ? launch_err : sync_err;
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<int> results; // host reference values
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::countl_one(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_countl_zero.cu b/test/test_countl_zero.cu
new file mode 100644
index 00000000..829584de
--- /dev/null
+++ b/test/test_countl_zero.cu
@@ -0,0 +1,91 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// Each thread stores countl_zero of the input element at its flat 1D global index.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index falls beyond the data have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::countl_zero(in[idx]);
+}
+
+// Host driver: runs countl_zero over random inputs on the GPU and verifies each
+// output against the host result. Returns 0 on success, EXIT_FAILURE otherwise.
+int main(void)
+{
+    std::mt19937_64 rng {42}; // Fixed seed: every run uses the same inputs.
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i) // int index avoids a signed/unsigned comparison
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    const cudaError_t launch_err = cudaGetLastError();    // launch-time failure, if any
+    const cudaError_t sync_err = cudaDeviceSynchronize(); // waits for the kernel; reports a run-time failure
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    const cudaError_t err = (launch_err != cudaSuccess) ? launch_err : sync_err;
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<int> results; // host reference values
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::countl_zero(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_countr_one.cu b/test/test_countr_one.cu
new file mode 100644
index 00000000..9024cc29
--- /dev/null
+++ b/test/test_countr_one.cu
@@ -0,0 +1,91 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// Each thread stores countr_one of the input element at its flat 1D global index.
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads whose index falls beyond the data have nothing to do.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::countr_one(in[idx]);
+}
+
+// Host driver: runs countr_one over random inputs on the GPU and verifies each
+// output against the host result. Returns 0 on success, EXIT_FAILURE otherwise.
+int main(void)
+{
+    std::mt19937_64 rng {42}; // Fixed seed: every run uses the same inputs.
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i) // int index avoids a signed/unsigned comparison
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    const cudaError_t launch_err = cudaGetLastError();    // launch-time failure, if any
+    const cudaError_t sync_err = cudaDeviceSynchronize(); // waits for the kernel; reports a run-time failure
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    const cudaError_t err = (launch_err != cudaSuccess) ? launch_err : sync_err;
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<int> results; // host reference values
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::countr_one(input_vector[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+    return 0;
+}
diff --git a/test/test_countr_zero.cu b/test/test_countr_zero.cu
new file mode 100644
index 00000000..2cbf5b7d
--- /dev/null
+++ b/test/test_countr_zero.cu
@@ -0,0 +1,91 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// One thread per element: writes countr_zero(in[idx]) to out[idx].
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Threads whose index falls at or past numElements do nothing.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::countr_zero(in[idx]);
+}
+
+// Host driver: computes countr_zero for random uint128_t inputs on the device and verifies against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // Launch enough blocks to cover every element, then block until the device has finished.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Host reference: recompute every result on the CPU.
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::countr_zero(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // The first mismatch between device and host output fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_div.cpp b/test/test_div.cpp
index dd84a0ca..519d050c 100644
--- a/test/test_div.cpp
+++ b/test/test_div.cpp
@@ -2,7 +2,18 @@
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
 
-#include <boost/int128.hpp>
+#ifndef BOOST_INT128_BUILD_MODULE
+
+#include <boost/int128/int128.hpp>
+#include <boost/int128/cstdlib.hpp>
+#include <boost/int128/iostream.hpp>
+
+#else
+
+import boost.int128;
+
+#endif
+
 #include <boost/core/lightweight_test.hpp>
 #include <random>
 
@@ -48,22 +59,22 @@ void test_unsigned_div()
                 BOOST_INT128_UNREACHABLE;   // LCOV_EXCL_LINE
         }
 
-        const auto div_res {div(lhs, rhs)};
+        const auto div_res {boost::int128::div(lhs, rhs)};
         BOOST_TEST_EQ(div_res.quot, lhs / rhs);
         BOOST_TEST_EQ(div_res.rem, lhs % rhs);
 
-        const auto inv_div_res {div(rhs, lhs)};
+        const auto inv_div_res {boost::int128::div(rhs, lhs)};
         BOOST_TEST_EQ(inv_div_res.quot, rhs / lhs);
         BOOST_TEST_EQ(inv_div_res.rem, rhs % lhs);
     }
 
     uint128_t lhs {dist(rng), dist(rng)};
     uint128_t zero {dist(rng) * 0U, dist(rng) * 0U};
-    const auto lhs_num {div(lhs, zero)};
+    const auto lhs_num {boost::int128::div(lhs, zero)};
     BOOST_TEST_EQ(lhs_num.quot, 0U);
     BOOST_TEST_EQ(lhs_num.rem, 0U);
 
-    const auto lhs_denom {div(zero, lhs)};
+    const auto lhs_denom {boost::int128::div(zero, lhs)};
     BOOST_TEST_EQ(lhs_denom.quot, 0U);
     BOOST_TEST_EQ(lhs_denom.rem, 0U);
 }
@@ -98,22 +109,22 @@ void test_signed_div()
                 BOOST_INT128_UNREACHABLE;   // LCOV_EXCL_LINE
         }
 
-        const auto div_res {div(lhs, rhs)};
+        const auto div_res {boost::int128::div(lhs, rhs)};
         BOOST_TEST_EQ(div_res.quot, lhs / rhs);
         BOOST_TEST_EQ(div_res.rem, lhs % rhs);
 
-        const auto inv_div_res {div(rhs, lhs)};
+        const auto inv_div_res {boost::int128::div(rhs, lhs)};
         BOOST_TEST_EQ(inv_div_res.quot, rhs / lhs);
         BOOST_TEST_EQ(inv_div_res.rem, rhs % lhs);
     }
 
     int128_t lhs {idist(rng), dist(rng)};
     int128_t zero {idist(rng) * 0, dist(rng) * 0U};
-    const auto lhs_num {div(lhs, zero)};
+    const auto lhs_num {boost::int128::div(lhs, zero)};
     BOOST_TEST_EQ(lhs_num.quot, 0);
     BOOST_TEST_EQ(lhs_num.rem, 0);
 
-    const auto lhs_denom {div(zero, lhs)};
+    const auto lhs_denom {boost::int128::div(zero, lhs)};
     BOOST_TEST_EQ(lhs_denom.quot, 0);
     BOOST_TEST_EQ(lhs_denom.rem, 0);
 }
diff --git a/test/test_gcd_lcm.cpp b/test/test_gcd_lcm.cpp
index 4b8e1763..7d645c4c 100644
--- a/test/test_gcd_lcm.cpp
+++ b/test/test_gcd_lcm.cpp
@@ -2,7 +2,17 @@
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
 
+
+#ifndef BOOST_INT128_BUILD_MODULE
+
 #include <boost/int128.hpp>
+
+#else
+
+import boost.int128;
+
+#endif
+
 #include <boost/core/lightweight_test.hpp>
 #include <limits>
 
diff --git a/test/test_has_single_bit.cu b/test/test_has_single_bit.cu
new file mode 100644
index 00000000..fcaa8ddd
--- /dev/null
+++ b/test/test_has_single_bit.cu
@@ -0,0 +1,91 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// One thread per element: writes has_single_bit(in[idx]) to out[idx].
+__global__ void cuda_test(const test_type *in, bool *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Threads whose index falls at or past numElements do nothing.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::has_single_bit(in[idx]);
+}
+
+// Host driver: computes has_single_bit for random uint128_t inputs on the device and verifies against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // Launch enough blocks to cover every element, then block until the device has finished.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Host reference: recompute every result on the CPU.
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::has_single_bit(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // The first mismatch between device and host output fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_i128.cpp b/test/test_i128.cpp
index a898fdf8..d52c2e59 100644
--- a/test/test_i128.cpp
+++ b/test/test_i128.cpp
@@ -6,9 +6,18 @@
 #  define BOOST_INT128_ALLOW_SIGN_CONVERSION
 #endif
 
+#ifndef BOOST_INT128_BUILD_MODULE
+
 #include <boost/int128/detail/int128_imp.hpp>
 #include <boost/int128/detail/conversions.hpp>
 #include <boost/int128/iostream.hpp>
+
+#else
+
+import boost.int128;
+
+#endif
+
 #include <boost/core/lightweight_test.hpp>
 #include <boost/mp11.hpp>
 #include <cstring>
diff --git a/test/test_i128_no_sign_conv.cpp b/test/test_i128_no_sign_conv.cpp
index c5bfe7f7..ba2a289e 100644
--- a/test/test_i128_no_sign_conv.cpp
+++ b/test/test_i128_no_sign_conv.cpp
@@ -2,8 +2,18 @@
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
 
+#ifndef BOOST_INT128_BUILD_MODULE
+
 #include <boost/int128/detail/int128_imp.hpp>
 #include <boost/int128/detail/conversions.hpp>
+#include <boost/int128/iostream.hpp>
+
+#else
+
+import boost.int128;
+
+#endif
+
 #include <boost/core/lightweight_test.hpp>
 #include <boost/mp11.hpp>
 #include <cstring>
diff --git a/test/test_limits_i128.cpp b/test/test_limits_i128.cpp
index c266bdcc..95d3ffaf 100644
--- a/test/test_limits_i128.cpp
+++ b/test/test_limits_i128.cpp
@@ -2,7 +2,16 @@
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
 
+#ifndef BOOST_INT128_BUILD_MODULE
+
 #include <boost/int128/int128.hpp>
+
+#else
+
+import boost.int128;
+
+#endif
+
 #include <boost/core/lightweight_test.hpp>
 #include <limits>
 
diff --git a/test/test_limits_u128.cpp b/test/test_limits_u128.cpp
index bac9c810..1dd0ab7a 100644
--- a/test/test_limits_u128.cpp
+++ b/test/test_limits_u128.cpp
@@ -2,7 +2,16 @@
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
 
+#ifndef BOOST_INT128_BUILD_MODULE
+
 #include <boost/int128/int128.hpp>
+
+#else
+
+import boost.int128;
+
+#endif
+
 #include <boost/core/lightweight_test.hpp>
 #include <limits>
 
diff --git a/test/test_literals.cpp b/test/test_literals.cpp
index a036ab61..37c94081 100644
--- a/test/test_literals.cpp
+++ b/test/test_literals.cpp
@@ -2,9 +2,18 @@
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
 
+#ifndef BOOST_INT128_BUILD_MODULE
+
 #include <boost/int128/int128.hpp>
 #include <boost/int128/literals.hpp>
 #include <boost/int128/iostream.hpp>
+
+#else
+
+import boost.int128;
+
+#endif
+
 #include <boost/core/lightweight_test.hpp>
 #include <limits>
 
diff --git a/test/test_midpoint.cpp b/test/test_midpoint.cpp
index e1d9ffa8..1a84dbe0 100644
--- a/test/test_midpoint.cpp
+++ b/test/test_midpoint.cpp
@@ -2,7 +2,16 @@
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
 
+#ifndef BOOST_INT128_BUILD_MODULE
+
 #include <boost/int128.hpp>
+
+#else
+
+import boost.int128;
+
+#endif
+
 #include <boost/core/lightweight_test.hpp>
 #include <numeric>
 
diff --git a/test/test_mixed_type_sign_compare.cpp b/test/test_mixed_type_sign_compare.cpp
new file mode 100644
index 00000000..ebd99a88
--- /dev/null
+++ b/test/test_mixed_type_sign_compare.cpp
@@ -0,0 +1,87 @@
+// Copyright 2025 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+#define BOOST_INT128_ALLOW_SIGN_COMPARE
+
+#include <boost/int128.hpp>
+#include <boost/core/lightweight_test.hpp>
+#include <random>
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wsign-compare"
+#endif
+
+static std::mt19937_64 rng{42}; // fixed seed, shared by both tests below
+static std::uniform_int_distribution<std::uint64_t> u_dist{0, UINT64_MAX}; // full unsigned 64-bit range
+static std::uniform_int_distribution<std::int64_t> i_dist{0, INT64_MAX}; // non-negative values only
+static constexpr std::size_t N {1024U}; // iterations of each randomized loop
+
+using namespace boost::int128;
+
+void test_left_unsigned() // uint128_t on the left-hand side, int128_t on the right
+{
+    for (std::size_t i {0}; i < N; ++i)
+    {
+        const auto lhs {u_dist(rng)};
+        const auto rhs {i_dist(rng)};
+        // rhs comes from i_dist, whose range starts at 0, so casting it to uint64_t below keeps its value.
+        const uint128_t lib_lhs {lhs};
+        const int128_t lib_rhs {rhs};
+
+        BOOST_TEST_EQ(lib_lhs == lib_rhs, lhs == static_cast<std::uint64_t>(rhs));
+        BOOST_TEST_EQ(lib_lhs != lib_rhs, lhs != static_cast<std::uint64_t>(rhs));
+        BOOST_TEST_EQ(lib_lhs > lib_rhs, lhs > static_cast<std::uint64_t>(rhs));
+        BOOST_TEST_EQ(lib_lhs >= lib_rhs, lhs >= static_cast<std::uint64_t>(rhs));
+        BOOST_TEST_EQ(lib_lhs < lib_rhs, lhs < static_cast<std::uint64_t>(rhs));
+        BOOST_TEST_EQ(lib_lhs <= lib_rhs, lhs <= static_cast<std::uint64_t>(rhs));
+    }
+    // Fixed case with a negative right-hand side: the expectations treat -42 as smaller than 42u.
+    const uint128_t lhs {42u};
+    const int128_t rhs {-42};
+
+    BOOST_TEST_EQ(lhs == rhs, false);
+    BOOST_TEST_EQ(lhs != rhs, true);
+    BOOST_TEST_EQ(lhs < rhs, false);
+    BOOST_TEST_EQ(lhs <= rhs, false);
+    BOOST_TEST_EQ(lhs > rhs, true);
+    BOOST_TEST_EQ(lhs >= rhs, true);
+}
+
+void test_right_unsigned() // int128_t on the left-hand side, uint128_t on the right
+{
+    for (std::size_t i {0}; i < N; ++i)
+    {
+        const auto lhs {i_dist(rng)};
+        const auto rhs {u_dist(rng)};
+        // lhs comes from i_dist, whose range starts at 0, so casting it to uint64_t below keeps its value.
+        const int128_t lib_lhs {lhs};
+        const uint128_t lib_rhs {rhs};
+
+        BOOST_TEST_EQ(lib_lhs == lib_rhs, static_cast<std::uint64_t>(lhs) == rhs);
+        BOOST_TEST_EQ(lib_lhs != lib_rhs, static_cast<std::uint64_t>(lhs) != rhs);
+        BOOST_TEST_EQ(lib_lhs > lib_rhs, static_cast<std::uint64_t>(lhs) > rhs);
+        BOOST_TEST_EQ(lib_lhs >= lib_rhs, static_cast<std::uint64_t>(lhs) >= rhs);
+        BOOST_TEST_EQ(lib_lhs < lib_rhs, static_cast<std::uint64_t>(lhs) < rhs);
+        BOOST_TEST_EQ(lib_lhs <= lib_rhs, static_cast<std::uint64_t>(lhs) <= rhs);
+    }
+    // Fixed case with a negative left-hand side: the expectations treat -42 as smaller than 42u.
+    const int128_t lhs {-42};
+    const uint128_t rhs {42u};
+
+    BOOST_TEST_EQ(lhs == rhs, false);
+    BOOST_TEST_EQ(lhs != rhs, true);
+    BOOST_TEST_EQ(lhs < rhs, true);
+    BOOST_TEST_EQ(lhs <= rhs, true);
+    BOOST_TEST_EQ(lhs > rhs, false);
+    BOOST_TEST_EQ(lhs >= rhs, false);
+}
+
+int main()
+{
+    test_left_unsigned();
+    test_right_unsigned();
+    const int status {boost::report_errors()}; // result of the BOOST_TEST_EQ checks above
+    return status;
+}
diff --git a/test/test_mixed_type_sign_conversion.cpp b/test/test_mixed_type_sign_conversion.cpp
new file mode 100644
index 00000000..79f6106c
--- /dev/null
+++ b/test/test_mixed_type_sign_conversion.cpp
@@ -0,0 +1,62 @@
+// Copyright 2025 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <boost/int128.hpp>
+#include <boost/core/lightweight_test.hpp>
+#include <random>
+#include <cmath>
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wsign-compare"
+#  pragma GCC diagnostic ignored "-Wsign-conversion"
+#endif
+
+static std::mt19937_64 rng{42};
+static std::uniform_int_distribution<std::uint64_t> u_dist{1, static_cast<std::uint64_t>(std::sqrt(UINT64_MAX))}; // starts at 1: values are used as divisors
+static std::uniform_int_distribution<std::int64_t> i_dist{1, static_cast<std::int64_t>(std::sqrt(INT64_MAX))}; // starts at 1: values are used as divisors
+static constexpr std::size_t N {1024U};
+
+using namespace boost::int128;
+// Checks mixed uint128_t/int128_t arithmetic against plain 64-bit arithmetic on the same values.
+void test()
+{
+    for (std::size_t i {0}; i < N; ++i)
+    {
+        const auto u_val {u_dist(rng)};
+        const auto i_val {i_dist(rng)};
+        // The larger value always goes on the left, so lhs - rhs never goes below zero in the 64-bit reference.
+        if (u_val > static_cast<std::uint64_t>(i_val))
+        {
+            const uint128_t lhs {u_val};
+            const int128_t rhs {i_val};
+
+            BOOST_TEST_EQ(lhs + rhs, u_val + static_cast<std::uint64_t>(i_val));
+            BOOST_TEST_EQ(lhs - rhs, u_val - static_cast<std::uint64_t>(i_val));
+            BOOST_TEST_EQ(lhs * rhs, u_val * static_cast<std::uint64_t>(i_val));
+            BOOST_TEST_EQ(lhs / rhs, u_val / static_cast<std::uint64_t>(i_val));
+            BOOST_TEST_EQ(lhs % rhs, u_val % static_cast<std::uint64_t>(i_val));
+        }
+        else
+        {
+            const int128_t lhs {i_val};
+            const uint128_t rhs {u_val};
+
+            BOOST_TEST_EQ(lhs + rhs, static_cast<std::uint64_t>(i_val) + u_val);
+            BOOST_TEST_EQ(lhs - rhs, static_cast<std::uint64_t>(i_val) - u_val);
+            BOOST_TEST_EQ(lhs * rhs, static_cast<std::uint64_t>(i_val) * u_val);
+            BOOST_TEST_EQ(lhs / rhs, static_cast<std::uint64_t>(i_val) / u_val);
+            BOOST_TEST_EQ(lhs % rhs, static_cast<std::uint64_t>(i_val) % u_val);
+        }
+    }
+}
+
+int main()
+{
+    test();
+    const int status {boost::report_errors()}; // result of the BOOST_TEST_EQ checks above
+    return status;
+}
\ No newline at end of file
diff --git a/test/test_popcount.cu b/test/test_popcount.cu
new file mode 100644
index 00000000..5ee24511
--- /dev/null
+++ b/test/test_popcount.cu
@@ -0,0 +1,91 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// One thread per element: writes popcount(in[idx]) to out[idx].
+__global__ void cuda_test(const test_type *in, int *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Threads whose index falls at or past numElements do nothing.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::popcount(in[idx]);
+}
+
+// Host driver: computes popcount for random uint128_t inputs on the device and verifies against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // Launch enough blocks to cover every element, then block until the device has finished.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Host reference: recompute every result on the CPU.
+    std::vector<int> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::popcount(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // The first mismatch between device and host output fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_rotl.cu b/test/test_rotl.cu
new file mode 100644
index 00000000..ab3bd01b
--- /dev/null
+++ b/test/test_rotl.cu
@@ -0,0 +1,94 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// One thread per element: out[idx] = rotl(in[idx], shift[idx]).
+__global__ void cuda_test(const test_type *in, const int *shift, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Threads whose index falls at or past numElements do nothing.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::rotl(in[idx], shift[idx]);
+}
+
+// Host driver: computes rotl for random uint128_t inputs and shifts on the device and verifies against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> shift_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    std::uniform_int_distribution<int> shift_dist {0, 127};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        shift_vector[i] = shift_dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // Launch enough blocks to cover every element, then block until the device has finished.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), shift_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Host reference: recompute every result on the CPU.
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::rotl(input_vector[i], shift_vector[i]));
+    }
+    double t = w.elapsed();
+    // The first mismatch between device and host output fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_rotr.cu b/test/test_rotr.cu
new file mode 100644
index 00000000..09f2c68e
--- /dev/null
+++ b/test/test_rotr.cu
@@ -0,0 +1,94 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/bit.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+// One thread per element: out[idx] = rotr(in[idx], shift[idx]).
+__global__ void cuda_test(const test_type *in, const int *shift, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Threads whose index falls at or past numElements do nothing.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::rotr(in[idx], shift[idx]);
+}
+
+// Host driver: computes rotr for random uint128_t inputs and shifts on the device and verifies against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<int> shift_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    std::uniform_int_distribution<int> shift_dist {0, 127};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        shift_vector[i] = shift_dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // Launch enough blocks to cover every element, then block until the device has finished.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), shift_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Host reference: recompute every result on the CPU.
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::rotr(input_vector[i], shift_vector[i]));
+    }
+    double t = w.elapsed();
+    // The first mismatch between device and host output fails the test.
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_add.cu b/test/test_signed_add.cu
new file mode 100644
index 00000000..57369852
--- /dev/null
+++ b/test/test_signed_add.cu
@@ -0,0 +1,109 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
+ * Device kernel: each thread doubles one element,
+ * i.e. out[idx] = in[idx] + in[idx].
+ * Threads whose index falls at or past numElements do nothing.
+ */
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (idx >= numElements) { return; }
+
+    out[idx] = in[idx] + in[idx];
+}
+
+/**
+ * Host main routine: doubles random int128_t values on the device and verifies against the host.
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Inputs stay within [min/2, max/2] so that in[i] + in[i] cannot overflow
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() / test_type{2}, (std::numeric_limits<test_type>::max)() / test_type{2}};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the CUDA kernel with enough blocks to cover every element
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] + input_vector[i]);
+    }
+    double t = w.elapsed();
+    // Compare the device output against the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_add_sat.cu b/test/test_signed_add_sat.cu
new file mode 100644
index 00000000..45b45116
--- /dev/null
+++ b/test/test_signed_add_sat.cu
@@ -0,0 +1,106 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+// One thread per element: out[idx] = add_sat(in[idx], in2[idx]).
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Threads whose index falls at or past numElements do nothing.
+    if (idx >= numElements) { return; }
+    out[idx] = boost::int128::add_sat(in[idx], in2[idx]);
+}
+
+// Host driver: computes add_sat for random int128_t input pairs on the device and verifies against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // Launch enough blocks to cover every element, then block until the device has finished.
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Host reference: recompute every result on the CPU.
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // Count every mismatch, but print the details of only the first five.
+    int fail_count = 0;
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            if (fail_count < 5)
+            {
+                std::cerr << "Result verification failed at element " << i << std::endl;
+                std::cerr << "  input1 high: " << input_vector[i].high << " low: " << input_vector[i].low << std::endl;
+                std::cerr << "  input2 high: " << input_vector2[i].high << " low: " << input_vector2[i].low << std::endl;
+                std::cerr << "  GPU    high: " << output_vector[i].high << " low: " << output_vector[i].low << std::endl;
+                std::cerr << "  CPU    high: " << results[i].high << " low: " << results[i].low << std::endl;
+            }
+            ++fail_count;
+        }
+    }
+    if (fail_count > 0)
+    {
+        std::cerr << "Total failures: " << fail_count << " out of " << numElements << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_and.cu b/test/test_signed_and.cu
new file mode 100644
index 00000000..95fb3fec
--- /dev/null
+++ b/test/test_signed_and.cu
@@ -0,0 +1,95 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flat global index of this thread
+    if (idx >= numElements)
+    {
+        return; // index past the last element: nothing to do
+    }
+    out[idx] = in[idx] & in2[idx];
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42}; // fixed seed: same operands on every run
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Buffers are filled on the host and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // rounded up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    // A rejected launch is reported by cudaGetLastError(); a fault while the kernel runs is returned by the sync.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) { err = cudaDeviceSynchronize(); }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Host reference: the same operation evaluated on the CPU.
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] & input_vector2[i]);
+    }
+    const double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_cstdlib_div.cu b/test/test_signed_cstdlib_div.cu
new file mode 100644
index 00000000..a8445ef5
--- /dev/null
+++ b/test/test_signed_cstdlib_div.cu
@@ -0,0 +1,99 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/cstdlib.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+using result_type = boost::int128::i128div_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, result_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flat global index of this thread
+    if (idx >= numElements)
+    {
+        return; // index past the last element: nothing to do
+    }
+    out[idx] = boost::int128::div(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42}; // fixed seed: same operands on every run
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Buffers are filled on the host and handed to the kernel through get().
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<result_type> output_vector(numElements);
+
+    // Both operands span the full signed range; a zero divisor is redrawn.
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        do
+        {
+            input_vector2[i] = dist(rng);
+        } while (input_vector2[i] == 0);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // rounded up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    // A rejected launch is reported by cudaGetLastError(); a fault while the kernel runs is returned by the sync.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) { err = cudaDeviceSynchronize(); }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Host reference: the same quotient/remainder pairs evaluated on the CPU.
+    std::vector<result_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::div(input_vector[i], input_vector2[i]));
+    }
+    const double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i].quot != results[i].quot || output_vector[i].rem != results[i].rem)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_div.cu b/test/test_signed_div.cu
new file mode 100644
index 00000000..eb10a192
--- /dev/null
+++ b/test/test_signed_div.cu
@@ -0,0 +1,113 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
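+ * CUDA kernel: writes in[i] / in2[i] to out[i] for every index i below numElements.
+ * Each thread derives its own index i from its block and thread indices.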
+ */
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flat global index of this thread
+    if (idx >= numElements)
+    {
+        return; // index past the last element: nothing to do
+    }
+    out[idx] = in[idx] / in2[idx];
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42}; // fixed seed: same operands on every run
+
+    // Number of elements processed by the kernel and by the host reference
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Operands: filled on the host and handed to the kernel through get()
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Quotients written by the kernel
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Draw both operands from the full signed range, redrawing a zero divisor
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        do
+        {
+            input_vector2[i] = dist(rng);
+        } while (input_vector2[i] == 0);
+    }
+
+    // Launch the division kernel: one thread per element, block count rounded up
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    // A rejected launch is reported by cudaGetLastError(); a fault while the kernel runs is returned by the sync.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) { err = cudaDeviceSynchronize(); }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference quotients on the host
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] / input_vector2[i]);
+    }
+    const double t = w.elapsed();
+    // Compare the device results against the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << '\n'
+                      << "Got: " << output_vector[i] << "\n"
+                      << "Expected: " << results[i] << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_div_sat.cu b/test/test_signed_div_sat.cu
new file mode 100644
index 00000000..804d4dc9
--- /dev/null
+++ b/test/test_signed_div_sat.cu
@@ -0,0 +1,96 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flat global index of this thread
+    if (idx >= numElements)
+    {
+        return; // index past the last element: nothing to do
+    }
+    out[idx] = boost::int128::div_sat(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42}; // fixed seed: same operands on every run
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Both operands span the full signed range; a zero divisor is redrawn.
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        do
+        {
+            input_vector2[i] = dist(rng);
+        } while (input_vector2[i] == 0);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // rounded up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    // A rejected launch is reported by cudaGetLastError(); a fault while the kernel runs is returned by the sync.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) { err = cudaDeviceSynchronize(); }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::div_sat(input_vector[i], input_vector2[i]));
+    }
+    const double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_eq.cu b/test/test_signed_eq.cu
new file mode 100644
index 00000000..4f7156c1
--- /dev/null
+++ b/test/test_signed_eq.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
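+ * CUDA kernel: writes (in1[i] == in2[i]) to out[i] for every index i below numElements.
+ * Each thread derives its own index i from its block and thread indices.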
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flat global index of this thread
+    if (idx >= numElements)
+    {
+        return; // index past the last element: nothing to do
+    }
+    out[idx] = (in1[idx] == in2[idx]);
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Fixed seed so every run uses the same operands
+    std::mt19937_64 rng {42};
+
+    // Number of elements processed by the kernel and by the host reference
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Operands: filled on the host and handed to the kernel through get()
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // One comparison result per element, written by the kernel
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Initialize the input vectors
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Every third pair is made equal so that the equal-operands case is exercised
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel: one thread per element, block count rounded up
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    // A rejected launch is reported by cudaGetLastError(); a fault raised while the
+    // kernel runs is returned by cudaDeviceSynchronize(), so both results are checked.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) { err = cudaDeviceSynchronize(); }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] == input_vector2[i]);
+    }
+    const double t = w.elapsed();
+
+    // Compare the device results against the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_from_chars.cu b/test/test_signed_from_chars.cu
new file mode 100644
index 00000000..66d67eb0
--- /dev/null
+++ b/test/test_signed_from_chars.cu
@@ -0,0 +1,112 @@
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/int128/charconv.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+constexpr int BUF_SIZE = 64;
+
+__global__ void cuda_test(const char *in_strings, const int *in_lengths, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flat global index of this thread
+    if (idx >= numElements)
+    {
+        return; // index past the last element: nothing to do
+    }
+    const char* const first = in_strings + idx * BUF_SIZE; // string idx starts at byte idx * BUF_SIZE
+    test_type parsed {};
+    boost::charconv::from_chars(first, first + in_lengths[idx], parsed);
+    out[idx] = parsed;
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Fixed seed so every run parses the same strings
+    std::mt19937_64 rng {42};
+
+    // Number of strings parsed by the kernel and by the host reference
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Inputs: one BUF_SIZE-byte slot per string, plus the number of characters used in each slot
+    cuda_managed_ptr<char> input_strings(numElements * BUF_SIZE);
+    cuda_managed_ptr<int> input_lengths(numElements);
+
+    // Values parsed by the kernel
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Format random values into the slots; expected[i] keeps the value behind string i
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    std::vector<test_type> expected(numElements);
+    for (int i = 0; i < numElements; ++i)
+    {
+        expected[i] = dist(rng);
+        char* buf = &input_strings[i * BUF_SIZE];
+        auto res = boost::charconv::to_chars(buf, buf + BUF_SIZE, expected[i]);
+        input_lengths[i] = static_cast<int>(res.ptr - buf);
+    }
+
+    // Launch the parsing kernel: one thread per string, block count rounded up
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_strings.get(), input_lengths.get(), output_vector.get(), numElements);
+    // A rejected launch is reported by cudaGetLastError(); a fault raised while the
+    // kernel runs is returned by cudaDeviceSynchronize(), so both results are checked.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) { err = cudaDeviceSynchronize(); }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify: the host parse must agree with the device parse, and both must give back the
+    // generating value (otherwise a parse failing identically on both sides would pass)
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        test_type cpu_val {};
+        const char* str = &input_strings[i * BUF_SIZE];
+        boost::charconv::from_chars(str, str + input_lengths[i], cpu_val);
+
+        if (output_vector[i] != cpu_val || cpu_val != expected[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    const double t = w.elapsed();
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_from_chars_bases.cu b/test/test_signed_from_chars_bases.cu
new file mode 100644
index 00000000..69b175fb
--- /dev/null
+++ b/test/test_signed_from_chars_bases.cu
@@ -0,0 +1,125 @@
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/int128/charconv.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+constexpr int BUF_SIZE = 192;
+
+__global__ void cuda_test(const char *in_strings, const int *in_lengths, test_type *out, int numElements, int base)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flat global index of this thread
+    if (idx >= numElements)
+    {
+        return; // index past the last element: nothing to do
+    }
+    const char* const first = in_strings + idx * BUF_SIZE; // string idx starts at byte idx * BUF_SIZE
+    test_type parsed {};
+    boost::charconv::from_chars(first, first + in_lengths[idx], parsed, base);
+    out[idx] = parsed;
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Fixed seed so every run parses the same strings
+    std::mt19937_64 rng {42};
+
+    // Number of strings parsed per base by the kernel and by the host reference
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Inputs: one BUF_SIZE-byte slot per string, plus the number of characters used in each slot
+    cuda_managed_ptr<char> input_strings(numElements * BUF_SIZE);
+    cuda_managed_ptr<int> input_lengths(numElements);
+
+    // Values parsed by the kernel
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // rounded up
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    std::vector<test_type> expected(numElements);
+
+    for (int base = 2; base <= 36; ++base)
+    {
+        // Format fresh random values in this base; expected[i] keeps the value behind string i
+        for (int i = 0; i < numElements; ++i)
+        {
+            expected[i] = dist(rng);
+            char* buf = &input_strings[i * BUF_SIZE];
+            auto res = boost::charconv::to_chars(buf, buf + BUF_SIZE, expected[i], base);
+            input_lengths[i] = static_cast<int>(res.ptr - buf);
+        }
+
+        // Launch the CUDA Kernel
+        std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads, base " << base << std::endl;
+
+        watch w;
+
+        cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_strings.get(), input_lengths.get(), output_vector.get(), numElements, base);
+        // A rejected launch is reported by cudaGetLastError(); a fault raised while the
+        // kernel runs is returned by cudaDeviceSynchronize(), so both results are checked.
+        cudaError_t err = cudaGetLastError();
+        if (err == cudaSuccess) { err = cudaDeviceSynchronize(); }
+
+        std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+        if (err != cudaSuccess)
+        {
+            std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        // Parse the same strings on the host as the reference
+        std::vector<test_type> results;
+        results.reserve(numElements);
+        w.reset();
+        for (int i = 0; i < numElements; ++i)
+        {
+            test_type val {};
+            const char* str = &input_strings[i * BUF_SIZE];
+            boost::charconv::from_chars(str, str + input_lengths[i], val, base);
+            results.push_back(val);
+        }
+        const double t = w.elapsed();
+        // Device and host must agree, and both must give back the generating value
+        // (otherwise a parse failing identically on both sides would pass)
+        for (int i = 0; i < numElements; ++i)
+        {
+            if (output_vector[i] != results[i] || results[i] != expected[i])
+            {
+                std::cerr << "Result verification failed at element " << i << " base " << base << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
+        }
+
+        std::cout << "Test base " << base << " PASSED, normal calculation time: " << t << "s" << std::endl;
+    }
+
+    std::cout << "All bases PASSED" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_gcd.cu b/test/test_signed_gcd.cu
new file mode 100644
index 00000000..7d5c8434
--- /dev/null
+++ b/test/test_signed_gcd.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flat global index of this thread
+    if (idx >= numElements)
+    {
+        return; // index past the last element: nothing to do
+    }
+    out[idx] = boost::int128::gcd(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42}; // fixed seed: same operands on every run
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // rounded up
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    // A rejected launch is reported by cudaGetLastError(); a fault while the kernel runs is returned by the sync.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) { err = cudaDeviceSynchronize(); }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Host reference: the same operation evaluated on the CPU.
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::gcd(input_vector[i], input_vector2[i]));
+    }
+    const double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_ge.cu b/test/test_signed_ge.cu
new file mode 100644
index 00000000..efe510ea
--- /dev/null
+++ b/test/test_signed_ge.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
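+ * CUDA kernel: writes (in1[i] >= in2[i]) to out[i] for every index i below numElements.
+ * Each thread derives its own index i from its block and thread indices.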
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // flat global index of this thread
+    if (idx >= numElements)
+    {
+        return; // index past the last element: nothing to do
+    }
+    out[idx] = (in1[idx] >= in2[idx]);
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Fixed seed so every run uses the same operands
+    std::mt19937_64 rng {42};
+
+    // Number of elements processed by the kernel and by the host reference
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Operands: filled on the host and handed to the kernel through get()
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // One comparison result per element, written by the kernel
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Initialize the input vectors
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Every third pair is made equal so that the equal-operands case is exercised
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel: one thread per element, block count rounded up
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    // A rejected launch is reported by cudaGetLastError(); a fault raised while the
+    // kernel runs is returned by cudaDeviceSynchronize(), so both results are checked.
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) { err = cudaDeviceSynchronize(); }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] >= input_vector2[i]);
+    }
+    const double t = w.elapsed();
+
+    // Compare the device results against the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_gt.cu b/test/test_signed_gt.cu
new file mode 100644
index 00000000..820b0797
--- /dev/null
+++ b/test/test_signed_gt.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
+ * CUDA kernel: element-wise "greater than" on int128_t.
+ * Thread idx stores (in1[idx] > in2[idx]) in out[idx]; threads with idx >= numElements do nothing.
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return;
+    }
+    out[idx] = (in1[idx] > in2[idx]);
+}
+
+// Host driver: compares operator> for int128_t on the GPU against the host for 50000 random operand pairs.
+// Every third pair is made equal so that the comparison is also exercised on equal operands.
+// Returns EXIT_FAILURE on a CUDA error or on the first mismatching element, 0 otherwise.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Initialize the input vectors; the index is an int like numElements to avoid a signed/unsigned comparison
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Make some elements equal to test both true and false cases
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel: one thread per element, block count rounded up
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    // Pick up any error recorded by the launch or the synchronization above
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host (timed)
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] > input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Check the device results against the host results
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_lcm.cu b/test/test_signed_lcm.cu
new file mode 100644
index 00000000..bafe559d
--- /dev/null
+++ b/test/test_signed_lcm.cu
@@ -0,0 +1,94 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    // One element per thread: out[idx] = lcm(in[idx], in2[idx]).
+    // Threads whose index is not below numElements return without touching memory.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    out[idx] = boost::int128::lcm(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen {42};
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Managed buffers: two operand arrays plus one array for the device results
+    cuda_managed_ptr<test_type> lhs(numElements);
+    cuda_managed_ptr<test_type> rhs(numElements);
+    cuda_managed_ptr<test_type> device_out(numElements);
+
+    // Keep the operands small to avoid overflow in the lcm computation
+    boost::random::uniform_int_distribution<test_type> operand_dist {test_type{-1, UINT64_MAX}, test_type{0, UINT64_MAX}};
+    for (std::size_t k = 0; k < numElements; ++k)
+    {
+        lhs[k] = operand_dist(gen);
+        rhs[k] = operand_dist(gen);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(lhs.get(), rhs.get(), device_out.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << timer.elapsed() << "s" << std::endl;
+
+    const cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(status) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Host reference: recompute every lcm on the CPU and time it
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int k = 0; k < numElements; ++k)
+    {
+        expected.push_back(boost::int128::lcm(lhs[k], rhs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Stop at the first element where device and host disagree
+    for (int k = 0; k < numElements; ++k)
+    {
+        if (device_out[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_le.cu b/test/test_signed_le.cu
new file mode 100644
index 00000000..d2d67ce6
--- /dev/null
+++ b/test/test_signed_le.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
+ * CUDA kernel: element-wise "less than or equal" on int128_t.
+ * Thread idx stores (in1[idx] <= in2[idx]) in out[idx]; threads with idx >= numElements do nothing.
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return;
+    }
+    out[idx] = (in1[idx] <= in2[idx]);
+}
+
+// Host driver: compares operator<= for int128_t on the GPU against the host for 50000 random operand pairs.
+// Every third pair is made equal so that the comparison is also exercised on equal operands.
+// Returns EXIT_FAILURE on a CUDA error or on the first mismatching element, 0 otherwise.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Initialize the input vectors; the index is an int like numElements to avoid a signed/unsigned comparison
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Make some elements equal to test both true and false cases
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel: one thread per element, block count rounded up
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    // Pick up any error recorded by the launch or the synchronization above
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host (timed)
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] <= input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Check the device results against the host results
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_left_shift.cu b/test/test_signed_left_shift.cu
new file mode 100644
index 00000000..89cf0a67
--- /dev/null
+++ b/test/test_signed_left_shift.cu
@@ -0,0 +1,97 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const unsigned *shift, test_type *out, int numElements)
+{
+    // One element per thread: out[idx] = in[idx] << shift[idx].
+    // Threads whose index is not below numElements return without touching memory.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    out[idx] = in[idx] << shift[idx];
+}
+
+int main(void)
+{
+    std::mt19937_64 gen {42};
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Managed buffers: values to shift, per-element shift counts, and the device results
+    cuda_managed_ptr<test_type> values(numElements);
+    cuda_managed_ptr<unsigned> shifts(numElements);
+    cuda_managed_ptr<test_type> device_out(numElements);
+
+    // Only non-negative values are generated, to avoid UB with signed left shift of negative values
+    boost::random::uniform_int_distribution<test_type> value_dist {test_type{0}, (std::numeric_limits<test_type>::max)()};
+    std::uniform_int_distribution<unsigned> count_dist {0U, 127U};
+    for (std::size_t k = 0; k < numElements; ++k)
+    {
+        values[k] = value_dist(gen);
+        shifts[k] = count_dist(gen);
+    }
+
+    // Enough 256-thread blocks (rounded up) to give every element its own thread
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(values.get(), shifts.get(), device_out.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << timer.elapsed() << "s" << std::endl;
+
+    const cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(status) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Host reference: repeat every shift on the CPU and time it
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int k = 0; k < numElements; ++k)
+    {
+        expected.push_back(values[k] << shifts[k]);
+    }
+    const double host_seconds = timer.elapsed();
+
+    // Stop at the first element where device and host disagree
+    for (int k = 0; k < numElements; ++k)
+    {
+        if (device_out[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at element " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_literals.cu b/test/test_signed_literals.cu
new file mode 100644
index 00000000..8723b06c
--- /dev/null
+++ b/test/test_signed_literals.cu
@@ -0,0 +1,161 @@
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <boost/int128.hpp>
+#include <boost/int128/literals.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using boost::int128::int128_t;
+using namespace boost::int128::literals;
+
+// Number of test cases: we test each literal operator with several values
+// Operators:
+//   1. operator""_i128(const char*)              - raw literal, decimal
+//   2. operator""_I128(const char*)              - raw literal, decimal
+//   3. operator""_i128(const char*, size_t)      - cooked string literal
+//   4. operator""_I128(const char*, size_t)      - cooked string literal
+//   5. operator""_i128(unsigned long long)        - integer literal
+//   6. operator""_I128(unsigned long long)        - integer literal
+
+constexpr int NUM_TESTS = 30;
+
+__global__ void cuda_test(int128_t *out)
+{
+    // Each thread evaluates exactly one literal, selected by its threadIdx.x, and stores it in the
+    // slot of out with the same index; threads whose index matches no case write nothing.
+    const unsigned test_case = threadIdx.x;
+    switch (test_case)
+    {
+        // Cases 0-4: meant for operator""_i128(const char*) (values must fit unsigned long long to avoid NVCC warnings)
+        case 0: out[test_case] = 0_i128; break;
+        case 1: out[test_case] = 1_i128; break;
+        case 2: out[test_case] = 18446744073709551615_i128; break;
+        case 3: out[test_case] = 999999999999999999_i128; break;
+        case 4: out[test_case] = 42_i128; break;
+        // Cases 5-9: meant for operator""_I128(const char*) (values must fit unsigned long long to avoid NVCC warnings)
+        case 5: out[test_case] = 0_I128; break;
+        case 6: out[test_case] = 1_I128; break;
+        case 7: out[test_case] = 18446744073709551615_I128; break;
+        case 8: out[test_case] = 999999999999999999_I128; break;
+        case 9: out[test_case] = 42_I128; break;
+        // Cases 10-14: string literals with the _i128 suffix (supports negative)
+        case 10: out[test_case] = "0"_i128; break;
+        case 11: out[test_case] = "1"_i128; break;
+        case 12: out[test_case] = "170141183460469231731687303715884105727"_i128; break;
+        case 13: out[test_case] = "-1"_i128; break;
+        case 14: out[test_case] = "-170141183460469231731687303715884105727"_i128; break;
+        // Cases 15-19: string literals with the _I128 suffix (supports negative)
+        case 15: out[test_case] = "0"_I128; break;
+        case 16: out[test_case] = "1"_I128; break;
+        case 17: out[test_case] = "170141183460469231731687303715884105727"_I128; break;
+        case 18: out[test_case] = "-1"_I128; break;
+        case 19: out[test_case] = "-170141183460469231731687303715884105727"_I128; break;
+        // Cases 20-24: meant for operator""_i128(unsigned long long); NOTE(review): spelled like cases 0-4, confirm the overload
+        case 20: out[test_case] = 0_i128; break;
+        case 21: out[test_case] = 1_i128; break;
+        case 22: out[test_case] = 18446744073709551615_i128; break;
+        case 23: out[test_case] = 42_i128; break;
+        case 24: out[test_case] = 100_i128; break;
+        // Cases 25-29: meant for operator""_I128(unsigned long long); NOTE(review): spelled like cases 5-9, confirm the overload
+        case 25: out[test_case] = 0_I128; break;
+        case 26: out[test_case] = 1_I128; break;
+        case 27: out[test_case] = 18446744073709551615_I128; break;
+        case 28: out[test_case] = 42_I128; break;
+        case 29: out[test_case] = 100_I128; break;
+        default: break;
+    }
+}
+
+int main(void)
+{
+    std::cout << "[Signed literal tests: " << NUM_TESTS << " cases]" << std::endl;
+
+    // One managed slot per test case, filled by the kernel
+    cuda_managed_ptr<int128_t> device_values(NUM_TESTS);
+
+    // A single block of NUM_TESTS threads covers every case
+    watch timer;
+
+    cuda_test<<<1, NUM_TESTS>>>(device_values.get());
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << timer.elapsed() << "s" << std::endl;
+
+    const cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(status) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Host reference: the same literals evaluated on the CPU, listed in test-case order
+    const int128_t expected[NUM_TESTS] =
+    {
+        // Cases 0-4: meant for operator""_i128(const char*) (values must fit unsigned long long to avoid NVCC warnings)
+        0_i128,
+        1_i128,
+        18446744073709551615_i128,
+        999999999999999999_i128,
+        42_i128,
+
+        // Cases 5-9: meant for operator""_I128(const char*) (values must fit unsigned long long to avoid NVCC warnings)
+        0_I128,
+        1_I128,
+        18446744073709551615_I128,
+        999999999999999999_I128,
+        42_I128,
+
+        // Cases 10-14: string literals with the _i128 suffix (supports negative)
+        "0"_i128,
+        "1"_i128,
+        "170141183460469231731687303715884105727"_i128,
+        "-1"_i128,
+        "-170141183460469231731687303715884105727"_i128,
+
+        // Cases 15-19: string literals with the _I128 suffix (supports negative)
+        "0"_I128,
+        "1"_I128,
+        "170141183460469231731687303715884105727"_I128,
+        "-1"_I128,
+        "-170141183460469231731687303715884105727"_I128,
+
+        // Cases 20-24: meant for operator""_i128(unsigned long long); NOTE(review): spelled like cases 0-4, confirm the overload
+        0_i128,
+        1_i128,
+        18446744073709551615_i128,
+        42_i128,
+        100_i128,
+
+        // Cases 25-29: meant for operator""_I128(unsigned long long); NOTE(review): spelled like cases 5-9, confirm the overload
+        0_I128,
+        1_I128,
+        18446744073709551615_I128,
+        42_I128,
+        100_I128
+    };
+
+    // Compare slot by slot and stop at the first difference
+    for (int k = 0; k < NUM_TESTS; ++k)
+    {
+        if (device_values[k] != expected[k])
+        {
+            std::cerr << "Result verification failed at test case " << k << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_lt.cu b/test/test_signed_lt.cu
new file mode 100644
index 00000000..c4094c4d
--- /dev/null
+++ b/test/test_signed_lt.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
+ * CUDA kernel: element-wise "less than" on int128_t.
+ * Thread idx stores (in1[idx] < in2[idx]) in out[idx]; threads with idx >= numElements do nothing.
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return;
+    }
+    out[idx] = (in1[idx] < in2[idx]);
+}
+
+// Host driver: compares operator< for int128_t on the GPU against the host for 50000 random operand pairs.
+// Every third pair is made equal so that the comparison is also exercised on equal operands.
+// Returns EXIT_FAILURE on a CUDA error or on the first mismatching element, 0 otherwise.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Initialize the input vectors; the index is an int like numElements to avoid a signed/unsigned comparison
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Make some elements equal to test both true and false cases
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel: one thread per element, block count rounded up
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    // Pick up any error recorded by the launch or the synchronization above
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host (timed)
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] < input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Check the device results against the host results
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_midpoint.cu b/test/test_signed_midpoint.cu
new file mode 100644
index 00000000..5ee28d71
--- /dev/null
+++ b/test/test_signed_midpoint.cu
@@ -0,0 +1,106 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    // One element per thread: out[idx] = midpoint(in[idx], in2[idx]).
+    // Threads whose index is not below numElements return without touching memory.
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    out[idx] = boost::int128::midpoint(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    std::mt19937_64 gen {42};
+
+    const int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Managed buffers: two operand arrays plus one array for the device results
+    cuda_managed_ptr<test_type> lhs(numElements);
+    cuda_managed_ptr<test_type> rhs(numElements);
+    cuda_managed_ptr<test_type> device_out(numElements);
+
+    // Operands are drawn from the whole range between numeric_limits min and max
+    boost::random::uniform_int_distribution<test_type> operand_dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t k = 0; k < numElements; ++k)
+    {
+        lhs[k] = operand_dist(gen);
+        rhs[k] = operand_dist(gen);
+    }
+
+    const int threadsPerBlock = 256;
+    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch timer;
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(lhs.get(), rhs.get(), device_out.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << timer.elapsed() << "s" << std::endl;
+
+    const cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(status) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Host reference: recompute every midpoint on the CPU and time it
+    std::vector<test_type> expected;
+    expected.reserve(numElements);
+    timer.reset();
+    for (int k = 0; k < numElements; ++k)
+    {
+        expected.push_back(boost::int128::midpoint(lhs[k], rhs[k]));
+    }
+    const double host_seconds = timer.elapsed();
+
+    int mismatches = 0;
+    for (int k = 0; k < numElements; ++k)
+    {
+        if (device_out[k] != expected[k])
+        {
+            // Details are printed for the first five mismatches only; every mismatch is counted
+            if (mismatches++ < 5)
+            {
+                std::cerr << "Result verification failed at element " << k << std::endl;
+                std::cerr << "  input1 high: " << lhs[k].high << " low: " << lhs[k].low << std::endl;
+                std::cerr << "  input2 high: " << rhs[k].high << " low: " << rhs[k].low << std::endl;
+                std::cerr << "  GPU    high: " << device_out[k].high << " low: " << device_out[k].low << std::endl;
+                std::cerr << "  CPU    high: " << expected[k].high << " low: " << expected[k].low << std::endl;
+            }
+        }
+    }
+    if (mismatches > 0)
+    {
+        std::cerr << "Total failures: " << mismatches << " out of " << numElements << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << host_seconds << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_mod.cu b/test/test_signed_mod.cu
new file mode 100644
index 00000000..cbda3580
--- /dev/null
+++ b/test/test_signed_mod.cu
@@ -0,0 +1,113 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
+ * CUDA kernel: element-wise remainder on int128_t.
+ * Thread idx stores in[idx] % in2[idx] in out[idx]; threads with idx >= numElements do nothing.
+ */
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return;
+    }
+    out[idx] = in[idx] % in2[idx];
+}
+
+// Host driver: compares operator% for int128_t on the GPU against the host for 50000 random operand pairs.
+// Returns EXIT_FAILURE on a CUDA error or on the first mismatching element (printing both values), 0 otherwise.
+// NOTE(review): divisors are drawn from the full int128_t range, which includes 0; the draws are not guarded.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors (dividends and divisors)
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Initialize the input vectors; the index is an int like numElements to avoid a signed/unsigned comparison
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    // Launch the remainder kernel: one thread per element, block count rounded up
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    // Pick up any error recorded by the launch or the synchronization above
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host (timed)
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] % input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Check the device results against the host results
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << '\n'
+                      << "Got: " << output_vector[i] << "\n"
+                      << "Expected: " << results[i] << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_mul.cu b/test/test_signed_mul.cu
new file mode 100644
index 00000000..1c9a12fd
--- /dev/null
+++ b/test/test_signed_mul.cu
@@ -0,0 +1,109 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
+ * Device kernel: thread t stores in[t] * in[t] (its input multiplied by itself) in out[t].
+ * Threads whose flat index is at or beyond numElements return without any memory access.
+ */
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements)
+    {
+        return;
+    }
+    out[tid] = in[tid] * in[tid];
+}
+
+/**
+ * Host driver: squares random int128_t values on the GPU and checks them against the host.
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Status of the kernel launch and execution, examined once the device has synchronized
+    cudaError_t err = cudaSuccess;
+
+    // Number of elements to process
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Input buffer: indexed directly on the host, handed to the kernel via get()
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Output buffer written by the kernel
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Fixed-seed inputs ranging from int64 min + 1 up to int128 max - 1
+    boost::random::uniform_int_distribution<test_type> dist {test_type{(std::numeric_limits<std::int64_t>::min)()} + 1, test_type{(std::numeric_limits<test_type>::max)()} - 1};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // One thread per element; the block count is rounded up to cover the tail
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // cudaGetLastError() reports a rejected launch; cudaDeviceSynchronize() reports faults raised while the kernel ran
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    err = (err != cudaSuccess) ? err : sync_err;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA kernel failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Build the reference results on the host (timed for comparison)
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] * input_vector[i]);
+    }
+    double t = w.elapsed();
+    // Every device result must equal its host counterpart
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_mul_sat.cu b/test/test_signed_mul_sat.cu
new file mode 100644
index 00000000..569e583c
--- /dev/null
+++ b/test/test_signed_mul_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+// Device kernel: thread t stores boost::int128::mul_sat(in[t], in2[t]) in out[t].
+// Threads whose flat index is at or beyond numElements return immediately.
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; }
+
+    out[tid] = boost::int128::mul_sat(in[tid], in2[tid]);
+}
+
+// Host driver: runs mul_sat on random int128_t pairs on the GPU and checks every result against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // cudaGetLastError() reports a rejected launch; cudaDeviceSynchronize() reports faults raised while the kernel ran
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    err = (err != cudaSuccess) ? err : sync_err;
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA kernel failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::mul_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_ne.cu b/test/test_signed_ne.cu
new file mode 100644
index 00000000..6c34a111
--- /dev/null
+++ b/test/test_signed_ne.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
+ * Device kernel: thread t stores the result of (in1[t] != in2[t]) in out[t].
+ * Threads whose flat index is at or beyond numElements return without any memory access.
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements)
+    {
+        return;
+    }
+    out[tid] = (in1[tid] != in2[tid]);
+}
+
+/**
+ * Host driver: evaluates operator!= on int128_t pairs on the GPU and checks every flag against the host.
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Status of the kernel launch and execution, examined once the device has synchronized
+    cudaError_t err = cudaSuccess;
+
+    // Number of element pairs to compare
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Input buffers: indexed directly on the host, handed to the kernel via get()
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // One boolean result per pair, written by the kernel
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Fixed-seed inputs drawn from [min + 1, max - 1]
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Every third pair is made equal so both outcomes of != are exercised
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // One thread per element; the block count is rounded up to cover the tail
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // cudaGetLastError() reports a rejected launch; cudaDeviceSynchronize() reports faults raised while the kernel ran
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    err = (err != cudaSuccess) ? err : sync_err;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA kernel failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Build the reference results on the host (timed for comparison)
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] != input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Every device result must equal its host counterpart
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_not.cu b/test/test_signed_not.cu
new file mode 100644
index 00000000..5dc285c6
--- /dev/null
+++ b/test/test_signed_not.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+// Device kernel: thread t stores the bitwise complement ~in[t] in out[t].
+// Threads whose flat index is at or beyond numElements return immediately.
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; }
+
+    out[tid] = ~in[tid];
+}
+
+// Host driver: applies operator~ to random int128_t values on the GPU and checks every result against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // cudaGetLastError() reports a rejected launch; cudaDeviceSynchronize() reports faults raised while the kernel ran
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    err = (err != cudaSuccess) ? err : sync_err;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA kernel failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(~input_vector[i]);
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_or.cu b/test/test_signed_or.cu
new file mode 100644
index 00000000..7bcf7a6e
--- /dev/null
+++ b/test/test_signed_or.cu
@@ -0,0 +1,95 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+// Device kernel: thread t stores the bitwise OR in[t] | in2[t] in out[t].
+// Threads whose flat index is at or beyond numElements return immediately.
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; }
+
+    out[tid] = in[tid] | in2[tid];
+}
+
+// Host driver: applies operator| to random int128_t pairs on the GPU and checks every result against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // cudaGetLastError() reports a rejected launch; cudaDeviceSynchronize() reports faults raised while the kernel ran
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    err = (err != cudaSuccess) ? err : sync_err;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA kernel failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] | input_vector2[i]);
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_right_shift.cu b/test/test_signed_right_shift.cu
new file mode 100644
index 00000000..c606ddec
--- /dev/null
+++ b/test/test_signed_right_shift.cu
@@ -0,0 +1,98 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+// Device kernel: thread t stores in[t] >> shift[t] in out[t].
+// Threads whose flat index is at or beyond numElements return immediately.
+__global__ void cuda_test(const test_type *in, const unsigned *shift, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; }
+
+    out[tid] = in[tid] >> shift[tid];
+}
+
+// Host driver: right-shifts random int128_t values on the GPU and checks every result against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<unsigned> shift_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Inputs span the full signed range, negative values included;
+    // shift counts are drawn from [0, 127]
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    std::uniform_int_distribution<unsigned> shift_dist {0U, 127U};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        shift_vector[i] = shift_dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // cudaGetLastError() reports a rejected launch; cudaDeviceSynchronize() reports faults raised while the kernel ran
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), shift_vector.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    err = (err != cudaSuccess) ? err : sync_err;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA kernel failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] >> shift_vector[i]);
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_sub.cu b/test/test_signed_sub.cu
new file mode 100644
index 00000000..dd48db3a
--- /dev/null
+++ b/test/test_signed_sub.cu
@@ -0,0 +1,109 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+/**
+ * Device kernel: thread t stores in[t] - in[t] in out[t]; threads at or beyond numElements return.
+ * NOTE(review): both operands are the same element, so only x - x is exercised - confirm intent.
+ */
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements)
+    {
+        return;
+    }
+    out[tid] = in[tid] - in[tid];
+}
+
+/**
+ * Host driver: subtracts int128_t values on the GPU and checks every result against the host.
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Status of the kernel launch and execution, examined once the device has synchronized
+    cudaError_t err = cudaSuccess;
+
+    // Number of elements to process
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Input buffer: indexed directly on the host, handed to the kernel via get()
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Output buffer written by the kernel
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Fixed-seed inputs drawn from [min / 2, max / 2]
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() / test_type{2}, (std::numeric_limits<test_type>::max)() / test_type{2}};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // One thread per element; the block count is rounded up to cover the tail
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // cudaGetLastError() reports a rejected launch; cudaDeviceSynchronize() reports faults raised while the kernel ran
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    err = (err != cudaSuccess) ? err : sync_err;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA kernel failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Build the reference results on the host (timed for comparison)
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] - input_vector[i]);
+    }
+    double t = w.elapsed();
+    // Every device result must equal its host counterpart
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_sub_sat.cu b/test/test_signed_sub_sat.cu
new file mode 100644
index 00000000..7dd40f30
--- /dev/null
+++ b/test/test_signed_sub_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+// Device kernel: thread t stores boost::int128::sub_sat(in[t], in2[t]) in out[t].
+// Threads whose flat index is at or beyond numElements return immediately.
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; }
+
+    out[tid] = boost::int128::sub_sat(in[tid], in2[tid]);
+}
+
+// Host driver: runs sub_sat on random int128_t pairs on the GPU and checks every result against the host.
+int main(void)
+{
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // cudaGetLastError() reports a rejected launch; cudaDeviceSynchronize() reports faults raised while the kernel ran
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    err = (err != cudaSuccess) ? err : sync_err;
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA kernel failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::sub_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_to_chars.cu b/test/test_signed_to_chars.cu
new file mode 100644
index 00000000..20a6a944
--- /dev/null
+++ b/test/test_signed_to_chars.cu
@@ -0,0 +1,110 @@
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <cstring>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/int128/charconv.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+constexpr int BUF_SIZE = 64;
+
+// Device kernel: thread t formats in[t] with to_chars into its own BUF_SIZE-byte slice of
+// out_strings and records the number of characters written in out_lengths[t].
+__global__ void cuda_test(const test_type *in, char *out_strings, int *out_lengths, int numElements)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= numElements) { return; }
+
+    char* const slot = out_strings + tid * BUF_SIZE;
+    const auto written = boost::charconv::to_chars(slot, slot + BUF_SIZE, in[tid]);
+    out_lengths[tid] = static_cast<int>(written.ptr - slot);
+}
+
+/**
+ * Host driver: formats random int128_t values with to_chars on the GPU and compares them with the host output.
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Status of the kernel launch and execution, examined once the device has synchronized
+    cudaError_t err = cudaSuccess;
+
+    // Number of values to format
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Input buffer: indexed directly on the host, handed to the kernel via get()
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Each element gets a BUF_SIZE-byte slice of output_strings plus one length entry
+    cuda_managed_ptr<char> output_strings(numElements * BUF_SIZE);
+    cuda_managed_ptr<int> output_lengths(numElements);
+
+    // Fixed-seed inputs covering the full range of test_type
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // One thread per element; the block count is rounded up to cover the tail
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+    // cudaGetLastError() reports a rejected launch; cudaDeviceSynchronize() reports faults raised while the kernel ran
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_strings.get(), output_lengths.get(), numElements);
+    err = cudaGetLastError();
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    err = (err != cudaSuccess) ? err : sync_err;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "CUDA kernel failed (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Format each value on the host and compare length and characters with the device output
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        char cpu_buf[BUF_SIZE];
+        auto cpu_res = boost::charconv::to_chars(cpu_buf, cpu_buf + BUF_SIZE, input_vector[i]);
+        int cpu_len = static_cast<int>(cpu_res.ptr - cpu_buf);
+        int gpu_len = output_lengths[i];
+        const char* gpu_buf = &output_strings[i * BUF_SIZE];
+
+        if (cpu_len != gpu_len || std::memcmp(cpu_buf, gpu_buf, static_cast<std::size_t>(cpu_len)) != 0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    double t = w.elapsed();
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_to_chars_bases.cu b/test/test_signed_to_chars_bases.cu
new file mode 100644
index 00000000..15733649
--- /dev/null
+++ b/test/test_signed_to_chars_bases.cu
@@ -0,0 +1,117 @@
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <cstring>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/int128/charconv.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+constexpr int BUF_SIZE = 192;
+
+__global__ void cuda_test(const test_type *in, char *out_strings, int *out_lengths, int numElements, int base)
+{
+    // Each thread formats one input value into its own BUF_SIZE-byte slot of out_strings.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    char* const first = out_strings + idx * BUF_SIZE;
+    char* const last = first + BUF_SIZE;
+    const auto conv = boost::charconv::to_chars(first, last, in[idx], base);
+    out_lengths[idx] = static_cast<int>(conv.ptr - first); // distance from slot start to the returned end pointer
+}
+
+/**
+ * Host main routine: for each base in [2, 36], format random values on the GPU and check them against the host
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42}; // fixed seed keeps the inputs reproducible
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Allocate the managed output vectors: one BUF_SIZE-byte slot and one length per element
+    cuda_managed_ptr<char> output_strings(numElements * BUF_SIZE);
+    cuda_managed_ptr<int> output_lengths(numElements);
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+
+    for (int base = 2; base <= 36; ++base)
+    {
+        // Draw fresh inputs for this base (index type matches numElements and the verify loop)
+        for (int i = 0; i < numElements; ++i)
+        {
+            input_vector[i] = dist(rng);
+        }
+
+        // Launch the CUDA Kernel
+        std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads, base " << base << std::endl;
+
+        watch w;
+
+        cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_strings.get(), output_lengths.get(), numElements, base);
+        err = cudaDeviceSynchronize(); // keep the status instead of discarding it
+
+        std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+        // Launch-configuration errors are reported by cudaGetLastError(), not necessarily by the sync result
+        if (err == cudaSuccess) { err = cudaGetLastError(); }
+        if (err != cudaSuccess)
+        {
+            std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        // Verify that the result vector is correct: same length and same characters as the host conversion
+        w.reset();
+        for(int i = 0; i < numElements; ++i)
+        {
+            char cpu_buf[BUF_SIZE];
+            auto cpu_res = boost::charconv::to_chars(cpu_buf, cpu_buf + BUF_SIZE, input_vector[i], base);
+            int cpu_len = static_cast<int>(cpu_res.ptr - cpu_buf);
+            int gpu_len = output_lengths[i];
+            const char* gpu_buf = &output_strings[i * BUF_SIZE];
+
+            if (cpu_len != gpu_len || std::memcmp(cpu_buf, gpu_buf, static_cast<std::size_t>(cpu_len)) != 0)
+            {
+                std::cerr << "Result verification failed at element " << i << " base " << base << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
+        }
+        double t = w.elapsed();
+
+        std::cout << "Test base " << base << " PASSED, normal calculation time: " << t << "s" << std::endl;
+    }
+
+    std::cout << "All bases PASSED" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_to_unsigned_conversion.cu b/test/test_signed_to_unsigned_conversion.cu
new file mode 100644
index 00000000..5073f0f6
--- /dev/null
+++ b/test/test_signed_to_unsigned_conversion.cu
@@ -0,0 +1,110 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using signed_type = boost::int128::int128_t;
+using unsigned_type = boost::int128::uint128_t;
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Converts signed int128 values to unsigned int128 values
+ */
+__global__ void cuda_test(const signed_type *in, unsigned_type *out, int numElements)
+{
+    // One thread converts one element; surplus threads in the last block return early.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    // Explicit signed -> unsigned construction is the operation under test.
+    out[idx] = unsigned_type(in[idx]);
+}
+
+/**
+ * Host main routine: converts random non-negative int128_t values to uint128_t on the GPU and checks them on the host
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42}; // fixed seed keeps the inputs reproducible
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<signed_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<unsigned_type> output_vector(numElements);
+
+    // Initialize the input vectors with non-negative values so bit patterns match
+    boost::random::uniform_int_distribution<signed_type> dist {signed_type{0}, (std::numeric_limits<signed_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the CUDA Kernel with enough blocks to cover every element
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize(); // keep the status instead of discarding it
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch-configuration errors are reported by cudaGetLastError(), not necessarily by the sync result
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Build the host reference with the same conversion; only this loop is timed
+    std::vector<unsigned_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(unsigned_type(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // check the device results against the host reference
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_xor.cu b/test/test_signed_xor.cu
new file mode 100644
index 00000000..ff11af14
--- /dev/null
+++ b/test/test_signed_xor.cu
@@ -0,0 +1,95 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    // One thread per element; surplus threads in the last block return early.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    // Bitwise XOR of the two inputs is the operation under test.
+    out[idx] = in[idx] ^ in2[idx];
+}
+
+int main(void)
+{
+    // Runs the XOR kernel on seeded random inputs and checks it against the same operation on the host.
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize(); // keep the status instead of discarding it
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch-configuration errors are reported by cudaGetLastError(), not necessarily by the sync result
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset(); // t below covers only the host reference loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] ^ input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Every device result must equal the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_stream.cpp b/test/test_stream.cpp
index 836b4300..b6ebea8b 100644
--- a/test/test_stream.cpp
+++ b/test/test_stream.cpp
@@ -89,15 +89,25 @@ void test_ostream()
     std::stringstream hex_out;
     hex_out.flags(std::ios_base::hex);
     hex_out << hex_val;
-    BOOST_TEST_CSTR_EQ(hex_out.str().c_str(), "0xff");
+    BOOST_TEST_CSTR_EQ(hex_out.str().c_str(), "ff");
 
     // 32-bit windows does not set the flags correctly in CI
     #ifndef _M_IX86
 
+    std::stringstream hex_out_base;
+    hex_out_base.flags(std::ios_base::hex | std::ios_base::showbase);
+    hex_out_base << hex_val;
+    BOOST_TEST_CSTR_EQ(hex_out_base.str().c_str(), "0xff");
+
     std::stringstream hex_out_upper;
     hex_out_upper.flags(std::ios_base::hex | std::ios_base::uppercase);
     hex_out_upper << hex_val;
-    BOOST_TEST_CSTR_EQ(hex_out_upper.str().c_str(), "0XFF");
+    BOOST_TEST_CSTR_EQ(hex_out_upper.str().c_str(), "FF");
+
+    std::stringstream hex_out_upper_base;
+    hex_out_upper_base.flags(std::ios_base::hex | std::ios_base::uppercase | std::ios_base::showbase);
+    hex_out_upper_base << hex_val;
+    BOOST_TEST_CSTR_EQ(hex_out_upper_base.str().c_str(), "0XFF");
 
     #endif
 
@@ -105,7 +115,12 @@ void test_ostream()
     std::stringstream octal_out;
     octal_out.flags(std::ios_base::oct);
     octal_out << octal_val;
-    BOOST_TEST_CSTR_EQ(octal_out.str().c_str(), "04");
+    BOOST_TEST_CSTR_EQ(octal_out.str().c_str(), "4");
+
+    std::stringstream octal_out_base;
+    octal_out_base.flags(std::ios_base::oct | std::ios_base::showbase); // showbase asks for the leading 0
+    octal_out_base << octal_val;
+    BOOST_TEST_CSTR_EQ(octal_out_base.str().c_str(), "04");
 
     BOOST_INT128_IF_CONSTEXPR (std::is_same<T, boost::int128::uint128_t>::value)
     {
diff --git a/test/test_u128.cpp b/test/test_u128.cpp
index b1a7c05f..81a9b5c7 100644
--- a/test/test_u128.cpp
+++ b/test/test_u128.cpp
@@ -1361,7 +1361,7 @@ int main()
 
     test_spot_div<long long>(-3237361348456748317LL, 8011834041509972187LL);
 
-    test_spot_div<boost::int128::uint128_t>(boost::int128::uint128_t{50012077812411ULL, 6429278683030093824ULL}, boost::int128::uint128_t{542101086ULL, 4477988020393345024ULL}, 92256);
+    test_spot_div<boost::int128::uint128_t>(boost::int128::uint128_t{50012077812411ULL, 6429278683030093824ULL}, boost::int128::uint128_t{542101086ULL, 4477988020393345024ULL}, boost::int128::uint128_t{92256});
 
     return boost::report_errors();
 }
diff --git a/test/test_unsigned_add.cu b/test/test_unsigned_add.cu
new file mode 100644
index 00000000..59368281
--- /dev/null
+++ b/test/test_unsigned_add.cu
@@ -0,0 +1,109 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // One thread per element; surplus threads in the last block return early.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    // Adding each input to itself is the operation under test.
+    out[idx] = in[idx] + in[idx];
+}
+
+/**
+ * Host main routine: doubles random uint128_t values on the GPU and checks the sums against the host
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42}; // fixed seed keeps the inputs reproducible
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Initialize the input vectors; values are capped at max / 2 so in[i] + in[i] cannot wrap
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)() / test_type{2}};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the Vector Add CUDA Kernel with enough blocks to cover every element
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize(); // keep the status instead of discarding it
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch-configuration errors are reported by cudaGetLastError(), not necessarily by the sync result
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch vectorAdd kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Build the host reference with the same addition; only this loop is timed
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] + input_vector[i]);
+    }
+    double t = w.elapsed();
+    // check the device results against the host reference
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_add_sat.cu b/test/test_unsigned_add_sat.cu
new file mode 100644
index 00000000..3cfc0317
--- /dev/null
+++ b/test/test_unsigned_add_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    // One thread per element; surplus threads in the last block return early.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    // boost::int128::add_sat on the two inputs is the operation under test.
+    out[idx] = boost::int128::add_sat(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Runs the add_sat kernel on seeded random inputs and checks it against the same call on the host.
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize(); // keep the status instead of discarding it
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Launch-configuration errors are reported by cudaGetLastError(), not necessarily by the sync result
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset(); // t below covers only the host reference loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // Every device result must equal the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_and.cu b/test/test_unsigned_and.cu
new file mode 100644
index 00000000..7ced87e1
--- /dev/null
+++ b/test/test_unsigned_and.cu
@@ -0,0 +1,95 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    // One thread per element; surplus threads in the last block return early.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    // Bitwise AND of the two inputs is the operation under test.
+    out[idx] = in[idx] & in2[idx];
+}
+
+int main(void)
+{
+    // Runs the AND kernel on seeded random inputs and checks it against the same operation on the host.
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize(); // keep the status instead of discarding it
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch-configuration errors are reported by cudaGetLastError(), not necessarily by the sync result
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset(); // t below covers only the host reference loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] & input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Every device result must equal the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_cstdlib_div.cu b/test/test_unsigned_cstdlib_div.cu
new file mode 100644
index 00000000..62ccae81
--- /dev/null
+++ b/test/test_unsigned_cstdlib_div.cu
@@ -0,0 +1,94 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/cstdlib.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+using result_type = boost::int128::u128div_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, result_type *out, int numElements)
+{
+    // One thread per element; surplus threads in the last block return early.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    // boost::int128::div on the two inputs is the operation under test.
+    out[idx] = boost::int128::div(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Runs the div kernel on seeded random non-zero inputs and checks quot/rem against the host.
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<result_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{1U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize(); // keep the status instead of discarding it
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Launch-configuration errors are reported by cudaGetLastError(), not necessarily by the sync result
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<result_type> results;
+    results.reserve(numElements);
+    w.reset(); // t below covers only the host reference loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::div(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // Both members of every device result must equal the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i].quot != results[i].quot || output_vector[i].rem != results[i].rem)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_div.cu b/test/test_unsigned_div.cu
new file mode 100644
index 00000000..fb3070a2
--- /dev/null
+++ b/test/test_unsigned_div.cu
@@ -0,0 +1,111 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * CUDA Kernel Device code
+ *
+ */
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    // One thread per element; surplus threads in the last block return early.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    // Division of the first input by the second is the operation under test.
+    out[idx] = in[idx] / in2[idx];
+}
+
+/**
+ * Host main routine: divides random non-zero uint128_t values on the GPU and checks the quotients against the host
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42}; // fixed seed keeps the inputs reproducible
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors (dividends and divisors)
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Initialize the input vectors; the range starts at 1 so no divisor is zero
+    boost::random::uniform_int_distribution<test_type> dist {test_type{1U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    // Launch the division CUDA Kernel with enough blocks to cover every element
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize(); // keep the status instead of discarding it
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch-configuration errors are reported by cudaGetLastError(), not necessarily by the sync result
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Build the host reference with the same division; only this loop is timed
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for(int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] / input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // check the device results against the host reference
+    for(int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_div_sat.cu b/test/test_unsigned_div_sat.cu
new file mode 100644
index 00000000..9f76b869
--- /dev/null
+++ b/test/test_unsigned_div_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    // One thread per element; surplus threads in the last block return early.
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= numElements) { return; }
+
+    // boost::int128::div_sat on the two inputs is the operation under test.
+    out[idx] = boost::int128::div_sat(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Runs the div_sat kernel on seeded random non-zero inputs and checks it against the same call on the host.
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{1U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // ceiling division
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize(); // keep the status instead of discarding it
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Launch-configuration errors are reported by cudaGetLastError(), not necessarily by the sync result
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset(); // t below covers only the host reference loop
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::div_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // Every device result must equal the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_eq.cu b/test/test_unsigned_eq.cu
new file mode 100644
index 00000000..c2c1d415
--- /dev/null
+++ b/test/test_unsigned_eq.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * Device kernel: the thread with global index idx < numElements stores
+ * (in1[idx] == in2[idx]) in out[idx]; threads beyond that range do nothing.
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return;
+    }
+    out[idx] = (in1[idx] == in2[idx]);
+}
+
+/**
+ * Host main routine: runs cuda_test over random inputs and cross-checks every result on the host
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Number of element pairs to compare; printed for the test log
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Initialize the input vectors with values no larger than the maximum std::uint64_t
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, test_type{(std::numeric_limits<std::uint64_t>::max)()}};
+    for (std::size_t i = 0; i < static_cast<std::size_t>(numElements); ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Make every third pair equal to test both true and false cases
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel, rounding the grid up so every element gets a thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch errors surface through cudaGetLastError(); keep an earlier synchronize failure
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host (timed for the report below)
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] == input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // The device and host answers must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_from_chars.cu b/test/test_unsigned_from_chars.cu
new file mode 100644
index 00000000..727dcfa9
--- /dev/null
+++ b/test/test_unsigned_from_chars.cu
@@ -0,0 +1,112 @@
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/int128/charconv.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+constexpr int BUF_SIZE = 64;
+
+__global__ void cuda_test(const char *in_strings, const int *in_lengths, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one thread per input string
+    if (idx >= numElements)
+    {
+        return; // thread index is past the last string: nothing to do
+    }
+    const char* first = in_strings + idx * BUF_SIZE; // string idx starts at offset idx * BUF_SIZE
+    test_type parsed {};
+    boost::charconv::from_chars(first, first + in_lengths[idx], parsed);
+    out[idx] = parsed;
+}
+
+/**
+ * Host main routine: parses decimal strings on the device and cross-checks them on the host
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Number of strings to parse; printed for the test log
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors (one BUF_SIZE-byte slot per string)
+    cuda_managed_ptr<char> input_strings(numElements * BUF_SIZE);
+    cuda_managed_ptr<int> input_lengths(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Initialize the input vectors by generating random values and converting to strings
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    std::vector<test_type> expected(numElements);
+    for (std::size_t i = 0; i < static_cast<std::size_t>(numElements); ++i)
+    {
+        expected[i] = dist(rng);
+        char* buf = &input_strings[i * BUF_SIZE];
+        auto res = boost::charconv::to_chars(buf, buf + BUF_SIZE, expected[i]);
+        input_lengths[i] = static_cast<int>(res.ptr - buf);
+    }
+
+    // Launch the CUDA Kernel
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_strings.get(), input_lengths.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch errors surface through cudaGetLastError(); keep an earlier synchronize failure
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify the device result against the host parse and the value the string was made from
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        test_type cpu_val {};
+        const char* str = &input_strings[i * BUF_SIZE];
+        boost::charconv::from_chars(str, str + input_lengths[i], cpu_val);
+
+        if (output_vector[i] != cpu_val || cpu_val != expected[static_cast<std::size_t>(i)])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    double t = w.elapsed();
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_from_chars_bases.cu b/test/test_unsigned_from_chars_bases.cu
new file mode 100644
index 00000000..514e4cdc
--- /dev/null
+++ b/test/test_unsigned_from_chars_bases.cu
@@ -0,0 +1,125 @@
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/int128/charconv.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+constexpr int BUF_SIZE = 192;
+
+__global__ void cuda_test(const char *in_strings, const int *in_lengths, test_type *out, int numElements, int base)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one thread per input string
+    if (idx >= numElements)
+    {
+        return; // thread index is past the last string: nothing to do
+    }
+    const char* first = in_strings + idx * BUF_SIZE; // string idx starts at offset idx * BUF_SIZE
+    test_type parsed {};
+    boost::charconv::from_chars(first, first + in_lengths[idx], parsed, base);
+    out[idx] = parsed;
+}
+
+/**
+ * Host main routine: for every base in [2, 36], parses strings on the device and cross-checks on the host
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Number of strings parsed per base; printed for the test log
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors (one BUF_SIZE-byte slot per string)
+    cuda_managed_ptr<char> input_strings(numElements * BUF_SIZE);
+    cuda_managed_ptr<int> input_lengths(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    std::vector<test_type> expected(numElements);
+
+    for (int base = 2; base <= 36; ++base)
+    {
+        // Initialize the input vectors with fresh random values written in the current base
+        for (std::size_t i = 0; i < static_cast<std::size_t>(numElements); ++i)
+        {
+            expected[i] = dist(rng);
+            char* buf = &input_strings[i * BUF_SIZE];
+            auto res = boost::charconv::to_chars(buf, buf + BUF_SIZE, expected[i], base);
+            input_lengths[i] = static_cast<int>(res.ptr - buf);
+        }
+
+        // Launch the CUDA Kernel
+        std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads, base " << base << std::endl;
+
+        watch w;
+
+        cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_strings.get(), input_lengths.get(), output_vector.get(), numElements, base);
+        err = cudaDeviceSynchronize();
+
+        std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+        // Launch errors surface through cudaGetLastError(); keep an earlier synchronize failure
+        if (err == cudaSuccess) { err = cudaGetLastError(); }
+        if (err != cudaSuccess)
+        {
+            std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        // Parse the same strings on the host (timed for the report below)
+        std::vector<test_type> results;
+        results.reserve(numElements);
+        w.reset();
+        for (int i = 0; i < numElements; ++i)
+        {
+            test_type val {};
+            const char* str = &input_strings[i * BUF_SIZE];
+            boost::charconv::from_chars(str, str + input_lengths[i], val, base);
+            results.push_back(val);
+        }
+        double t = w.elapsed();
+        // Device, host parse, and the value each string was generated from must all agree
+        for (int i = 0; i < numElements; ++i)
+        {
+            if (output_vector[i] != results[i] || results[i] != expected[static_cast<std::size_t>(i)])
+            {
+                std::cerr << "Result verification failed at element " << i << " base " << base << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
+        }
+
+        std::cout << "Test base " << base << " PASSED, normal calculation time: " << t << "s" << std::endl;
+    }
+
+    std::cout << "All bases PASSED" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_gcd.cu b/test/test_unsigned_gcd.cu
new file mode 100644
index 00000000..f23abe48
--- /dev/null
+++ b/test/test_unsigned_gcd.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one thread per element
+    if (idx >= numElements)
+    {
+        return; // thread index is past the last element: nothing to do
+    }
+    out[idx] = boost::int128::gcd(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Host driver: computes gcd on the device for random inputs and cross-checks on the host
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Fill both inputs with values drawn from the full range of test_type
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < static_cast<std::size_t>(numElements); ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+    // One thread per element; the grid size is rounded up to cover the tail
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Launch errors surface through cudaGetLastError(); keep an earlier synchronize failure
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Reference results computed on the host (timed for the report below)
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::gcd(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // The device and host answers must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_ge.cu b/test/test_unsigned_ge.cu
new file mode 100644
index 00000000..4803e307
--- /dev/null
+++ b/test/test_unsigned_ge.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * Device kernel: the thread with global index idx < numElements stores
+ * (in1[idx] >= in2[idx]) in out[idx]; threads beyond that range do nothing.
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return;
+    }
+    out[idx] = (in1[idx] >= in2[idx]);
+}
+
+/**
+ * Host main routine: runs cuda_test over random inputs and cross-checks every result on the host
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Number of element pairs to compare; printed for the test log
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Initialize the input vectors with values no larger than the maximum std::uint64_t
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, test_type{(std::numeric_limits<std::uint64_t>::max)()}};
+    for (std::size_t i = 0; i < static_cast<std::size_t>(numElements); ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Make every third pair equal to test both true and false cases
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel, rounding the grid up so every element gets a thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch errors surface through cudaGetLastError(); keep an earlier synchronize failure
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host (timed for the report below)
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] >= input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // The device and host answers must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_gt.cu b/test/test_unsigned_gt.cu
new file mode 100644
index 00000000..0dd51292
--- /dev/null
+++ b/test/test_unsigned_gt.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * Device kernel: the thread with global index idx < numElements stores
+ * (in1[idx] > in2[idx]) in out[idx]; threads beyond that range do nothing.
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return;
+    }
+    out[idx] = (in1[idx] > in2[idx]);
+}
+
+/**
+ * Host main routine: runs cuda_test over random inputs and cross-checks every result on the host
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Number of element pairs to compare; printed for the test log
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Initialize the input vectors with values no larger than the maximum std::uint64_t
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, test_type{(std::numeric_limits<std::uint64_t>::max)()}};
+    for (std::size_t i = 0; i < static_cast<std::size_t>(numElements); ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Make every third pair equal to test both true and false cases
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel, rounding the grid up so every element gets a thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch errors surface through cudaGetLastError(); keep an earlier synchronize failure
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host (timed for the report below)
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] > input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // The device and host answers must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_lcm.cu b/test/test_unsigned_lcm.cu
new file mode 100644
index 00000000..d586d58b
--- /dev/null
+++ b/test/test_unsigned_lcm.cu
@@ -0,0 +1,94 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one thread per element
+    if (idx >= numElements)
+    {
+        return; // thread index is past the last element: nothing to do
+    }
+    out[idx] = boost::int128::lcm(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Host driver: computes lcm on the device for random inputs and cross-checks on the host
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Use smaller values to avoid overflow in lcm computation (presumably {0U, UINT64_MAX} is {high, low} - confirm)
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, test_type{0U, UINT64_MAX}};
+    for (std::size_t i = 0; i < static_cast<std::size_t>(numElements); ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+    // One thread per element; the grid size is rounded up to cover the tail
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    // Launch errors surface through cudaGetLastError(); keep an earlier synchronize failure
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Reference results computed on the host (timed for the report below)
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::lcm(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // The device and host answers must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_le.cu b/test/test_unsigned_le.cu
new file mode 100644
index 00000000..4ef2d2b6
--- /dev/null
+++ b/test/test_unsigned_le.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * Device kernel: the thread with global index idx < numElements stores
+ * (in1[idx] <= in2[idx]) in out[idx]; threads beyond that range do nothing.
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return;
+    }
+    out[idx] = (in1[idx] <= in2[idx]);
+}
+
+/**
+ * Host main routine: runs cuda_test over random inputs and cross-checks every result on the host
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Number of element pairs to compare; printed for the test log
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Initialize the input vectors with values no larger than the maximum std::uint64_t
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, test_type{(std::numeric_limits<std::uint64_t>::max)()}};
+    for (std::size_t i = 0; i < static_cast<std::size_t>(numElements); ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Make every third pair equal to test both true and false cases
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel, rounding the grid up so every element gets a thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch errors surface through cudaGetLastError(); keep an earlier synchronize failure
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host (timed for the report below)
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] <= input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // The device and host answers must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_left_shift.cu b/test/test_unsigned_left_shift.cu
new file mode 100644
index 00000000..053c054b
--- /dev/null
+++ b/test/test_unsigned_left_shift.cu
@@ -0,0 +1,96 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const unsigned *shift, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x; // one thread per element
+    if (idx >= numElements)
+    {
+        return; // thread index is past the last element: nothing to do
+    }
+    out[idx] = in[idx] << shift[idx];
+}
+
+int main(void)
+{
+    // Host driver: left-shifts random values on the device and cross-checks on the host
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<unsigned> shift_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Operands span the full range of test_type; shift counts are limited to [0, 127]
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    std::uniform_int_distribution<unsigned> shift_dist {0U, 127U};
+    for (std::size_t i = 0; i < static_cast<std::size_t>(numElements); ++i)
+    {
+        input_vector[i] = dist(rng);
+        shift_vector[i] = shift_dist(rng);
+    }
+    // One thread per element; the grid size is rounded up to cover the tail
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), shift_vector.get(), output_vector.get(), numElements);
+    err = cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    // Launch errors surface through cudaGetLastError(); keep an earlier synchronize failure
+    if (err == cudaSuccess) { err = cudaGetLastError(); }
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Reference results computed on the host (timed for the report below)
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] << shift_vector[i]);
+    }
+    double t = w.elapsed();
+    // The device and host answers must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_literals.cu b/test/test_unsigned_literals.cu
new file mode 100644
index 00000000..d3fad8c3
--- /dev/null
+++ b/test/test_unsigned_literals.cu
@@ -0,0 +1,149 @@
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <boost/int128.hpp>
+#include <boost/int128/literals.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using boost::int128::uint128_t;
+using namespace boost::int128::literals;
+
+// Number of test cases: we test each literal operator with several values
+// Operators:
+//   1. operator""_u128(const char*)              - raw literal, decimal
+//   2. operator""_U128(const char*)              - raw literal, decimal
+//   3. operator""_u128(const char*, size_t)      - cooked string literal
+//   4. operator""_U128(const char*, size_t)      - cooked string literal
+//   5. operator""_u128(unsigned long long)        - integer literal
+//   6. operator""_U128(unsigned long long)        - integer literal
+
+constexpr int NUM_TESTS = 24;
+
+__global__ void cuda_test(uint128_t *out)
+{
+    // The thread index picks one literal; its device-side value is written to the
+    // slot of out with the same index. Threads outside 0-23 write nothing.
+    switch (threadIdx.x)
+    {
+        // Unquoted literals, lowercase suffix (intended target: raw const char* operator)
+        case 0: out[0] = 0_u128; break;
+        case 1: out[1] = 1_u128; break;
+        case 2: out[2] = 18446744073709551615_u128; break;
+        case 3: out[3] = 999999999999999999_u128; break;
+        // Unquoted literals, uppercase suffix (intended target: raw const char* operator)
+        case 4: out[4] = 0_U128; break;
+        case 5: out[5] = 1_U128; break;
+        case 6: out[6] = 18446744073709551615_U128; break;
+        case 7: out[7] = 999999999999999999_U128; break;
+
+        // Quoted string literals, lowercase suffix
+        case 8: out[8] = "0"_u128; break;
+        case 9: out[9] = "1"_u128; break;
+        case 10: out[10] = "340282366920938463463374607431768211455"_u128; break;
+        case 11: out[11] = "999999999999999999"_u128; break;
+        // Quoted string literals, uppercase suffix
+        case 12: out[12] = "0"_U128; break;
+        case 13: out[13] = "1"_U128; break;
+        case 14: out[14] = "340282366920938463463374607431768211455"_U128; break;
+        case 15: out[15] = "999999999999999999"_U128; break;
+
+        // Unquoted literals again (intended target: unsigned long long operator). NOTE(review): spelled like cases 0-3, so the same overload is selected - confirm
+        case 16: out[16] = 0_u128; break;
+        case 17: out[17] = 1_u128; break;
+        case 18: out[18] = 18446744073709551615_u128; break;
+        case 19: out[19] = 42_u128; break;
+        // Uppercase counterpart of the group above; the same NOTE(review) applies versus cases 4-7
+        case 20: out[20] = 0_U128; break;
+        case 21: out[21] = 1_U128; break;
+        case 22: out[22] = 18446744073709551615_U128; break;
+        case 23: out[23] = 42_U128; break;
+    }
+}
+
+int main(void)
+{
+    // Evaluates every literal on the device, then checks each result against the
+    // value the identical literal yields on the host. Returns EXIT_FAILURE on a CUDA
+    // error or on the first mismatch, 0 otherwise.
+    std::cout << "[Unsigned literal tests: " << NUM_TESTS << " cases]" << std::endl;
+
+    cuda_managed_ptr<uint128_t> output(NUM_TESTS);
+
+    // A single block with one thread per test case
+    watch w;
+
+    cuda_test<<<1, NUM_TESTS>>>(output.get());
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    const cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Host-side reference values, listed in the same order as the kernel's cases
+    const uint128_t expected[NUM_TESTS] = {
+        // Unquoted literals, lowercase suffix
+        0_u128,
+        1_u128,
+        18446744073709551615_u128,
+        999999999999999999_u128,
+        // Unquoted literals, uppercase suffix
+        0_U128,
+        1_U128,
+        18446744073709551615_U128,
+        999999999999999999_U128,
+
+        // Quoted string literals, lowercase suffix
+        "0"_u128,
+        "1"_u128,
+        "340282366920938463463374607431768211455"_u128,
+        "999999999999999999"_u128,
+        // Quoted string literals, uppercase suffix
+        "0"_U128,
+        "1"_U128,
+        "340282366920938463463374607431768211455"_U128,
+        "999999999999999999"_U128,
+
+        // Unquoted literals again, lowercase suffix
+        0_u128,
+        1_u128,
+        18446744073709551615_u128,
+        42_u128,
+        // Unquoted literals again, uppercase suffix
+        0_U128,
+        1_U128,
+        18446744073709551615_U128,
+        42_U128,
+    };
+
+    // Walk the slots in order and stop at the first one where the device value
+    // differs from the host value
+    for (int idx = 0; idx < NUM_TESTS; ++idx)
+    {
+        if (output[idx] != expected[idx])
+        {
+            std::cerr << "Result verification failed at test case " << idx << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    // Reaching this point means every case matched
+    std::cout << "Test PASSED" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_lt.cu b/test/test_unsigned_lt.cu
new file mode 100644
index 00000000..6394e773
--- /dev/null
+++ b/test/test_unsigned_lt.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * Device kernel: stores (in1[i] < in2[i]) into out[i] for every i below numElements.
+ * Each thread handles the single element given by its global index.
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Threads whose index falls past the end of the arrays have nothing to do
+    if (idx >= numElements) { return; }
+
+    out[idx] = (in1[idx] < in2[idx]);
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Runs operator< on the device for random operand pairs and checks it against the host result
+    std::mt19937_64 rng {42};
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Draw operands from the full range of test_type so the high 64 bits also take part in the comparison
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Make every third pair equal so the equal-operand case is covered too
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel; the block count is rounded up to cover every element
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Pick up any error left behind by the launch or the synchronize
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host, timing the loop
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] < input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Device and host results must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_midpoint.cu b/test/test_unsigned_midpoint.cu
new file mode 100644
index 00000000..e695b5ff
--- /dev/null
+++ b/test/test_unsigned_midpoint.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    // Each thread computes the midpoint of the one operand pair at its global index
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx >= numElements) { return; }
+
+    out[idx] = boost::int128::midpoint(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Runs midpoint on the device for random operand pairs and checks it against the host result
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Two operand buffers and one result buffer, all numElements long
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Operands are drawn from the full range of test_type
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+    // The block count is rounded up so that every element gets a thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Pick up any error left behind by the launch or the synchronize
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Compute the reference results on the host, timing the loop
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::midpoint(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // Device and host results must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_mod.cu b/test/test_unsigned_mod.cu
new file mode 100644
index 00000000..56e31095
--- /dev/null
+++ b/test/test_unsigned_mod.cu
@@ -0,0 +1,111 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * Device kernel: stores in[i] % in2[i] into out[i] for every i below numElements.
+ * Each thread handles the single element given by its global index.
+ */
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Threads whose index falls past the end of the arrays have nothing to do
+    if (idx >= numElements) { return; }
+
+    out[idx] = in[idx] % in2[idx];
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Runs operator% on the device for random operand pairs and checks it against the host result
+    std::mt19937_64 rng {42};
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors (dividends and divisors)
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Initialize the input vectors; the lower bound of 1 keeps every divisor non-zero
+    boost::random::uniform_int_distribution<test_type> dist {test_type{1U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    // Launch the modulo kernel; the block count is rounded up to cover every element
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Pick up any error left behind by the launch or the synchronize
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host, timing the loop
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] % input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Device and host results must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_mul.cu b/test/test_unsigned_mul.cu
new file mode 100644
index 00000000..fb32b655
--- /dev/null
+++ b/test/test_unsigned_mul.cu
@@ -0,0 +1,109 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * Device kernel: stores in[i] * in[i] into out[i] for every i below numElements.
+ * Each thread handles the single element given by its global index.
+ */
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Threads whose index falls past the end of the arrays have nothing to do
+    if (idx >= numElements) { return; }
+
+    out[idx] = in[idx] * in[idx];
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Squares random operands with operator* on the device and checks the result against the host
+    std::mt19937_64 rng {42};
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Initialize the input vector; operands stay below 2^64, so their square fits in 128 bits
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, test_type{(std::numeric_limits<std::uint64_t>::max)()}};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the multiplication kernel; the block count is rounded up to cover every element
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Pick up any error left behind by the launch or the synchronize
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host, timing the loop
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] * input_vector[i]);
+    }
+    double t = w.elapsed();
+    // Device and host results must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_mul_sat.cu b/test/test_unsigned_mul_sat.cu
new file mode 100644
index 00000000..228ef806
--- /dev/null
+++ b/test/test_unsigned_mul_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    // Each thread computes mul_sat for the one operand pair at its global index
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx >= numElements) { return; }
+
+    out[idx] = boost::int128::mul_sat(in[idx], in2[idx]);
+}
+
+int main(void)
+{
+    // Runs mul_sat on the device for random operand pairs and checks it against the host result
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Two operand buffers and one result buffer, all numElements long
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // NOTE(review): two full-range operands overflow 128 bits almost always, so mostly the saturated result is compared - consider adding small operands
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+    // The block count is rounded up so that every element gets a thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Pick up any error left behind by the launch or the synchronize
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Compute the reference results on the host, timing the loop
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::mul_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // Device and host results must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_ne.cu b/test/test_unsigned_ne.cu
new file mode 100644
index 00000000..2356c75c
--- /dev/null
+++ b/test/test_unsigned_ne.cu
@@ -0,0 +1,119 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * Device kernel: stores (in1[i] != in2[i]) into out[i] for every i below numElements.
+ * Each thread handles the single element given by its global index.
+ */
+__global__ void cuda_test(const test_type *in1, const test_type *in2, bool *out, int numElements)
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Threads whose index falls past the end of the arrays have nothing to do
+    if (idx >= numElements) { return; }
+
+    out[idx] = (in1[idx] != in2[idx]);
+}
+
+/**
+ * Host main routine
+ */
+int main(void)
+{
+    // Runs operator!= on the device for random operand pairs and checks it against the host result
+    std::mt19937_64 rng {42};
+    // Error code to check return values for CUDA calls
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vectors
+    cuda_managed_ptr<test_type> input_vector1(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<bool> output_vector(numElements);
+
+    // Draw operands from the full range of test_type so the high 64 bits also take part in the comparison
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector1[i] = dist(rng);
+        // Make every third pair equal so both true and false results occur
+        if (i % 3 == 0)
+        {
+            input_vector2[i] = input_vector1[i];
+        }
+        else
+        {
+            input_vector2[i] = dist(rng);
+        }
+    }
+
+    // Launch the comparison kernel; the block count is rounded up to cover every element
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Pick up any error left behind by the launch or the synchronize
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference results on the host, timing the loop
+    std::vector<bool> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector1[i] != input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Device and host results must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_not.cu b/test/test_unsigned_not.cu
new file mode 100644
index 00000000..809baf3c
--- /dev/null
+++ b/test/test_unsigned_not.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    // Each thread writes the bitwise complement of the one element at its global index
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx >= numElements) { return; }
+
+    out[idx] = ~in[idx];
+}
+
+int main(void)
+{
+    // Runs operator~ on the device for random operands and checks it against the host result
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // One operand buffer and one result buffer, both numElements long
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Operands are drawn from the full range of test_type
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+    // The block count is rounded up so that every element gets a thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Pick up any error left behind by the launch or the synchronize
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Compute the reference results on the host, timing the loop
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(~input_vector[i]);
+    }
+    double t = w.elapsed();
+    // Device and host results must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_or.cu b/test/test_unsigned_or.cu
new file mode 100644
index 00000000..45ebf30d
--- /dev/null
+++ b/test/test_unsigned_or.cu
@@ -0,0 +1,95 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    // Each thread writes the bitwise OR of the one operand pair at its global index
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx >= numElements) { return; }
+
+    out[idx] = in[idx] | in2[idx];
+}
+
+int main(void)
+{
+    // Runs operator| on the device for random operand pairs and checks it against the host result
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+    // Two operand buffers and one result buffer, all numElements long
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Operands are drawn from the full range of test_type
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+    // The block count is rounded up so that every element gets a thread
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+    // Pick up any error left behind by the launch or the synchronize
+    err = cudaGetLastError();
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Compute the reference results on the host, timing the loop
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] | input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Device and host results must agree for every element
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_right_shift.cu b/test/test_unsigned_right_shift.cu
new file mode 100644
index 00000000..f81792f2
--- /dev/null
+++ b/test/test_unsigned_right_shift.cu
@@ -0,0 +1,96 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const unsigned *shift, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;  // global x-index of this thread
+    if (idx >= numElements)
+    {
+        return;  // index is past the last element: nothing to do
+    }
+    out[idx] = in[idx] >> shift[idx];  // element-wise right shift of in by shift
+}
+
+int main(void)
+{
+    // Host driver: runs operator>> on the GPU and checks every element against the host result.
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<unsigned> shift_vector(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Random values over the full test_type range, shift counts in [0, 127]
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    std::uniform_int_distribution<unsigned> shift_dist {0U, 127U};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        shift_vector[i] = shift_dist(rng);
+    }
+    // One thread per element; the block count is rounded up to cover the tail
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), shift_vector.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();  // a rejected launch is reported here
+    // Faults raised while the kernel runs come back from the synchronize call; keep the first error seen
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    if (err == cudaSuccess) { err = sync_err; }
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Kernel launch or execution failed (" << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Compute the reference result on the host (timed for comparison with the GPU run)
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] >> shift_vector[i]);
+    }
+    double t = w.elapsed();
+    // Every GPU element must equal the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_sub.cu b/test/test_unsigned_sub.cu
new file mode 100644
index 00000000..b4fc0f87
--- /dev/null
+++ b/test/test_unsigned_sub.cu
@@ -0,0 +1,109 @@
+//  Copyright John Maddock 2016.
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+/**
+ * CUDA kernel: each thread subtracts one element of in from itself and
+ * writes the difference to the same index of out.
+ */
+__global__ void cuda_test(const test_type *in, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return;  // index is past the last element: nothing to do
+    }
+    out[idx] = in[idx] - in[idx];
+}
+
+/**
+ * Host main routine: runs the subtraction kernel and compares its output with the host result.
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // First CUDA error reported by the kernel launch or by the synchronize
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Fill the input with random values drawn from [0, max / 2]
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)() / test_type{2}};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the subtraction kernel with one thread per element (block count rounded up)
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();  // a rejected launch is reported here
+    // Faults raised while the kernel runs come back from the synchronize call; keep the first error seen
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    if (err == cudaSuccess) { err = sync_err; }
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Kernel launch or execution failed (" << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Host reference. NOTE(review): x - x should always be zero, so this only checks against a constant.
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] - input_vector[i]);
+    }
+    double t = w.elapsed();
+    // Every GPU element must equal the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_sub_sat.cu b/test/test_unsigned_sub_sat.cu
new file mode 100644
index 00000000..73bf36d7
--- /dev/null
+++ b/test/test_unsigned_sub_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;  // global x-index of this thread
+    if (idx >= numElements)
+    {
+        return;  // index is past the last element: nothing to do
+    }
+    out[idx] = boost::int128::sub_sat(in[idx], in2[idx]);  // element-wise sub_sat(in, in2)
+}
+
+int main(void)
+{
+    // Host driver: runs boost::int128::sub_sat on the GPU and checks every element against the host result.
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Both operands are random values over the full test_type range
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+    // One thread per element; the block count is rounded up to cover the tail
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();  // a rejected launch is reported here
+    // Faults raised while the kernel runs come back from the synchronize call; keep the first error seen
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    if (err == cudaSuccess) { err = sync_err; }
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Kernel launch or execution failed (" << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Compute the reference result on the host (timed for comparison with the GPU run)
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::sub_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+    // Every GPU element must equal the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_to_chars.cu b/test/test_unsigned_to_chars.cu
new file mode 100644
index 00000000..25d4252a
--- /dev/null
+++ b/test/test_unsigned_to_chars.cu
@@ -0,0 +1,110 @@
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <cstring>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/int128/charconv.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+constexpr int BUF_SIZE = 64;
+
+__global__ void cuda_test(const test_type *in, char *out_strings, int *out_lengths, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;  // global x-index of this thread
+    if (idx >= numElements)
+    {
+        return;  // index is past the last element: nothing to do
+    }
+    char* const slot = out_strings + idx * BUF_SIZE;  // this thread's own BUF_SIZE-byte slice
+    const auto conv = boost::charconv::to_chars(slot, slot + BUF_SIZE, in[idx]);
+    out_lengths[idx] = static_cast<int>(conv.ptr - slot);  // distance from slot start to the returned end pointer
+}
+
+/**
+ * Host main routine: formats every input with to_chars on the GPU and compares text and length with the host.
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // First CUDA error reported by the kernel launch or by the synchronize
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Allocate the managed output vectors: one BUF_SIZE-byte slot and one length per element
+    cuda_managed_ptr<char> output_strings(numElements * BUF_SIZE);
+    cuda_managed_ptr<int> output_lengths(numElements);
+
+    // Fill the input with random values over the full test_type range
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the CUDA kernel with one thread per element (block count rounded up)
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_strings.get(), output_lengths.get(), numElements);
+    err = cudaGetLastError();  // a rejected launch is reported here
+    // Faults raised while the kernel runs come back from the synchronize call; keep the first error seen
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    if (err == cudaSuccess) { err = sync_err; }
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Kernel launch or execution failed (" << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Verify each GPU string against a host conversion (the timed span includes the comparison)
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        char cpu_buf[BUF_SIZE];
+        auto cpu_res = boost::charconv::to_chars(cpu_buf, cpu_buf + BUF_SIZE, input_vector[i]);
+        int cpu_len = static_cast<int>(cpu_res.ptr - cpu_buf);
+        int gpu_len = output_lengths[i];
+        const char* gpu_buf = &output_strings[i * BUF_SIZE];
+
+        if (cpu_len != gpu_len || std::memcmp(cpu_buf, gpu_buf, static_cast<std::size_t>(cpu_len)) != 0)
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    double t = w.elapsed();
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_to_chars_bases.cu b/test/test_unsigned_to_chars_bases.cu
new file mode 100644
index 00000000..2a4545a2
--- /dev/null
+++ b/test/test_unsigned_to_chars_bases.cu
@@ -0,0 +1,117 @@
+//  Copyright Matt Borland 2024 - 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <cstring>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/int128/charconv.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+constexpr int BUF_SIZE = 192;
+
+__global__ void cuda_test(const test_type *in, char *out_strings, int *out_lengths, int numElements, int base)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;  // global x-index of this thread
+    if (idx >= numElements)
+    {
+        return;  // index is past the last element: nothing to do
+    }
+    char* const slot = out_strings + idx * BUF_SIZE;  // this thread's own BUF_SIZE-byte slice
+    const auto conv = boost::charconv::to_chars(slot, slot + BUF_SIZE, in[idx], base);
+    out_lengths[idx] = static_cast<int>(conv.ptr - slot);  // distance from slot start to the returned end pointer
+}
+
+/**
+ * Host main routine: for every base from 2 to 36, formats fresh inputs on the GPU and compares with the host.
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // First CUDA error reported by the kernel launch or by the synchronize
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<test_type> input_vector(numElements);
+
+    // Allocate the managed output vectors: one BUF_SIZE-byte slot and one length per element
+    cuda_managed_ptr<char> output_strings(numElements * BUF_SIZE);
+    cuda_managed_ptr<int> output_lengths(numElements);
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+
+    for (int base = 2; base <= 36; ++base)
+    {
+        // Draw a new set of random inputs for this base
+        for (int i = 0; i < numElements; ++i)
+        {
+            input_vector[i] = dist(rng);
+        }
+
+        // Launch the CUDA kernel
+        std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads, base " << base << std::endl;
+
+        watch w;
+
+        cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_strings.get(), output_lengths.get(), numElements, base);
+        err = cudaGetLastError();  // a rejected launch is reported here
+        // Faults raised while the kernel runs come back from the synchronize call; keep the first error seen
+        const cudaError_t sync_err = cudaDeviceSynchronize();
+        if (err == cudaSuccess) { err = sync_err; }
+        std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+        if (err != cudaSuccess)
+        {
+            std::cerr << "Kernel launch or execution failed (" << cudaGetErrorString(err) << ")!" << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        // Verify each GPU string against a host conversion (the timed span includes the comparison)
+        w.reset();
+        for (int i = 0; i < numElements; ++i)
+        {
+            char cpu_buf[BUF_SIZE];
+            auto cpu_res = boost::charconv::to_chars(cpu_buf, cpu_buf + BUF_SIZE, input_vector[i], base);
+            int cpu_len = static_cast<int>(cpu_res.ptr - cpu_buf);
+            int gpu_len = output_lengths[i];
+            const char* gpu_buf = &output_strings[i * BUF_SIZE];
+
+            if (cpu_len != gpu_len || std::memcmp(cpu_buf, gpu_buf, static_cast<std::size_t>(cpu_len)) != 0)
+            {
+                std::cerr << "Result verification failed at element " << i << " base " << base << "!" << std::endl;
+                return EXIT_FAILURE;
+            }
+        }
+        double t = w.elapsed();
+
+        std::cout << "Test base " << base << " PASSED, normal calculation time: " << t << "s" << std::endl;
+    }
+
+    std::cout << "All bases PASSED" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_to_signed_conversion.cu b/test/test_unsigned_to_signed_conversion.cu
new file mode 100644
index 00000000..016e7a5d
--- /dev/null
+++ b/test/test_unsigned_to_signed_conversion.cu
@@ -0,0 +1,110 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using signed_type = boost::int128::int128_t;
+using unsigned_type = boost::int128::uint128_t;
+
+/**
+ * CUDA kernel: converts unsigned 128-bit values to signed 128-bit values.
+ *
+ * Each thread converts the element at its own index; indices >= numElements are skipped.
+ */
+__global__ void cuda_test(const unsigned_type *in, signed_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
+    if (idx >= numElements)
+    {
+        return;  // index is past the last element: nothing to do
+    }
+    out[idx] = signed_type(in[idx]);
+}
+
+/**
+ * Host main routine: runs the unsigned-to-signed conversion kernel and compares it with the host result.
+ */
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    // First CUDA error reported by the kernel launch or by the synchronize
+    cudaError_t err = cudaSuccess;
+
+    // Print the vector length to be used
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    // Allocate the managed input vector
+    cuda_managed_ptr<unsigned_type> input_vector(numElements);
+
+    // Allocate the managed output vector
+    cuda_managed_ptr<signed_type> output_vector(numElements);
+
+    // Initialize the input vector with values that fit in the signed range
+    boost::random::uniform_int_distribution<unsigned_type> dist {unsigned_type{0U}, static_cast<unsigned_type>((std::numeric_limits<signed_type>::max)())};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+    }
+
+    // Launch the CUDA kernel with one thread per element (block count rounded up)
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();  // a rejected launch is reported here
+    // Faults raised while the kernel runs come back from the synchronize call; keep the first error seen
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    if (err == cudaSuccess) { err = sync_err; }
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Kernel launch or execution failed (" << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Compute the reference result on the host (timed for comparison with the GPU run)
+    std::vector<signed_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(signed_type(input_vector[i]));
+    }
+    double t = w.elapsed();
+    // Every GPU element must equal the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_xor.cu b/test/test_unsigned_xor.cu
new file mode 100644
index 00000000..8201a432
--- /dev/null
+++ b/test/test_unsigned_xor.cu
@@ -0,0 +1,95 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    const int idx = blockDim.x * blockIdx.x + threadIdx.x;  // global x-index of this thread
+    if (idx >= numElements)
+    {
+        return;  // index is past the last element: nothing to do
+    }
+    out[idx] = in[idx] ^ in2[idx];  // element-wise XOR of the two inputs
+}
+
+int main(void)
+{
+    // Host driver: runs operator^ on the GPU and checks every element against the host result.
+    std::mt19937_64 rng {42};
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+    // Both operands are random values over the full test_type range
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (int i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+    // One thread per element; the block count is rounded up to cover the tail
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    err = cudaGetLastError();  // a rejected launch is reported here
+    // Faults raised while the kernel runs come back from the synchronize call; keep the first error seen
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    if (err == cudaSuccess) { err = sync_err; }
+    std::cout << "CUDA kernel done in: " << w.elapsed() << "s" << std::endl;
+
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Kernel launch or execution failed (" << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Compute the reference result on the host (timed for comparison with the GPU run)
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(input_vector[i] ^ input_vector2[i]);
+    }
+    double t = w.elapsed();
+    // Every GPU element must equal the host reference
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}