diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake index 9afa6e4b15..96677676ed 100644 --- a/.CMake/alg_support.cmake +++ b/.CMake/alg_support.cmake @@ -338,18 +338,36 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCT endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if(OQS_USE_CUPQC) + cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_512_cuda "" ON "OQS_ENABLE_KEM_ml_kem_512" OFF) +endif() +endif() + if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS)) cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_avx2 "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF) endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if(OQS_USE_CUPQC) + cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_cuda "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF) +endif() +endif() + if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS)) cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_avx2 "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF) endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if(OQS_USE_CUPQC) + cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_cuda "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF) +endif() +endif() + if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux") if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS)) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index bb412d17c1..8059ffc993 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -188,6 +188,19 @@ jobs: --numprocesses=auto \ --ignore=tests/test_code_conventions.py ${{ matrix.PYTEST_ARGS }}" + cupqc-buildcheck: + name: Check that code builds with OQS_USE_CUPQC=ON + runs-on: ubuntu-latest + container: openquantumsafe/ci-ubuntu-latest:latest + steps: + - name: Checkout code + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 + - name: Configure + run: mkdir build && cd build && cuPQC_DIR=/cupqc/cupqc/cupqc-pkg-0.2.0/cmake/ CUDACXX=/usr/local/cuda-12.6/bin/nvcc cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N .. + - name: Build code + run: ninja + working-directory: build + linux_cross_compile: runs-on: ubuntu-latest container: openquantumsafe/ci-ubuntu-latest:latest diff --git a/CMakeLists.txt b/CMakeLists.txt index 114961ed7f..1b4c2b1af6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ option(OQS_LIBJADE_BUILD "Enable formally verified implementation of supported a option(OQS_PERMIT_UNSUPPORTED_ARCHITECTURE "Permit compilation on an an unsupported architecture." OFF) option(OQS_STRICT_WARNINGS "Enable all compiler warnings." OFF) option(OQS_EMBEDDED_BUILD "Compile liboqs for an Embedded environment without a full standard library." OFF) +option(OQS_USE_CUPQC "Utilize cuPQC as the backend for supported PQC algorithms." OFF) # Libfuzzer isn't supported on gcc if('${CMAKE_C_COMPILER_ID}' STREQUAL 'Clang') @@ -140,6 +141,16 @@ else() message(FATAL_ERROR "Unknown or unsupported processor: " ${CMAKE_SYSTEM_PROCESSOR} ". Override by setting OQS_PERMIT_UNSUPPORTED_ARCHITECTURE=ON") endif() +if(${OQS_USE_CUPQC}) + # CMAKE's CUDA language requires CMAKE 3.18 + cmake_minimum_required (VERSION 3.18) + enable_language(CUDA) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 80 90) + endif() + find_package(cuPQC 0.2.0 REQUIRED) +endif() + if (NOT ((CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") AND (ARCH_X86_64 STREQUAL "ON")) AND (OQS_LIBJADE_BUILD STREQUAL "ON")) message(FATAL_ERROR "Building liboqs with libjade implementations from libjade is only supported on Linux and Darwin on x86_64.") endif() diff --git a/CONFIGURE.md b/CONFIGURE.md index 1d00565f31..02a8da2015 100644 --- a/CONFIGURE.md +++ b/CONFIGURE.md @@ -13,6 +13,7 @@ The following options can be passed to CMake before the build file generation pr - [OQS_DIST_BUILD](#OQS_DIST_BUILD) - [OQS_USE_CPUFEATURE_INSTRUCTIONS](#OQS_USE_CPUFEATURE_INSTRUCTIONS) - [OQS_USE_OPENSSL](#OQS_USE_OPENSSL) +- [OQS_USE_CUPQC](#OQS_USE_CUPQC) - [OQS_OPT_TARGET](#OQS_OPT_TARGET) - [OQS_SPEED_USE_ARM_PMU](#OQS_SPEED_USE_ARM_PMU) - [USE_SANITIZER](#USE_SANITIZER) @@ -124,6 +125,13 @@ Dynamically load OpenSSL through `dlopen`. When using liboqs from other cryptogr Only has an effect if the system supports `dlopen` and ELF binary format, such as Linux or BSD family. +### OQS_USE_CUPQC + +Can be `ON` or `OFF`. When `ON`, use NVIDIA's cuPQC library where able (currently just ML-KEM). When this option is enabled, liboqs may not run correctly on machines that lack supported GPUs. To download cuPQC follow the instructions at (https://developer.nvidia.com/cupqc-download/). Detailed descriptions of the API, requirements, and installation guide are in the cuPQC documentation (https://docs.nvidia.com/cuda/cupqc/index.html). While the code shipped by liboqs required to use cuPQC is licensed under Apache 2.0 the cuPQC SDK comes with its own license agreement (https://docs.nvidia.com/cuda/cupqc/license.html). + +**Default**: `OFF` + + ## Stateful Hash Based Signatures XMSS and LMS are the two supported Hash-Based Signatures schemes. diff --git a/PLATFORMS.md b/PLATFORMS.md index f1b3fc5ebd..544edd9319 100644 --- a/PLATFORMS.md +++ b/PLATFORMS.md @@ -63,3 +63,4 @@ In this policy, the words "must" and "must not" specify absolute requirements th - ppc641e for Ubuntu (Focal) - s390x for Ubuntu (Focal) - loongarch64 for Debian Linux (trixie) +- NVIDIA GPU architectures 70, 75, 80, 86, 89, and 90 with a x86_64 CPU for Linux diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md index d1806517ba..eeaf299dde 100644 --- a/docs/algorithms/kem/ml_kem.md +++ b/docs/algorithms/kem/ml_kem.md @@ -9,6 +9,10 @@ - **Primary Source**: - **Source**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0 +- **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches + - **cupqc-cuda**: + - **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e + - **Implementation license (SPDX-Identifier)**: Apache-2.0 ## Parameter set summary @@ -25,6 +29,7 @@ |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------| | [Primary Source](#primary-source) | ref | All | All | None | True | True | False | | [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False | +| [cupqc-cuda](#cupqc-cuda) | cuda | CUDA | Linux,Darwin | None | False | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. @@ -36,6 +41,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**. |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| | [Primary Source](#primary-source) | ref | All | All | None | True | True | False | | [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False | +| [cupqc-cuda](#cupqc-cuda) | cuda | CUDA | Linux,Darwin | None | False | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. @@ -45,6 +51,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**. |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| | [Primary Source](#primary-source) | ref | All | All | None | True | True | False | | [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False | +| [cupqc-cuda](#cupqc-cuda) | cuda | CUDA | Linux,Darwin | None | False | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml index 81ef2b6c4a..498617ff45 100644 --- a/docs/algorithms/kem/ml_kem.yml +++ b/docs/algorithms/kem/ml_kem.yml @@ -20,6 +20,10 @@ primary-upstream: source: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches spdx-license-identifier: CC0-1.0 or Apache-2.0 +optimized-upstreams: + cupqc-cuda: + source: https://github.com/praveksharma/cupqc-mlkem/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e + spdx-license-identifier: Apache-2.0 parameter-sets: - name: ML-KEM-512 claimed-nist-level: 1 @@ -54,6 +58,16 @@ parameter-sets: no-secret-dependent-branching-claimed: true no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false + - upstream: cupqc-cuda + upstream-id: cuda + supported-platforms: + - architecture: CUDA + operating_systems: + - Linux + - Darwin + no-secret-dependent-branching-claimed: false + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false - name: ML-KEM-768 claimed-nist-level: 3 claimed-security: IND-CCA2 @@ -87,6 +101,16 @@ parameter-sets: no-secret-dependent-branching-claimed: true no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false + - upstream: cupqc-cuda + upstream-id: cuda + supported-platforms: + - architecture: CUDA + operating_systems: + - Linux + - Darwin + no-secret-dependent-branching-claimed: false + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false - name: ML-KEM-1024 claimed-nist-level: 5 claimed-security: IND-CCA2 @@ -120,3 +144,13 @@ parameter-sets: no-secret-dependent-branching-claimed: true no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false + - upstream: cupqc-cuda + upstream-id: cuda + supported-platforms: + - architecture: CUDA + operating_systems: + - Linux + - Darwin + no-secret-dependent-branching-claimed: false + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false diff --git a/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment b/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment index daed5514c0..0830c024fd 100644 --- a/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment +++ b/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment @@ -11,6 +11,18 @@ if(OQS_DIST_X86_64_BUILD OR ({% for flag in platform['required_flags'] -%} OQS_U {%- endif %} endif() {% if platform['operating_systems'] %}endif() +{% endif -%} + {%- endfor -%} + {%- for platform in impl['supported_platforms'] if platform['architecture'] == 'CUDA' %} +{% if platform['operating_systems'] %}if(CMAKE_SYSTEM_NAME MATCHES "{{ platform['operating_systems']|join('|') }}") +{% endif -%} +if(OQS_USE_CUPQC) + cmake_dependent_option(OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['scheme'] }}_{{ impl['name'] }} "" ON "OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['scheme'] }}" OFF) +{%- if 'alias_scheme' in scheme %} + cmake_dependent_option(OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }} "" ON "OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['alias_scheme'] }}" OFF) +{%- endif %} +endif() +{% if platform['operating_systems'] %}endif() {% endif -%} {%- endfor -%} {%- for platform in impl['supported_platforms'] if platform['architecture'] == 'ARM64_V8' %} diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py index 400ecc57a0..46968fa33c 100755 --- a/scripts/copy_from_upstream/copy_from_upstream.py +++ b/scripts/copy_from_upstream/copy_from_upstream.py @@ -495,14 +495,15 @@ def handle_implementation(impl, family, scheme, dst_basedir): else: # determine list of files to copy: if 'sources' in i: - srcs = i['sources'].split(" ") - for s in srcs: - # Copy recursively only in case of directories not with plain files to avoid copying over symbolic links - if os.path.isfile(os.path.join(origfolder, s)): - subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) - else: - subprocess.run( - ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) + if i['sources']: + srcs = i['sources'].split(" ") + for s in srcs: + # Copy recursively only in case of directories not with plain files to avoid copying over symbolic links + if os.path.isfile(os.path.join(origfolder, s)): + subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) + else: + subprocess.run( + ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) else: subprocess.run(['cp', '-pr', os.path.join(origfolder, '.'), srcfolder]) # raise Exception("Malformed YML file: No sources listed to copy. Check upstream YML file." ) @@ -598,14 +599,15 @@ def process_families(instructions, basedir, with_kat, with_generator, with_libja # when provided to the compiler; OQS uses the term ARM_NEON if req['architecture'] == 'arm_8': req['architecture'] = 'ARM64_V8' - if req['architecture'] == 'ARM64_V8' and 'asimd' in req['required_flags']: - req['required_flags'].remove('asimd') - req['required_flags'].append('arm_neon') - if req['architecture'] == 'ARM64_V8' and 'sha3' in req['required_flags']: - req['required_flags'].remove('sha3') - req['required_flags'].append('arm_sha3') - impl['required_flags'] = req['required_flags'] - family['all_required_flags'].update(req['required_flags']) + if 'required_flags' in req: + if req['architecture'] == 'ARM64_V8' and 'asimd' in req['required_flags']: + req['required_flags'].remove('asimd') + req['required_flags'].append('arm_neon') + if req['architecture'] == 'ARM64_V8' and 'sha3' in req['required_flags']: + req['required_flags'].remove('sha3') + req['required_flags'].append('arm_sha3') + impl['required_flags'] = req['required_flags'] + family['all_required_flags'].update(req['required_flags']) except KeyError as ke: if (impl['name'] != family['default_implementation']): print("No required flags found for %s (KeyError %s on impl %s)" % ( diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index f80f0979d5..23d1f3a22d 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -38,6 +38,14 @@ upstreams: kem_meta_path: '{pretty_name_full}_META.yml' kem_scheme_path: '.' patches: [pqcrystals-ml_kem.patch] + - + name: cupqc + git_url: https://github.com/praveksharma/cupqc-mlkem.git + git_branch: main + git_commit: b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e + kem_meta_path: '{pretty_name_full}_META.yml' + kem_scheme_path: '.' + patches: [] - name: pqcrystals-dilithium git_url: https://github.com/pq-crystals/dilithium.git @@ -166,6 +174,10 @@ kems: - name: ml_kem default_implementation: ref + arch_specific_implementations: + cuda: cuda + arch_specific_upstream_locations: + cuda: cupqc upstream_location: pqcrystals-kyber-standard schemes: - diff --git a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt index ca9d41eac0..bd648d101d 100644 --- a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt +++ b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt @@ -33,11 +33,19 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}{%- if 'alias_scheme' in target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PUBLIC {{ impl['compile_opts'] }}) {%- endif -%} + {%- elif impl['name'] == 'cuda' %} + +if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %}) + add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/cupqc_ml-kem.cu) + target_link_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc) + set_property(TARGET {{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE {{ impl['compile_opts'] }}) {%- else %} if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %}) add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {% for source_file in impl['sources']|sort -%}{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/{{ source_file }}{%- if not loop.last %} {% endif -%}{%- endfor -%}) {%- endif %} + {%- if impl['name'] != 'cuda' %} target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}) target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) {%- if impl['name'] != scheme['default_implementation'] and impl['required_flags'] -%} @@ -60,6 +68,7 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if target_compile_definitions({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE old_gas_syntax) endif() {%- endif %} + {%- endif %}{# cupqc #} set(_{{ family|upper }}_OBJS ${_{{ family|upper }}_OBJS} $) endif() {%- endfor -%} diff --git a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c index 108078ffcd..630aee1389 100644 --- a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c +++ b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c @@ -93,7 +93,9 @@ extern int {{ scheme['metadata']['default_dec_signature'] }}(uint8_t *ss, const {%- endfor %} {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %} - +{% if impl['name'] == 'cuda'%} +#if defined(OQS_USE_CUPQC) + {%- endif %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- if impl['signature_keypair'] %} extern int {{ impl['signature_keypair'] }}(uint8_t *pk, uint8_t *sk); @@ -113,6 +115,9 @@ extern int {{ impl['signature_dec'] }}(uint8_t *ss, const uint8_t *ct, const uin extern int PQCLEAN_{{ scheme['pqclean_scheme_c']|upper }}_{{ impl['name']|upper }}_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); {%- endif %} #endif + {%- if impl['name'] == 'cuda'%} +#endif /* OQS_USE_CUPQC */ + {%- endif %} {%- endfor %} {%- if libjade_implementation is defined and scheme['libjade_implementation'] %} @@ -166,7 +171,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_keypair(uint8_t * {% endfor -%} #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/ {%- endif %} - {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %} + {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cuda' %} +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) + return (OQS_STATUS) {{ impl['signature_keypair'] }}(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} */ + {%- endfor %} + {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cuda') %} {%- if loop.first %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- else %} @@ -240,7 +250,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_encaps(uint8_t *c {% endfor -%} #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/ {%- endif %} - {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %} + {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cuda' %} +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) + return (OQS_STATUS) {{ impl['signature_enc'] }}(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} */ + {%- endfor %} + {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cuda') %} {%- if loop.first %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- else %} @@ -314,7 +329,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_decaps(uint8_t *s {% endfor -%} #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/ {%- endif %} - {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %} + {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cuda' %} +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) + return (OQS_STATUS) {{ impl['signature_dec'] }}(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} */ + {%- endfor %} + {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cuda') %} {%- if loop.first %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- else %} diff --git a/scripts/copy_from_upstream/update_upstream_alg_docs.py b/scripts/copy_from_upstream/update_upstream_alg_docs.py index 33483067e2..ba765b84c1 100755 --- a/scripts/copy_from_upstream/update_upstream_alg_docs.py +++ b/scripts/copy_from_upstream/update_upstream_alg_docs.py @@ -95,8 +95,43 @@ def update_upstream_kem_alg_docs(liboqs_root, kems, upstream_info, write_changes oqs_yaml_path = os.path.join(liboqs_root, 'docs', 'algorithms', 'kem', '{}.yml'.format(kem['name'])) if os.path.isfile(oqs_yaml_path): oqs_yaml = load_yaml(oqs_yaml_path) + + upstream_base_url = ui['git_url'][:-len(".git")] + # upstream is special: We will take the upstream git commit information + # (possibly with added patch comment) as it is what drove the update + + # Need to check if yml is of old format. If so, update to new format + if 'primary-upstream' not in oqs_yaml: + print("Updating format of {}. Please double check ordering of yaml file".format(scheme['pretty_name_full'])) + lhs = oqs_yaml['upstream'] + oqs_yaml['primary-upstream'] = dict() + oqs_yaml['primary-upstream']['spdx-license-identifier'] = oqs_yaml['spdx-license-identifier'] + for i in range(len(oqs_yaml['parameter-sets'])): + for j in range(len(oqs_yaml['parameter-sets'][i]['implementations'])): + oqs_yaml['parameter-sets'][i]['implementations'][j]['upstream'] = 'primary-upstream' else: - continue + lhs = oqs_yaml['primary-upstream']['source'] + oqs_yaml['primary-upstream']['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+patches_done).format(upstream_base_url, ui['git_commit']), "primary-upstream") + if 'upstream' in oqs_yaml: + del oqs_yaml['upstream'] + del oqs_yaml['spdx-license-identifier'] + + if ouis: + for upstream in ouis: + optimized_upstream_base_url = ouis[upstream]['git_url'][:-len(".git")] + optimized_patches_done="" + if 'patches' in ouis[upstream]: + for patchfilename in ouis[upstream]['patches']: + if kem['name'] in patchfilename: + optimized_patches_done=" with copy_from_upstream patches" + if 'optimized-upstreams' in oqs_yaml and upstream in oqs_yaml['optimized-upstreams']: + lhs = oqs_yaml['optimized-upstreams'][upstream]['source'] + else: + lhs = '' + oqs_yaml['optimized-upstreams'] = oqs_yaml.get('optimized-upstreams', dict()) + oqs_yaml['optimized-upstreams'][upstream] = oqs_yaml['optimized-upstreams'].get(upstream, dict()) + git_commit = ouis[upstream]['git_commit'] + oqs_yaml['optimized-upstreams'][upstream]['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+optimized_patches_done).format(optimized_upstream_base_url, git_commit), "optimized-upstreams") # We cannot assume that the ordering of "parameter-sets" # in the OQS YAML files matches that of copy_from_upstream.yml @@ -111,45 +146,6 @@ def update_upstream_kem_alg_docs(liboqs_root, kems, upstream_info, write_changes oqs_yaml['type'] = rhs_if_not_equal(oqs_yaml['type'], upstream_yaml['type'], "type") oqs_yaml['principal-submitters'] = rhs_if_not_equal(oqs_yaml['principal-submitters'], upstream_yaml['principal-submitters'], "principal-submitters") - upstream_base_url = ui['git_url'][:-len(".git")] - # upstream is special: We will take the upstream git commit information - # (possibly with added patch comment) as it is what drove the update - - # Need to check if yml is of old format. If so, update to new format - if 'primary-upstream' not in oqs_yaml: - print("Updating format of {}. Please double check ordering of yaml file".format(scheme['pretty_name_full'])) - lhs = oqs_yaml['upstream'] - oqs_yaml['primary-upstream'] = dict() - oqs_yaml['primary-upstream']['spdx-license-identifier'] = oqs_yaml['spdx-license-identifier'] - for i in range(len(oqs_yaml['parameter-sets'])): - for j in range(len(oqs_yaml['parameter-sets'][i]['implementations'])): - oqs_yaml['parameter-sets'][i]['implementations'][j]['upstream'] = 'primary-upstream' - else: - lhs = oqs_yaml['primary-upstream']['source'] - oqs_yaml['primary-upstream']['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+patches_done).format(upstream_base_url, ui['git_commit']), "primary-upstream") - if 'upstream' in oqs_yaml: - del oqs_yaml['upstream'] - del oqs_yaml['spdx-license-identifier'] - - if ouis: - for upstream in ouis: - optimized_upstream_base_url = ouis[upstream]['git_url'][:-len(".git")] - for patchfilename in ouis[upstream]['patches']: - if kem['name'] in patchfilename: - patches_done=" with copy_from_upstream patches" - patches_done="" - if 'patches' in ouis[upstream]: - for patchfilename in ouis[upstream]['patches']: - if kem['name'] in patchfilename: - patches_done=" with copy_from_upstream patches" - if 'optimized-upstreams' in oqs_yaml and upstream in oqs_yaml['optimized-upstreams']: - lhs = oqs_yaml['optimized-upstreams'][upstream]['source'] - else: - lhs = '' - git_commit = ouis[upstream]['git_commit'] - oqs_yaml['optimized-upstreams'][upstream]['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+patches_done).format(optimized_upstream_base_url, git_commit), "optimized-upstreams") - - if 'auxiliary-submitters' in upstream_yaml: oqs_yaml['auxiliary-submitters'] = rhs_if_not_equal(oqs_yaml['auxiliary-submitters'] if 'auxiliary-submitters' in oqs_yaml else '', upstream_yaml['auxiliary-submitters'], "auxiliary-submitters") @@ -204,7 +200,7 @@ def update_upstream_kem_alg_docs(liboqs_root, kems, upstream_info, write_changes upstream_impl['supported_platforms'][i]['architecture'] = 'ARM64_V8' if 'asimd' in upstream_impl['supported_platforms'][i]['required_flags']: upstream_impl['supported_platforms'][i]['required_flags'].remove('asimd') - if not upstream_impl['supported_platforms'][i]['required_flags']: + if 'required_flags' in upstream_impl['supported_platforms'][i] and not upstream_impl['supported_platforms'][i]['required_flags']: del upstream_impl['supported_platforms'][i]['required_flags'] impl['supported-platforms'] = rhs_if_not_equal(impl['supported-platforms'], upstream_impl['supported_platforms'], "supported-platforms") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a6bca7d998..8f0ac14b6b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -99,6 +99,11 @@ if(${OQS_USE_OPENSSL}) target_link_libraries(oqs-internal PRIVATE ${OPENSSL_CRYPTO_LIBRARY}) endif() endif() +if(${OQS_USE_CUPQC}) + set_property(TARGET oqs PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + target_link_libraries(oqs PRIVATE cupqc) + target_link_options(oqs PRIVATE $) +endif() target_include_directories(oqs PUBLIC diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt index 14cc9b850d..8af79b6d05 100644 --- a/src/kem/ml_kem/CMakeLists.txt +++ b/src/kem/ml_kem/CMakeLists.txt @@ -23,6 +23,14 @@ if(OQS_ENABLE_KEM_ml_kem_512_avx2) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() +if(OQS_ENABLE_KEM_ml_kem_512_cuda) + add_library(ml_kem_512_cuda OBJECT cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu) + target_link_libraries(ml_kem_512_cuda cupqc) + set_property(TARGET ml_kem_512_cuda PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options(ml_kem_512_cuda PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) + set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) +endif() + if(OQS_ENABLE_KEM_ml_kem_768) add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c pqcrystals-kyber-standard_ml-kem-768_ref/cbd.c pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.c pqcrystals-kyber-standard_ml-kem-768_ref/kem.c pqcrystals-kyber-standard_ml-kem-768_ref/ntt.c pqcrystals-kyber-standard_ml-kem-768_ref/poly.c pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.c pqcrystals-kyber-standard_ml-kem-768_ref/reduce.c pqcrystals-kyber-standard_ml-kem-768_ref/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-768_ref/verify.c) target_compile_options(ml_kem_768_ref PUBLIC -DKYBER_K=3) @@ -41,6 +49,14 @@ if(OQS_ENABLE_KEM_ml_kem_768_avx2) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() +if(OQS_ENABLE_KEM_ml_kem_768_cuda) + add_library(ml_kem_768_cuda OBJECT cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu) + target_link_libraries(ml_kem_768_cuda cupqc) + set_property(TARGET ml_kem_768_cuda PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options(ml_kem_768_cuda PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) + set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) +endif() + if(OQS_ENABLE_KEM_ml_kem_1024) add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.c pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.c pqcrystals-kyber-standard_ml-kem-1024_ref/kem.c pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.c pqcrystals-kyber-standard_ml-kem-1024_ref/poly.c pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.c pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.c pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-1024_ref/verify.c) target_compile_options(ml_kem_1024_ref PUBLIC -DKYBER_K=4) @@ -59,4 +75,12 @@ if(OQS_ENABLE_KEM_ml_kem_1024_avx2) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() +if(OQS_ENABLE_KEM_ml_kem_1024_cuda) + add_library(ml_kem_1024_cuda OBJECT cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu) + target_link_libraries(ml_kem_1024_cuda cupqc) + set_property(TARGET ml_kem_1024_cuda PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options(ml_kem_1024_cuda PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) + set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) +endif() + set(ML_KEM_OBJS ${_ML_KEM_OBJS} PARENT_SCOPE) diff --git a/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu new file mode 100644 index 0000000000..188e2f100d --- /dev/null +++ b/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu @@ -0,0 +1,172 @@ +/* + * Copyright 2025 Nvidia Corporation + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +**/ + +#include +#include +#include + +using namespace cupqc; + +// Checks the return value from a CUDA API function +#define CUDA_CHECK(err) \ + if (err != cudaSuccess) { failure = true; goto cleanup; } + +template +__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; + MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); +} + +template +int keypair(uint8_t *pk, uint8_t *sk) { + using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_pk = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); + + // Run routine + keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_pk != nullptr) cudaFree(d_pk); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; + MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); +} + +template +int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); + + // Run routine + encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_pk != nullptr) cudaFree(d_pk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { + __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; + MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); +} + +template +int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); + + // Run routine + decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + + return failure ? -1 : 0; +} + +extern "C" { + using KEM_1024 = decltype(ML_KEM_1024() + Block()); + +#if defined(OQS_ENABLE_KEM_ml_kem_1024_cuda) + int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif +} diff --git a/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu new file mode 100644 index 0000000000..48ca5ca31a --- /dev/null +++ b/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu @@ -0,0 +1,172 @@ +/* + * Copyright 2025 Nvidia Corporation + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +**/ + +#include +#include +#include + +using namespace cupqc; + +// Checks the return value from a CUDA API function +#define CUDA_CHECK(err) \ + if (err != cudaSuccess) { failure = true; goto cleanup; } + +template +__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; + MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); +} + +template +int keypair(uint8_t *pk, uint8_t *sk) { + using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_pk = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); + + // Run routine + keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_pk != nullptr) cudaFree(d_pk); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; + MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); +} + +template +int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); + + // Run routine + encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_pk != nullptr) cudaFree(d_pk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { + __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; + MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); +} + +template +int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); + + // Run routine + decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + + return failure ? -1 : 0; +} + +extern "C" { + using KEM_512 = decltype(ML_KEM_512() + Block()); + +#if defined(OQS_ENABLE_KEM_ml_kem_512_cuda) + int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif +} diff --git a/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu new file mode 100644 index 0000000000..594c1f4c24 --- /dev/null +++ b/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu @@ -0,0 +1,172 @@ +/* + * Copyright 2025 Nvidia Corporation + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +**/ + +#include +#include +#include + +using namespace cupqc; + +// Checks the return value from a CUDA API function +#define CUDA_CHECK(err) \ + if (err != cudaSuccess) { failure = true; goto cleanup; } + +template +__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; + MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); +} + +template +int keypair(uint8_t *pk, uint8_t *sk) { + using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_pk = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); + + // Run routine + keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_pk != nullptr) cudaFree(d_pk); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; + MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); +} + +template +int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); + + // Run routine + encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_pk != nullptr) cudaFree(d_pk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { + __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; + MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); +} + +template +int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); + + // Run routine + decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + + return failure ? -1 : 0; +} + +extern "C" { + using KEM_768 = decltype(ML_KEM_768() + Block()); + +#if defined(OQS_ENABLE_KEM_ml_kem_768_cuda) + int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif +} diff --git a/src/kem/ml_kem/kem_ml_kem_1024.c b/src/kem/ml_kem/kem_ml_kem_1024.c index bc533aef9e..1e471af58a 100644 --- a/src/kem/ml_kem/kem_ml_kem_1024.c +++ b/src/kem/ml_kem/kem_ml_kem_1024.c @@ -40,7 +40,18 @@ extern int pqcrystals_ml_kem_1024_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8 extern int pqcrystals_ml_kem_1024_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif +#if defined(OQS_USE_CUPQC) +#if defined(OQS_ENABLE_KEM_ml_kem_1024_cuda) +extern int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk); +extern int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#endif +#endif /* OQS_USE_CUPQC */ + OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cuda) + return (OQS_STATUS) cupqc_ml_kem_1024_keypair(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -57,6 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *sec } OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cuda) + return (OQS_STATUS) cupqc_ml_kem_1024_enc(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -73,6 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shar } OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cuda) + return (OQS_STATUS) cupqc_ml_kem_1024_dec(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { diff --git a/src/kem/ml_kem/kem_ml_kem_512.c b/src/kem/ml_kem/kem_ml_kem_512.c index f2dcde53d2..41805f91be 100644 --- a/src/kem/ml_kem/kem_ml_kem_512.c +++ b/src/kem/ml_kem/kem_ml_kem_512.c @@ -40,7 +40,18 @@ extern int pqcrystals_ml_kem_512_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_ extern int pqcrystals_ml_kem_512_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif +#if defined(OQS_USE_CUPQC) +#if defined(OQS_ENABLE_KEM_ml_kem_512_cuda) +extern int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk); +extern int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#endif +#endif /* OQS_USE_CUPQC */ + OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cuda) + return (OQS_STATUS) cupqc_ml_kem_512_keypair(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -57,6 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secr } OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cuda) + return (OQS_STATUS) cupqc_ml_kem_512_enc(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -73,6 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *share } OQS_API OQS_STATUS OQS_KEM_ml_kem_512_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cuda) + return (OQS_STATUS) cupqc_ml_kem_512_dec(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { diff --git a/src/kem/ml_kem/kem_ml_kem_768.c b/src/kem/ml_kem/kem_ml_kem_768.c index 14eb6ba404..11a7421b20 100644 --- a/src/kem/ml_kem/kem_ml_kem_768.c +++ b/src/kem/ml_kem/kem_ml_kem_768.c @@ -40,7 +40,18 @@ extern int pqcrystals_ml_kem_768_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_ extern int pqcrystals_ml_kem_768_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif +#if defined(OQS_USE_CUPQC) +#if defined(OQS_ENABLE_KEM_ml_kem_768_cuda) +extern int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk); +extern int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#endif +#endif /* OQS_USE_CUPQC */ + OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cuda) + return (OQS_STATUS) cupqc_ml_kem_768_keypair(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -57,6 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secr } OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cuda) + return (OQS_STATUS) cupqc_ml_kem_768_enc(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -73,6 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *share } OQS_API OQS_STATUS OQS_KEM_ml_kem_768_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cuda) + return (OQS_STATUS) cupqc_ml_kem_768_dec(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake index 967c35e64e..eb21d7b003 100644 --- a/src/oqsconfig.h.cmake +++ b/src/oqsconfig.h.cmake @@ -69,6 +69,8 @@ #cmakedefine OQS_ENABLE_SHA3_xkcp_low_avx2 1 +#cmakedefine01 OQS_USE_CUPQC + #cmakedefine OQS_ENABLE_KEM_BIKE 1 #cmakedefine OQS_ENABLE_KEM_bike_l1 1 #cmakedefine OQS_ENABLE_KEM_bike_l3 1 @@ -129,10 +131,13 @@ #cmakedefine OQS_ENABLE_KEM_ML_KEM 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_512 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_512_avx2 1 +#cmakedefine OQS_ENABLE_KEM_ml_kem_512_cuda 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_768 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_768_avx2 1 +#cmakedefine OQS_ENABLE_KEM_ml_kem_768_cuda 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024_avx2 1 +#cmakedefine OQS_ENABLE_KEM_ml_kem_1024_cuda 1 #cmakedefine OQS_ENABLE_SIG_DILITHIUM 1 #cmakedefine OQS_ENABLE_SIG_dilithium_2 1