From fb2a8e500d970740c7bc2ad4de310f8e911f2dd7 Mon Sep 17 00:00:00 2001 From: Steven Reeves Date: Wed, 15 Jan 2025 07:42:12 -0800 Subject: [PATCH 01/17] Adding cuPQC as a backend for ML-KEM. Signed-off-by: Steven Reeves --- CMakeLists.txt | 10 + CONFIGURE.md | 8 + .../add_enable_by_alg_conditional.fragment | 12 + .../patches/pqcrystals-ml_kem.patch | 271 +++++++++++++++++- .../src/kem/family/CMakeLists.txt | 9 + .../src/kem/family/kem_scheme.c | 28 +- src/CMakeLists.txt | 5 + src/kem/ml_kem/CMakeLists.txt | 24 ++ src/kem/ml_kem/kem_ml_kem_1024.c | 17 ++ src/kem/ml_kem/kem_ml_kem_512.c | 17 ++ src/kem/ml_kem/kem_ml_kem_768.c | 17 ++ .../cupqc_ml-kem.cu | 198 +++++++++++++ .../cupqc_ml-kem.cu | 198 +++++++++++++ .../cupqc_ml-kem.cu | 198 +++++++++++++ src/oqsconfig.h.cmake | 5 + 15 files changed, 1007 insertions(+), 10 deletions(-) create mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu create mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu create mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 114961ed7f..09667e71ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -140,6 +140,16 @@ else() message(FATAL_ERROR "Unknown or unsupported processor: " ${CMAKE_SYSTEM_PROCESSOR} ". Override by setting OQS_PERMIT_UNSUPPORTED_ARCHITECTURE=ON") endif() +if(${OQS_USE_CUPQC}) + # CMAKE's CUDA language requires CMAKE 3.18 + cmake_minimum_required (VERSION 3.18) + enable_language(CUDA) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 80 90) + endif() + find_package(cuPQC 0.2.0 REQUIRED) +endif() + if (NOT ((CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") AND (ARCH_X86_64 STREQUAL "ON")) AND (OQS_LIBJADE_BUILD STREQUAL "ON")) message(FATAL_ERROR "Building liboqs with libjade implementations from libjade is only supported on Linux and Darwin on x86_64.") endif() diff --git a/CONFIGURE.md b/CONFIGURE.md index 1d00565f31..67fd7d0aab 100644 --- a/CONFIGURE.md +++ b/CONFIGURE.md @@ -13,6 +13,7 @@ The following options can be passed to CMake before the build file generation pr - [OQS_DIST_BUILD](#OQS_DIST_BUILD) - [OQS_USE_CPUFEATURE_INSTRUCTIONS](#OQS_USE_CPUFEATURE_INSTRUCTIONS) - [OQS_USE_OPENSSL](#OQS_USE_OPENSSL) +- [OQS_USE_CUPQC](#OQS_USE_CUPQC) - [OQS_OPT_TARGET](#OQS_OPT_TARGET) - [OQS_SPEED_USE_ARM_PMU](#OQS_SPEED_USE_ARM_PMU) - [USE_SANITIZER](#USE_SANITIZER) @@ -124,6 +125,13 @@ Dynamically load OpenSSL through `dlopen`. When using liboqs from other cryptogr Only has an effect if the system supports `dlopen` and ELF binary format, such as Linux or BSD family. +### OQS_USE_CUPQC + +Can be `ON` or `OFF`. When `ON`, use NVIDIA's cuPQC library where able (currently just ML-KEM). When this option is enabled, liboqs may not run correctly on machines that lack supported GPUs. To download cuPQC follow the instructions at (https://developer.nvidia.com/cupqc-download/). Detailed descriptions of the API, requirments, and installation guide are in the cuPQC documentation (https://docs.nvidia.com/cuda/cupqc/index.html). + +**Default**: `OFF` + + ## Stateful Hash Based Signatures XMSS and LMS are the two supported Hash-Based Signatures schemes. diff --git a/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment b/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment index daed5514c0..0830c024fd 100644 --- a/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment +++ b/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment @@ -11,6 +11,18 @@ if(OQS_DIST_X86_64_BUILD OR ({% for flag in platform['required_flags'] -%} OQS_U {%- endif %} endif() {% if platform['operating_systems'] %}endif() +{% endif -%} + {%- endfor -%} + {%- for platform in impl['supported_platforms'] if platform['architecture'] == 'CUDA' %} +{% if platform['operating_systems'] %}if(CMAKE_SYSTEM_NAME MATCHES "{{ platform['operating_systems']|join('|') }}") +{% endif -%} +if(OQS_USE_CUPQC) + cmake_dependent_option(OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['scheme'] }}_{{ impl['name'] }} "" ON "OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['scheme'] }}" OFF) +{%- if 'alias_scheme' in scheme %} + cmake_dependent_option(OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }} "" ON "OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['alias_scheme'] }}" OFF) +{%- endif %} +endif() +{% if platform['operating_systems'] %}endif() {% endif -%} {%- endfor -%} {%- for platform in impl['supported_platforms'] if platform['architecture'] == 'ARM64_V8' %} diff --git a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch index d65eea2f30..a44b1c232e 100644 --- a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch +++ b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch @@ -1,8 +1,8 @@ diff --git a/Kyber1024_META.yml b/ML-KEM-1024_META.yml -similarity index 55% +similarity index 50% rename from Kyber1024_META.yml rename to ML-KEM-1024_META.yml -index baa5ca3..67243b8 100644 +index baa5ca3..42f147e 100644 --- a/Kyber1024_META.yml +++ b/ML-KEM-1024_META.yml @@ -1,4 +1,4 @@ @@ -55,11 +55,29 @@ index baa5ca3..67243b8 100644 supported_platforms: - architecture: x86_64 operating_systems: +@@ -47,3 +45,17 @@ implementations: + - avx2 + - bmi2 + - popcnt ++ - name: cupqc ++ version: FIPS203 ++ compile_opts: $<$:-rdc=true -dlto -arch=compute_70> ++ signature_keypair: ml_kem_1024_cupqc_keypair ++ signature_enc: ml_kem_1024_cupqc_enc ++ signature_dec: ml_kem_1024_cupqc_dec ++ sources: cupqc_ml-kem.cu ++ supported_platforms: ++ - architecture: CUDA ++ operating_systems: ++ - Linux ++ - Darwin ++ required_flags: ++ - dummy diff --git a/Kyber512_META.yml b/ML-KEM-512_META.yml -similarity index 55% +similarity index 50% rename from Kyber512_META.yml rename to ML-KEM-512_META.yml -index b251701..18c28b0 100644 +index b251701..a74f7ec 100644 --- a/Kyber512_META.yml +++ b/ML-KEM-512_META.yml @@ -1,4 +1,4 @@ @@ -112,11 +130,30 @@ index b251701..18c28b0 100644 supported_platforms: - architecture: x86_64 operating_systems: +@@ -47,3 +45,17 @@ implementations: + - avx2 + - bmi2 + - popcnt ++ - name: cupqc ++ version: FIPS203 ++ compile_opts: $<$:-rdc=true -dlto -arch=compute_70> ++ signature_keypair: ml_kem_512_cupqc_keypair ++ signature_enc: ml_kem_512_cupqc_enc ++ signature_dec: ml_kem_512_cupqc_dec ++ sources: cupqc_ml-kem.cu ++ supported_platforms: ++ - architecture: CUDA ++ operating_systems: ++ - Linux ++ - Darwin ++ required_flags: ++ - dummy +\ No newline at end of file diff --git a/Kyber768_META.yml b/ML-KEM-768_META.yml -similarity index 55% +similarity index 50% rename from Kyber768_META.yml rename to ML-KEM-768_META.yml -index 7a0cc3d..ccc03c9 100644 +index 7a0cc3d..2c0e23f 100644 --- a/Kyber768_META.yml +++ b/ML-KEM-768_META.yml @@ -1,4 +1,4 @@ @@ -169,6 +206,24 @@ index 7a0cc3d..ccc03c9 100644 supported_platforms: - architecture: x86_64 operating_systems: +@@ -47,3 +45,17 @@ implementations: + - avx2 + - bmi2 + - popcnt ++ - name: cupqc ++ version: FIPS203 ++ compile_opts: $<$:-rdc=true -dlto -arch=compute_70> ++ signature_keypair: ml_kem_768_cupqc_keypair ++ signature_enc: ml_kem_768_cupqc_enc ++ signature_dec: ml_kem_768_cupqc_dec ++ sources: cupqc_ml-kem.cu ++ supported_platforms: ++ - architecture: CUDA ++ operating_systems: ++ - Linux ++ - Darwin ++ required_flags: ++ - dummy diff --git a/avx2/indcpa.c b/avx2/indcpa.c index 18b9d08..c4b2b3a 100644 --- a/avx2/indcpa.c @@ -356,6 +411,210 @@ index 627b891..e4941f7 100644 const uint8_t seed[KYBER_SYMBYTES], uint8_t x, uint8_t y); +diff --git a/cupqc/cupqc_ml-kem.cu b/cupqc/cupqc_ml-kem.cu +new file mode 100644 +index 0000000..2935c2c +--- /dev/null ++++ b/cupqc/cupqc_ml-kem.cu +@@ -0,0 +1,198 @@ ++/* ++ * Copyright 2025 Nvidia Corporation ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++**/ ++ ++#include ++#include ++#include ++ ++using namespace cupqc; ++ ++// Checks the return value from a CUDA API function ++#define CUDA_CHECK(err) \ ++ if (err != cudaSuccess) { failure = true; goto cleanup; } ++ ++template ++__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { ++ __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; ++ MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); ++} ++ ++template ++int keypair(uint8_t *pk, uint8_t *sk) { ++ using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); ++ ++ bool failure = false; ++ uint8_t *workspace = nullptr, *randombytes=nullptr; ++ uint8_t *d_pk = nullptr, *d_sk = nullptr; ++ ++ // Allocate device workspaces ++ try { ++ workspace = make_workspace(1); ++ randombytes = get_entropy(1); ++ } catch (const std::runtime_error& ex) { ++ failure = true; ++ goto cleanup; ++ } ++ CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); ++ CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); ++ ++ // Run routine ++ keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); ++ ++ // Copy data back to the host ++ CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); ++ CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); ++ ++cleanup: ++ // Free device memory ++ if (d_pk != nullptr) cudaFree(d_pk); ++ if (d_sk != nullptr) cudaFree(d_sk); ++ if (workspace != nullptr) destroy_workspace(workspace); ++ if (randombytes != nullptr) release_entropy(randombytes); ++ ++ return failure ? -1 : 0; ++} ++ ++template ++__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { ++ __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; ++ MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); ++} ++ ++template ++int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { ++ using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); ++ ++ bool failure = false; ++ uint8_t *workspace = nullptr, *randombytes=nullptr; ++ uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; ++ ++ // Allocate device workspaces ++ try { ++ workspace = make_workspace(1); ++ randombytes = get_entropy(1); ++ } catch (const std::runtime_error& ex) { ++ failure = true; ++ goto cleanup; ++ } ++ CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); ++ CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); ++ CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); ++ ++ // Copy data to GPU ++ CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); ++ ++ // Run routine ++ encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); ++ ++ // Copy data back to the host ++ CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); ++ CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); ++ ++cleanup: ++ // Free device memory ++ if (d_ct != nullptr) cudaFree(d_ct); ++ if (d_ss != nullptr) cudaFree(d_ss); ++ if (d_pk != nullptr) cudaFree(d_pk); ++ if (workspace != nullptr) destroy_workspace(workspace); ++ if (randombytes != nullptr) release_entropy(randombytes); ++ ++ return failure ? -1 : 0; ++} ++ ++template ++__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { ++ __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; ++ MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); ++} ++ ++template ++int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { ++ using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); ++ ++ bool failure = false; ++ uint8_t *workspace = nullptr; ++ uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; ++ ++ // Allocate device workspaces ++ try { ++ workspace = make_workspace(1); ++ } catch (const std::runtime_error& ex) { ++ failure = true; ++ goto cleanup; ++ } ++ CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); ++ CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); ++ CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); ++ ++ // Copy data to GPU ++ CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); ++ CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); ++ ++ // Run routine ++ decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); ++ ++ // Copy data back to the host ++ CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); ++ ++cleanup: ++ // Free device memory ++ if (d_ct != nullptr) cudaFree(d_ct); ++ if (d_ss != nullptr) cudaFree(d_ss); ++ if (d_sk != nullptr) cudaFree(d_sk); ++ if (workspace != nullptr) destroy_workspace(workspace); ++ ++ return failure ? -1 : 0; ++} ++ ++extern "C" { ++ using KEM_512 = decltype(ML_KEM_512() + Block()); ++ using KEM_768 = decltype(ML_KEM_768() + Block()); ++ using KEM_1024 = decltype(ML_KEM_1024() + Block()); ++ ++#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) ++ int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { ++ return keypair(pk, sk); ++ } ++ int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { ++ return encaps(ct, ss, pk); ++ } ++ int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { ++ return decaps(ss, ct, sk); ++ } ++#endif ++ ++#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) ++ int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { ++ return keypair(pk, sk); ++ } ++ int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { ++ return encaps(ct, ss, pk); ++ } ++ int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { ++ return decaps(ss, ct, sk); ++ } ++#endif ++ ++#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) ++ int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { ++ return keypair(pk, sk); ++ } ++ int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { ++ return encaps(ct, ss, pk); ++ } ++ int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { ++ return decaps(ss, ct, sk); ++ } ++#endif ++} diff --git a/ref/indcpa.c b/ref/indcpa.c index 9a78c09..726cfa9 100644 --- a/ref/indcpa.c diff --git a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt index ca9d41eac0..100ddaab7e 100644 --- a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt +++ b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt @@ -33,11 +33,19 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}{%- if 'alias_scheme' in target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PUBLIC {{ impl['compile_opts'] }}) {%- endif -%} + {%- elif impl['name'] == 'cupqc' %} + +if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %}) + add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/cupqc_ml-kem.cu) + target_include_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc) + set_property(TARGET {{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE {{ impl['compile_opts'] }}) {%- else %} if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %}) add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {% for source_file in impl['sources']|sort -%}{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/{{ source_file }}{%- if not loop.last %} {% endif -%}{%- endfor -%}) {%- endif %} + {%- if impl['name'] != 'cupqc' %} target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}) target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) {%- if impl['name'] != scheme['default_implementation'] and impl['required_flags'] -%} @@ -60,6 +68,7 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if target_compile_definitions({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE old_gas_syntax) endif() {%- endif %} + {%- endif %}{# cupqc #} set(_{{ family|upper }}_OBJS ${_{{ family|upper }}_OBJS} $) endif() {%- endfor -%} diff --git a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c index 108078ffcd..cf8a705031 100644 --- a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c +++ b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c @@ -93,7 +93,9 @@ extern int {{ scheme['metadata']['default_dec_signature'] }}(uint8_t *ss, const {%- endfor %} {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %} - +{% if impl['name'] == 'cupqc'%} +#if defined(OQS_USE_CUPQC) + {%- endif %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- if impl['signature_keypair'] %} extern int {{ impl['signature_keypair'] }}(uint8_t *pk, uint8_t *sk); @@ -113,6 +115,9 @@ extern int {{ impl['signature_dec'] }}(uint8_t *ss, const uint8_t *ct, const uin extern int PQCLEAN_{{ scheme['pqclean_scheme_c']|upper }}_{{ impl['name']|upper }}_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); {%- endif %} #endif + {%- if impl['name'] == 'cupqc'%} +#endif /* OQS_USE_CUPQC */ + {%- endif %} {%- endfor %} {%- if libjade_implementation is defined and scheme['libjade_implementation'] %} @@ -166,7 +171,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_keypair(uint8_t * {% endfor -%} #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/ {%- endif %} - {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %} + {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %} +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc) + return (OQS_STATUS) {{ impl['signature_keypair'] }}(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */ + {%- endfor %} + {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %} {%- if loop.first %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- else %} @@ -240,7 +250,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_encaps(uint8_t *c {% endfor -%} #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/ {%- endif %} - {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %} + {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %} +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc) + return (OQS_STATUS) {{ impl['signature_enc'] }}(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */ + {%- endfor %} + {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %} {%- if loop.first %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- else %} @@ -314,7 +329,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_decaps(uint8_t *s {% endfor -%} #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/ {%- endif %} - {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %} + {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %} +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc) + return (OQS_STATUS) {{ impl['signature_dec'] }}(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */ + {%- endfor %} + {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %} {%- if loop.first %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- else %} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a6bca7d998..8f0ac14b6b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -99,6 +99,11 @@ if(${OQS_USE_OPENSSL}) target_link_libraries(oqs-internal PRIVATE ${OPENSSL_CRYPTO_LIBRARY}) endif() endif() +if(${OQS_USE_CUPQC}) + set_property(TARGET oqs PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + target_link_libraries(oqs PRIVATE cupqc) + target_link_options(oqs PRIVATE $) +endif() target_include_directories(oqs PUBLIC diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt index 14cc9b850d..50d5537d1b 100644 --- a/src/kem/ml_kem/CMakeLists.txt +++ b/src/kem/ml_kem/CMakeLists.txt @@ -23,6 +23,14 @@ if(OQS_ENABLE_KEM_ml_kem_512_avx2) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() +if(OQS_ENABLE_KEM_ml_kem_512_cupqc) + add_library(ml_kem_512_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu) + target_include_libraries(ml_kem_512_cupqc cupqc) + set_property(TARGET ml_kem_512_cupqc PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options(ml_kem_512_cupqc PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) + set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) +endif() + if(OQS_ENABLE_KEM_ml_kem_768) add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c pqcrystals-kyber-standard_ml-kem-768_ref/cbd.c pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.c pqcrystals-kyber-standard_ml-kem-768_ref/kem.c pqcrystals-kyber-standard_ml-kem-768_ref/ntt.c pqcrystals-kyber-standard_ml-kem-768_ref/poly.c pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.c pqcrystals-kyber-standard_ml-kem-768_ref/reduce.c pqcrystals-kyber-standard_ml-kem-768_ref/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-768_ref/verify.c) target_compile_options(ml_kem_768_ref PUBLIC -DKYBER_K=3) @@ -41,6 +49,14 @@ if(OQS_ENABLE_KEM_ml_kem_768_avx2) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() +if(OQS_ENABLE_KEM_ml_kem_768_cupqc) + add_library(ml_kem_768_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu) + target_include_libraries(ml_kem_768_cupqc cupqc) + set_property(TARGET ml_kem_768_cupqc PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options(ml_kem_768_cupqc PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) + set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) +endif() + if(OQS_ENABLE_KEM_ml_kem_1024) add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.c pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.c pqcrystals-kyber-standard_ml-kem-1024_ref/kem.c pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.c pqcrystals-kyber-standard_ml-kem-1024_ref/poly.c pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.c pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.c pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-1024_ref/verify.c) target_compile_options(ml_kem_1024_ref PUBLIC -DKYBER_K=4) @@ -59,4 +75,12 @@ if(OQS_ENABLE_KEM_ml_kem_1024_avx2) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() +if(OQS_ENABLE_KEM_ml_kem_1024_cupqc) + add_library(ml_kem_1024_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu) + target_include_libraries(ml_kem_1024_cupqc cupqc) + set_property(TARGET ml_kem_1024_cupqc PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options(ml_kem_1024_cupqc PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) + set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) +endif() + set(ML_KEM_OBJS ${_ML_KEM_OBJS} PARENT_SCOPE) diff --git a/src/kem/ml_kem/kem_ml_kem_1024.c b/src/kem/ml_kem/kem_ml_kem_1024.c index bc533aef9e..21f8c84ca4 100644 --- a/src/kem/ml_kem/kem_ml_kem_1024.c +++ b/src/kem/ml_kem/kem_ml_kem_1024.c @@ -40,7 +40,18 @@ extern int pqcrystals_ml_kem_1024_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8 extern int pqcrystals_ml_kem_1024_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif +#if defined(OQS_USE_CUPQC) +#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) +extern int ml_kem_1024_cupqc_keypair(uint8_t *pk, uint8_t *sk); +extern int ml_kem_1024_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int ml_kem_1024_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#endif +#endif /* OQS_USE_CUPQC */ + OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) + return (OQS_STATUS) ml_kem_1024_cupqc_keypair(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -57,6 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *sec } OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) + return (OQS_STATUS) ml_kem_1024_cupqc_enc(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -73,6 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shar } OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) + return (OQS_STATUS) ml_kem_1024_cupqc_dec(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { diff --git a/src/kem/ml_kem/kem_ml_kem_512.c b/src/kem/ml_kem/kem_ml_kem_512.c index f2dcde53d2..e3e05be15b 100644 --- a/src/kem/ml_kem/kem_ml_kem_512.c +++ b/src/kem/ml_kem/kem_ml_kem_512.c @@ -40,7 +40,18 @@ extern int pqcrystals_ml_kem_512_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_ extern int pqcrystals_ml_kem_512_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif +#if defined(OQS_USE_CUPQC) +#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) +extern int ml_kem_512_cupqc_keypair(uint8_t *pk, uint8_t *sk); +extern int ml_kem_512_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int ml_kem_512_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#endif +#endif /* OQS_USE_CUPQC */ + OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) + return (OQS_STATUS) ml_kem_512_cupqc_keypair(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -57,6 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secr } OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) + return (OQS_STATUS) ml_kem_512_cupqc_enc(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -73,6 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *share } OQS_API OQS_STATUS OQS_KEM_ml_kem_512_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) + return (OQS_STATUS) ml_kem_512_cupqc_dec(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { diff --git a/src/kem/ml_kem/kem_ml_kem_768.c b/src/kem/ml_kem/kem_ml_kem_768.c index 14eb6ba404..a8b0bfdd52 100644 --- a/src/kem/ml_kem/kem_ml_kem_768.c +++ b/src/kem/ml_kem/kem_ml_kem_768.c @@ -40,7 +40,18 @@ extern int pqcrystals_ml_kem_768_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_ extern int pqcrystals_ml_kem_768_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif +#if defined(OQS_USE_CUPQC) +#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) +extern int ml_kem_768_cupqc_keypair(uint8_t *pk, uint8_t *sk); +extern int ml_kem_768_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int ml_kem_768_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#endif +#endif /* OQS_USE_CUPQC */ + OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) + return (OQS_STATUS) ml_kem_768_cupqc_keypair(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -57,6 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secr } OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) + return (OQS_STATUS) ml_kem_768_cupqc_enc(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -73,6 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *share } OQS_API OQS_STATUS OQS_KEM_ml_kem_768_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) + return (OQS_STATUS) ml_kem_768_cupqc_dec(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu new file mode 100644 index 0000000000..2935c2cd6a --- /dev/null +++ b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu @@ -0,0 +1,198 @@ +/* + * Copyright 2025 Nvidia Corporation + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +**/ + +#include +#include +#include + +using namespace cupqc; + +// Checks the return value from a CUDA API function +#define CUDA_CHECK(err) \ + if (err != cudaSuccess) { failure = true; goto cleanup; } + +template +__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; + MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); +} + +template +int keypair(uint8_t *pk, uint8_t *sk) { + using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_pk = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); + + // Run routine + keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_pk != nullptr) cudaFree(d_pk); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; + MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); +} + +template +int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); + + // Run routine + encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_pk != nullptr) cudaFree(d_pk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { + __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; + MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); +} + +template +int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); + + // Run routine + decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + + return failure ? -1 : 0; +} + +extern "C" { + using KEM_512 = decltype(ML_KEM_512() + Block()); + using KEM_768 = decltype(ML_KEM_768() + Block()); + using KEM_1024 = decltype(ML_KEM_1024() + Block()); + +#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) + int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif + +#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) + int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif + +#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) + int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif +} diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu new file mode 100644 index 0000000000..2935c2cd6a --- /dev/null +++ b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu @@ -0,0 +1,198 @@ +/* + * Copyright 2025 Nvidia Corporation + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +**/ + +#include +#include +#include + +using namespace cupqc; + +// Checks the return value from a CUDA API function +#define CUDA_CHECK(err) \ + if (err != cudaSuccess) { failure = true; goto cleanup; } + +template +__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; + MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); +} + +template +int keypair(uint8_t *pk, uint8_t *sk) { + using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_pk = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); + + // Run routine + keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_pk != nullptr) cudaFree(d_pk); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; + MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); +} + +template +int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); + + // Run routine + encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_pk != nullptr) cudaFree(d_pk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { + __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; + MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); +} + +template +int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); + + // Run routine + decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + + return failure ? -1 : 0; +} + +extern "C" { + using KEM_512 = decltype(ML_KEM_512() + Block()); + using KEM_768 = decltype(ML_KEM_768() + Block()); + using KEM_1024 = decltype(ML_KEM_1024() + Block()); + +#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) + int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif + +#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) + int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif + +#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) + int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif +} diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu new file mode 100644 index 0000000000..2935c2cd6a --- /dev/null +++ b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu @@ -0,0 +1,198 @@ +/* + * Copyright 2025 Nvidia Corporation + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +**/ + +#include +#include +#include + +using namespace cupqc; + +// Checks the return value from a CUDA API function +#define CUDA_CHECK(err) \ + if (err != cudaSuccess) { failure = true; goto cleanup; } + +template +__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; + MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); +} + +template +int keypair(uint8_t *pk, uint8_t *sk) { + using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_pk = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); + + // Run routine + keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_pk != nullptr) cudaFree(d_pk); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; + MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); +} + +template +int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); + + // Run routine + encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_pk != nullptr) cudaFree(d_pk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { + __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; + MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); +} + +template +int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); + + // Run routine + decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + + return failure ? -1 : 0; +} + +extern "C" { + using KEM_512 = decltype(ML_KEM_512() + Block()); + using KEM_768 = decltype(ML_KEM_768() + Block()); + using KEM_1024 = decltype(ML_KEM_1024() + Block()); + +#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) + int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif + +#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) + int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif + +#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) + int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif +} diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake index 967c35e64e..875584f74a 100644 --- a/src/oqsconfig.h.cmake +++ b/src/oqsconfig.h.cmake @@ -69,6 +69,8 @@ #cmakedefine OQS_ENABLE_SHA3_xkcp_low_avx2 1 +#cmakedefine01 OQS_USE_CUPQC + #cmakedefine OQS_ENABLE_KEM_BIKE 1 #cmakedefine OQS_ENABLE_KEM_bike_l1 1 #cmakedefine OQS_ENABLE_KEM_bike_l3 1 @@ -129,10 +131,13 @@ #cmakedefine OQS_ENABLE_KEM_ML_KEM 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_512 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_512_avx2 1 +#cmakedefine OQS_ENABLE_KEM_ml_kem_512_cupqc 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_768 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_768_avx2 1 +#cmakedefine OQS_ENABLE_KEM_ml_kem_768_cupqc 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024_avx2 1 +#cmakedefine OQS_ENABLE_KEM_ml_kem_1024_cupqc 1 #cmakedefine OQS_ENABLE_SIG_DILITHIUM 1 #cmakedefine OQS_ENABLE_SIG_dilithium_2 1 From 2cb806739053d3de363a1ea7cda5e8b55d1f28c6 Mon Sep 17 00:00:00 2001 From: Steven Reeves Date: Wed, 15 Jan 2025 09:04:49 -0800 Subject: [PATCH 02/17] Fixing transposition error that left out OQS_USE_CUPQC in CMake system. Signed-off-by: Steven Reeves --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 09667e71ae..1b4c2b1af6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ option(OQS_LIBJADE_BUILD "Enable formally verified implementation of supported a option(OQS_PERMIT_UNSUPPORTED_ARCHITECTURE "Permit compilation on an an unsupported architecture." OFF) option(OQS_STRICT_WARNINGS "Enable all compiler warnings." OFF) option(OQS_EMBEDDED_BUILD "Compile liboqs for an Embedded environment without a full standard library." OFF) +option(OQS_USE_CUPQC "Utilize cuPQC as the backend for supported PQC algorithms." OFF) # Libfuzzer isn't supported on gcc if('${CMAKE_C_COMPILER_ID}' STREQUAL 'Clang') From 8c076753658109f79b19df908a7bbfa11f5543dd Mon Sep 17 00:00:00 2001 From: Steven Reeves Date: Wed, 15 Jan 2025 11:56:24 -0800 Subject: [PATCH 03/17] Add CMake dependent options for cupqc. Fixed formatting in kem_ml_kem_####.c and kem/family/kem_scheme.c Signed-off-by: Steven Reeves --- .CMake/alg_support.cmake | 18 ++++++++++++++++++ .../src/kem/family/kem_scheme.c | 6 +++--- src/kem/ml_kem/kem_ml_kem_1024.c | 6 +++--- src/kem/ml_kem/kem_ml_kem_512.c | 6 +++--- src/kem/ml_kem/kem_ml_kem_768.c | 6 +++--- 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake index 9afa6e4b15..de22dbceb9 100644 --- a/.CMake/alg_support.cmake +++ b/.CMake/alg_support.cmake @@ -338,18 +338,36 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCT endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if(OQS_USE_CUPQC) + cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_512_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_512" OFF) +endif() +endif() + if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS)) cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_avx2 "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF) endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if(OQS_USE_CUPQC) + cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF) +endif() +endif() + if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS)) cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_avx2 "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF) endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") +if(OQS_USE_CUPQC) + cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF) +endif() +endif() + if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux") if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS)) diff --git a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c index cf8a705031..95eafa703e 100644 --- a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c +++ b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c @@ -173,7 +173,7 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_keypair(uint8_t * {%- endif %} {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %} #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc) - return (OQS_STATUS) {{ impl['signature_keypair'] }}(public_key, secret_key); + return (OQS_STATUS) {{ impl['signature_keypair'] }}(public_key, secret_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */ {%- endfor %} {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %} @@ -252,7 +252,7 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_encaps(uint8_t *c {%- endif %} {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %} #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc) - return (OQS_STATUS) {{ impl['signature_enc'] }}(ciphertext, shared_secret, public_key); + return (OQS_STATUS) {{ impl['signature_enc'] }}(ciphertext, shared_secret, public_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */ {%- endfor %} {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %} @@ -331,7 +331,7 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_decaps(uint8_t *s {%- endif %} {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %} #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc) - return (OQS_STATUS) {{ impl['signature_dec'] }}(shared_secret, ciphertext, secret_key); + return (OQS_STATUS) {{ impl['signature_dec'] }}(shared_secret, ciphertext, secret_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */ {%- endfor %} {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %} diff --git a/src/kem/ml_kem/kem_ml_kem_1024.c b/src/kem/ml_kem/kem_ml_kem_1024.c index 21f8c84ca4..2d4d46df8b 100644 --- a/src/kem/ml_kem/kem_ml_kem_1024.c +++ b/src/kem/ml_kem/kem_ml_kem_1024.c @@ -50,7 +50,7 @@ extern int ml_kem_1024_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t * OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *secret_key) { #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - return (OQS_STATUS) ml_kem_1024_cupqc_keypair(public_key, secret_key); + return (OQS_STATUS) ml_kem_1024_cupqc_keypair(public_key, secret_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) @@ -69,7 +69,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *sec OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - return (OQS_STATUS) ml_kem_1024_cupqc_enc(ciphertext, shared_secret, public_key); + return (OQS_STATUS) ml_kem_1024_cupqc_enc(ciphertext, shared_secret, public_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) @@ -88,7 +88,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shar OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - return (OQS_STATUS) ml_kem_1024_cupqc_dec(shared_secret, ciphertext, secret_key); + return (OQS_STATUS) ml_kem_1024_cupqc_dec(shared_secret, ciphertext, secret_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) diff --git a/src/kem/ml_kem/kem_ml_kem_512.c b/src/kem/ml_kem/kem_ml_kem_512.c index e3e05be15b..5e60018b37 100644 --- a/src/kem/ml_kem/kem_ml_kem_512.c +++ b/src/kem/ml_kem/kem_ml_kem_512.c @@ -50,7 +50,7 @@ extern int ml_kem_512_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *s OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secret_key) { #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) - return (OQS_STATUS) ml_kem_512_cupqc_keypair(public_key, secret_key); + return (OQS_STATUS) ml_kem_512_cupqc_keypair(public_key, secret_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) @@ -69,7 +69,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secr OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) - return (OQS_STATUS) ml_kem_512_cupqc_enc(ciphertext, shared_secret, public_key); + return (OQS_STATUS) ml_kem_512_cupqc_enc(ciphertext, shared_secret, public_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) @@ -88,7 +88,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *share OQS_API OQS_STATUS OQS_KEM_ml_kem_512_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) - return (OQS_STATUS) ml_kem_512_cupqc_dec(shared_secret, ciphertext, secret_key); + return (OQS_STATUS) ml_kem_512_cupqc_dec(shared_secret, ciphertext, secret_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) diff --git a/src/kem/ml_kem/kem_ml_kem_768.c b/src/kem/ml_kem/kem_ml_kem_768.c index a8b0bfdd52..ceb651cedb 100644 --- a/src/kem/ml_kem/kem_ml_kem_768.c +++ b/src/kem/ml_kem/kem_ml_kem_768.c @@ -50,7 +50,7 @@ extern int ml_kem_768_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *s OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secret_key) { #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) - return (OQS_STATUS) ml_kem_768_cupqc_keypair(public_key, secret_key); + return (OQS_STATUS) ml_kem_768_cupqc_keypair(public_key, secret_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) @@ -69,7 +69,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secr OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) - return (OQS_STATUS) ml_kem_768_cupqc_enc(ciphertext, shared_secret, public_key); + return (OQS_STATUS) ml_kem_768_cupqc_enc(ciphertext, shared_secret, public_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) @@ -88,7 +88,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *share OQS_API OQS_STATUS OQS_KEM_ml_kem_768_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) - return (OQS_STATUS) ml_kem_768_cupqc_dec(shared_secret, ciphertext, secret_key); + return (OQS_STATUS) ml_kem_768_cupqc_dec(shared_secret, ciphertext, secret_key); #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) From 7562c855141f216519313f8973b6f0a590b7a16e Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Wed, 15 Jan 2025 23:42:54 +0100 Subject: [PATCH 04/17] Move cupqc_ml-kem source files to correctly named dir Signed-off-by: Pravek Sharma --- .../copy_from_upstream/copy_from_upstream.py | 17 +- .../patches/pqcrystals-ml_kem.patch | 216 +----------------- .../src/kem/family/CMakeLists.txt | 2 +- src/kem/ml_kem/CMakeLists.txt | 6 +- .../cupqc_ml-kem.cu | 0 .../cupqc_ml-kem.cu | 198 ---------------- .../cupqc_ml-kem.cu | 198 ---------------- 7 files changed, 19 insertions(+), 618 deletions(-) rename src/kem/ml_kem/{pqcrystals-kyber-standard_ml-kem-1024_cupqc => cupqc_ml-kem}/cupqc_ml-kem.cu (100%) delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py index 400ecc57a0..ccab976b7f 100755 --- a/scripts/copy_from_upstream/copy_from_upstream.py +++ b/scripts/copy_from_upstream/copy_from_upstream.py @@ -495,14 +495,15 @@ def handle_implementation(impl, family, scheme, dst_basedir): else: # determine list of files to copy: if 'sources' in i: - srcs = i['sources'].split(" ") - for s in srcs: - # Copy recursively only in case of directories not with plain files to avoid copying over symbolic links - if os.path.isfile(os.path.join(origfolder, s)): - subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) - else: - subprocess.run( - ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) + if i['sources']: + srcs = i['sources'].split(" ") + for s in srcs: + # Copy recursively only in case of directories not with plain files to avoid copying over symbolic links + if os.path.isfile(os.path.join(origfolder, s)): + subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) + else: + subprocess.run( + ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))]) else: subprocess.run(['cp', '-pr', os.path.join(origfolder, '.'), srcfolder]) # raise Exception("Malformed YML file: No sources listed to copy. Check upstream YML file." ) diff --git a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch index a44b1c232e..2e7068cd83 100644 --- a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch +++ b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch @@ -2,7 +2,7 @@ diff --git a/Kyber1024_META.yml b/ML-KEM-1024_META.yml similarity index 50% rename from Kyber1024_META.yml rename to ML-KEM-1024_META.yml -index baa5ca3..42f147e 100644 +index baa5ca3..98e0e5c 100644 --- a/Kyber1024_META.yml +++ b/ML-KEM-1024_META.yml @@ -1,4 +1,4 @@ @@ -65,7 +65,7 @@ index baa5ca3..42f147e 100644 + signature_keypair: ml_kem_1024_cupqc_keypair + signature_enc: ml_kem_1024_cupqc_enc + signature_dec: ml_kem_1024_cupqc_dec -+ sources: cupqc_ml-kem.cu ++ sources: + supported_platforms: + - architecture: CUDA + operating_systems: @@ -77,7 +77,7 @@ diff --git a/Kyber512_META.yml b/ML-KEM-512_META.yml similarity index 50% rename from Kyber512_META.yml rename to ML-KEM-512_META.yml -index b251701..a74f7ec 100644 +index b251701..0df6843 100644 --- a/Kyber512_META.yml +++ b/ML-KEM-512_META.yml @@ -1,4 +1,4 @@ @@ -140,7 +140,7 @@ index b251701..a74f7ec 100644 + signature_keypair: ml_kem_512_cupqc_keypair + signature_enc: ml_kem_512_cupqc_enc + signature_dec: ml_kem_512_cupqc_dec -+ sources: cupqc_ml-kem.cu ++ sources: + supported_platforms: + - architecture: CUDA + operating_systems: @@ -153,7 +153,7 @@ diff --git a/Kyber768_META.yml b/ML-KEM-768_META.yml similarity index 50% rename from Kyber768_META.yml rename to ML-KEM-768_META.yml -index 7a0cc3d..2c0e23f 100644 +index 7a0cc3d..f5f45f7 100644 --- a/Kyber768_META.yml +++ b/ML-KEM-768_META.yml @@ -1,4 +1,4 @@ @@ -216,7 +216,7 @@ index 7a0cc3d..2c0e23f 100644 + signature_keypair: ml_kem_768_cupqc_keypair + signature_enc: ml_kem_768_cupqc_enc + signature_dec: ml_kem_768_cupqc_dec -+ sources: cupqc_ml-kem.cu ++ sources: + supported_platforms: + - architecture: CUDA + operating_systems: @@ -411,210 +411,6 @@ index 627b891..e4941f7 100644 const uint8_t seed[KYBER_SYMBYTES], uint8_t x, uint8_t y); -diff --git a/cupqc/cupqc_ml-kem.cu b/cupqc/cupqc_ml-kem.cu -new file mode 100644 -index 0000000..2935c2c ---- /dev/null -+++ b/cupqc/cupqc_ml-kem.cu -@@ -0,0 +1,198 @@ -+/* -+ * Copyright 2025 Nvidia Corporation -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+**/ -+ -+#include -+#include -+#include -+ -+using namespace cupqc; -+ -+// Checks the return value from a CUDA API function -+#define CUDA_CHECK(err) \ -+ if (err != cudaSuccess) { failure = true; goto cleanup; } -+ -+template -+__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { -+ __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; -+ MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); -+} -+ -+template -+int keypair(uint8_t *pk, uint8_t *sk) { -+ using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); -+ -+ bool failure = false; -+ uint8_t *workspace = nullptr, *randombytes=nullptr; -+ uint8_t *d_pk = nullptr, *d_sk = nullptr; -+ -+ // Allocate device workspaces -+ try { -+ workspace = make_workspace(1); -+ randombytes = get_entropy(1); -+ } catch (const std::runtime_error& ex) { -+ failure = true; -+ goto cleanup; -+ } -+ CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); -+ CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); -+ -+ // Run routine -+ keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); -+ -+ // Copy data back to the host -+ CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); -+ CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); -+ -+cleanup: -+ // Free device memory -+ if (d_pk != nullptr) cudaFree(d_pk); -+ if (d_sk != nullptr) cudaFree(d_sk); -+ if (workspace != nullptr) destroy_workspace(workspace); -+ if (randombytes != nullptr) release_entropy(randombytes); -+ -+ return failure ? -1 : 0; -+} -+ -+template -+__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { -+ __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; -+ MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); -+} -+ -+template -+int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { -+ using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); -+ -+ bool failure = false; -+ uint8_t *workspace = nullptr, *randombytes=nullptr; -+ uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; -+ -+ // Allocate device workspaces -+ try { -+ workspace = make_workspace(1); -+ randombytes = get_entropy(1); -+ } catch (const std::runtime_error& ex) { -+ failure = true; -+ goto cleanup; -+ } -+ CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); -+ CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); -+ CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); -+ -+ // Copy data to GPU -+ CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); -+ -+ // Run routine -+ encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); -+ -+ // Copy data back to the host -+ CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); -+ CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); -+ -+cleanup: -+ // Free device memory -+ if (d_ct != nullptr) cudaFree(d_ct); -+ if (d_ss != nullptr) cudaFree(d_ss); -+ if (d_pk != nullptr) cudaFree(d_pk); -+ if (workspace != nullptr) destroy_workspace(workspace); -+ if (randombytes != nullptr) release_entropy(randombytes); -+ -+ return failure ? -1 : 0; -+} -+ -+template -+__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { -+ __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; -+ MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); -+} -+ -+template -+int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { -+ using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); -+ -+ bool failure = false; -+ uint8_t *workspace = nullptr; -+ uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; -+ -+ // Allocate device workspaces -+ try { -+ workspace = make_workspace(1); -+ } catch (const std::runtime_error& ex) { -+ failure = true; -+ goto cleanup; -+ } -+ CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); -+ CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); -+ CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); -+ -+ // Copy data to GPU -+ CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); -+ CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); -+ -+ // Run routine -+ decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); -+ -+ // Copy data back to the host -+ CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); -+ -+cleanup: -+ // Free device memory -+ if (d_ct != nullptr) cudaFree(d_ct); -+ if (d_ss != nullptr) cudaFree(d_ss); -+ if (d_sk != nullptr) cudaFree(d_sk); -+ if (workspace != nullptr) destroy_workspace(workspace); -+ -+ return failure ? -1 : 0; -+} -+ -+extern "C" { -+ using KEM_512 = decltype(ML_KEM_512() + Block()); -+ using KEM_768 = decltype(ML_KEM_768() + Block()); -+ using KEM_1024 = decltype(ML_KEM_1024() + Block()); -+ -+#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) -+ int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { -+ return keypair(pk, sk); -+ } -+ int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { -+ return encaps(ct, ss, pk); -+ } -+ int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { -+ return decaps(ss, ct, sk); -+ } -+#endif -+ -+#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) -+ int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { -+ return keypair(pk, sk); -+ } -+ int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { -+ return encaps(ct, ss, pk); -+ } -+ int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { -+ return decaps(ss, ct, sk); -+ } -+#endif -+ -+#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) -+ int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { -+ return keypair(pk, sk); -+ } -+ int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { -+ return encaps(ct, ss, pk); -+ } -+ int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { -+ return decaps(ss, ct, sk); -+ } -+#endif -+} diff --git a/ref/indcpa.c b/ref/indcpa.c index 9a78c09..726cfa9 100644 --- a/ref/indcpa.c diff --git a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt index 100ddaab7e..72c6c71d2b 100644 --- a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt +++ b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt @@ -36,7 +36,7 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}{%- if 'alias_scheme' in {%- elif impl['name'] == 'cupqc' %} if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %}) - add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/cupqc_ml-kem.cu) + add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT cupqc_{{ family.replace('_', '-') }}/cupqc_{{ family.replace('_', '-') }}.cu) target_include_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc) set_property(TARGET {{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PROPERTY CUDA_ARCHITECTURES OFF) target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE {{ impl['compile_opts'] }}) diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt index 50d5537d1b..9a6a935ac3 100644 --- a/src/kem/ml_kem/CMakeLists.txt +++ b/src/kem/ml_kem/CMakeLists.txt @@ -24,7 +24,7 @@ if(OQS_ENABLE_KEM_ml_kem_512_avx2) endif() if(OQS_ENABLE_KEM_ml_kem_512_cupqc) - add_library(ml_kem_512_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu) + add_library(ml_kem_512_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu) target_include_libraries(ml_kem_512_cupqc cupqc) set_property(TARGET ml_kem_512_cupqc PROPERTY CUDA_ARCHITECTURES OFF) target_compile_options(ml_kem_512_cupqc PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) @@ -50,7 +50,7 @@ if(OQS_ENABLE_KEM_ml_kem_768_avx2) endif() if(OQS_ENABLE_KEM_ml_kem_768_cupqc) - add_library(ml_kem_768_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu) + add_library(ml_kem_768_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu) target_include_libraries(ml_kem_768_cupqc cupqc) set_property(TARGET ml_kem_768_cupqc PROPERTY CUDA_ARCHITECTURES OFF) target_compile_options(ml_kem_768_cupqc PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) @@ -76,7 +76,7 @@ if(OQS_ENABLE_KEM_ml_kem_1024_avx2) endif() if(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - add_library(ml_kem_1024_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu) + add_library(ml_kem_1024_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu) target_include_libraries(ml_kem_1024_cupqc cupqc) set_property(TARGET ml_kem_1024_cupqc PROPERTY CUDA_ARCHITECTURES OFF) target_compile_options(ml_kem_1024_cupqc PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem/cupqc_ml-kem.cu similarity index 100% rename from src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu rename to src/kem/ml_kem/cupqc_ml-kem/cupqc_ml-kem.cu diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu deleted file mode 100644 index 2935c2cd6a..0000000000 --- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright 2025 Nvidia Corporation - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -**/ - -#include -#include -#include - -using namespace cupqc; - -// Checks the return value from a CUDA API function -#define CUDA_CHECK(err) \ - if (err != cudaSuccess) { failure = true; goto cleanup; } - -template -__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { - __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; - MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); -} - -template -int keypair(uint8_t *pk, uint8_t *sk) { - using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); - - bool failure = false; - uint8_t *workspace = nullptr, *randombytes=nullptr; - uint8_t *d_pk = nullptr, *d_sk = nullptr; - - // Allocate device workspaces - try { - workspace = make_workspace(1); - randombytes = get_entropy(1); - } catch (const std::runtime_error& ex) { - failure = true; - goto cleanup; - } - CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); - CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); - - // Run routine - keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); - - // Copy data back to the host - CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); - CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); - -cleanup: - // Free device memory - if (d_pk != nullptr) cudaFree(d_pk); - if (d_sk != nullptr) cudaFree(d_sk); - if (workspace != nullptr) destroy_workspace(workspace); - if (randombytes != nullptr) release_entropy(randombytes); - - return failure ? -1 : 0; -} - -template -__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { - __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; - MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); -} - -template -int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); - - bool failure = false; - uint8_t *workspace = nullptr, *randombytes=nullptr; - uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; - - // Allocate device workspaces - try { - workspace = make_workspace(1); - randombytes = get_entropy(1); - } catch (const std::runtime_error& ex) { - failure = true; - goto cleanup; - } - CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); - CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); - CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); - - // Copy data to GPU - CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); - - // Run routine - encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); - - // Copy data back to the host - CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); - CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); - -cleanup: - // Free device memory - if (d_ct != nullptr) cudaFree(d_ct); - if (d_ss != nullptr) cudaFree(d_ss); - if (d_pk != nullptr) cudaFree(d_pk); - if (workspace != nullptr) destroy_workspace(workspace); - if (randombytes != nullptr) release_entropy(randombytes); - - return failure ? -1 : 0; -} - -template -__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { - __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; - MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); -} - -template -int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); - - bool failure = false; - uint8_t *workspace = nullptr; - uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; - - // Allocate device workspaces - try { - workspace = make_workspace(1); - } catch (const std::runtime_error& ex) { - failure = true; - goto cleanup; - } - CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); - CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); - CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); - - // Copy data to GPU - CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); - CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); - - // Run routine - decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); - - // Copy data back to the host - CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); - -cleanup: - // Free device memory - if (d_ct != nullptr) cudaFree(d_ct); - if (d_ss != nullptr) cudaFree(d_ss); - if (d_sk != nullptr) cudaFree(d_sk); - if (workspace != nullptr) destroy_workspace(workspace); - - return failure ? -1 : 0; -} - -extern "C" { - using KEM_512 = decltype(ML_KEM_512() + Block()); - using KEM_768 = decltype(ML_KEM_768() + Block()); - using KEM_1024 = decltype(ML_KEM_1024() + Block()); - -#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) - int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif - -#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) - int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif - -#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif -} diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu deleted file mode 100644 index 2935c2cd6a..0000000000 --- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright 2025 Nvidia Corporation - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * http://www.apache.org/licenses/LICENSE-2.0 - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -**/ - -#include -#include -#include - -using namespace cupqc; - -// Checks the return value from a CUDA API function -#define CUDA_CHECK(err) \ - if (err != cudaSuccess) { failure = true; goto cleanup; } - -template -__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { - __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; - MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); -} - -template -int keypair(uint8_t *pk, uint8_t *sk) { - using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); - - bool failure = false; - uint8_t *workspace = nullptr, *randombytes=nullptr; - uint8_t *d_pk = nullptr, *d_sk = nullptr; - - // Allocate device workspaces - try { - workspace = make_workspace(1); - randombytes = get_entropy(1); - } catch (const std::runtime_error& ex) { - failure = true; - goto cleanup; - } - CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); - CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); - - // Run routine - keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); - - // Copy data back to the host - CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); - CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); - -cleanup: - // Free device memory - if (d_pk != nullptr) cudaFree(d_pk); - if (d_sk != nullptr) cudaFree(d_sk); - if (workspace != nullptr) destroy_workspace(workspace); - if (randombytes != nullptr) release_entropy(randombytes); - - return failure ? -1 : 0; -} - -template -__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { - __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; - MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); -} - -template -int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); - - bool failure = false; - uint8_t *workspace = nullptr, *randombytes=nullptr; - uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; - - // Allocate device workspaces - try { - workspace = make_workspace(1); - randombytes = get_entropy(1); - } catch (const std::runtime_error& ex) { - failure = true; - goto cleanup; - } - CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); - CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); - CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); - - // Copy data to GPU - CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); - - // Run routine - encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); - - // Copy data back to the host - CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); - CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); - -cleanup: - // Free device memory - if (d_ct != nullptr) cudaFree(d_ct); - if (d_ss != nullptr) cudaFree(d_ss); - if (d_pk != nullptr) cudaFree(d_pk); - if (workspace != nullptr) destroy_workspace(workspace); - if (randombytes != nullptr) release_entropy(randombytes); - - return failure ? -1 : 0; -} - -template -__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { - __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; - MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); -} - -template -int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); - - bool failure = false; - uint8_t *workspace = nullptr; - uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; - - // Allocate device workspaces - try { - workspace = make_workspace(1); - } catch (const std::runtime_error& ex) { - failure = true; - goto cleanup; - } - CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); - CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); - CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); - - // Copy data to GPU - CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); - CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); - - // Run routine - decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); - - // Copy data back to the host - CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); - -cleanup: - // Free device memory - if (d_ct != nullptr) cudaFree(d_ct); - if (d_ss != nullptr) cudaFree(d_ss); - if (d_sk != nullptr) cudaFree(d_sk); - if (workspace != nullptr) destroy_workspace(workspace); - - return failure ? -1 : 0; -} - -extern "C" { - using KEM_512 = decltype(ML_KEM_512() + Block()); - using KEM_768 = decltype(ML_KEM_768() + Block()); - using KEM_1024 = decltype(ML_KEM_1024() + Block()); - -#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) - int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif - -#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) - int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif - -#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif -} From 73ba48f8e5864bdceebb98f89d0fc93353cb4eee Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Fri, 17 Jan 2025 01:21:43 +0100 Subject: [PATCH 05/17] Stop piggybacking on pqcrystals-kyber-standard and move cupqc_ml-kem metadata to separate upstream repo Signed-off-by: Pravek Sharma --- .CMake/alg_support.cmake | 6 +- docs/algorithms/kem/ml_kem.md | 7 + docs/algorithms/kem/ml_kem.yml | 34 +++ .../copy_from_upstream/copy_from_upstream.py | 17 +- .../copy_from_upstream/copy_from_upstream.yml | 12 ++ .../patches/pqcrystals-ml_kem.patch | 67 +----- .../src/kem/family/CMakeLists.txt | 6 +- .../src/kem/family/kem_scheme.c | 28 +-- .../update_upstream_alg_docs.py | 78 ++++--- src/kem/ml_kem/CMakeLists.txt | 36 ++-- .../cupqc_ml-kem.cu | 0 .../cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu | 198 ++++++++++++++++++ .../cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu | 198 ++++++++++++++++++ src/kem/ml_kem/kem_ml_kem_1024.c | 26 +-- src/kem/ml_kem/kem_ml_kem_512.c | 26 +-- src/kem/ml_kem/kem_ml_kem_768.c | 26 +-- src/oqsconfig.h.cmake | 6 +- 17 files changed, 581 insertions(+), 190 deletions(-) rename src/kem/ml_kem/{cupqc_ml-kem => cupqc_ml-kem-1024_cuda}/cupqc_ml-kem.cu (100%) create mode 100644 src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu create mode 100644 src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake index de22dbceb9..96677676ed 100644 --- a/.CMake/alg_support.cmake +++ b/.CMake/alg_support.cmake @@ -340,7 +340,7 @@ endif() if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if(OQS_USE_CUPQC) - cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_512_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_512" OFF) + cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_512_cuda "" ON "OQS_ENABLE_KEM_ml_kem_512" OFF) endif() endif() @@ -352,7 +352,7 @@ endif() if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if(OQS_USE_CUPQC) - cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF) + cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_cuda "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF) endif() endif() @@ -364,7 +364,7 @@ endif() if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") if(OQS_USE_CUPQC) - cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF) + cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_cuda "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF) endif() endif() diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md index d1806517ba..2d65aacff2 100644 --- a/docs/algorithms/kem/ml_kem.md +++ b/docs/algorithms/kem/ml_kem.md @@ -9,6 +9,10 @@ - **Primary Source**: - **Source**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0 +- **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches + - **cupqc-cuda**: + - **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30 + - **Implementation license (SPDX-Identifier)**: https://docs.nvidia.com/cuda/cupqc/license.html ## Parameter set summary @@ -25,6 +29,7 @@ |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------| | [Primary Source](#primary-source) | ref | All | All | None | True | True | False | | [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False | +| [cupqc-cuda](#cupqc-cuda) | cuda | CUDA | Linux,Darwin | None | False | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. @@ -36,6 +41,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**. |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| | [Primary Source](#primary-source) | ref | All | All | None | True | True | False | | [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False | +| [cupqc-cuda](#cupqc-cuda) | cuda | CUDA | Linux,Darwin | None | False | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. @@ -45,6 +51,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**. |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| | [Primary Source](#primary-source) | ref | All | All | None | True | True | False | | [Primary Source](#primary-source) | avx2 | x86\_64 | Linux,Darwin | AVX2,BMI2,POPCNT | True | True | False | +| [cupqc-cuda](#cupqc-cuda) | cuda | CUDA | Linux,Darwin | None | False | False | False | Are implementations chosen based on runtime CPU feature detection? **Yes**. diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml index 81ef2b6c4a..529e4c870c 100644 --- a/docs/algorithms/kem/ml_kem.yml +++ b/docs/algorithms/kem/ml_kem.yml @@ -20,6 +20,10 @@ primary-upstream: source: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches spdx-license-identifier: CC0-1.0 or Apache-2.0 +optimized-upstreams: + cupqc-cuda: + source: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30 + spdx-license-identifier: https://docs.nvidia.com/cuda/cupqc/license.html parameter-sets: - name: ML-KEM-512 claimed-nist-level: 1 @@ -54,6 +58,16 @@ parameter-sets: no-secret-dependent-branching-claimed: true no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false + - upstream: cupqc-cuda + upstream-id: cuda + supported-platforms: + - architecture: CUDA + operating_systems: + - Linux + - Darwin + no-secret-dependent-branching-claimed: false + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false - name: ML-KEM-768 claimed-nist-level: 3 claimed-security: IND-CCA2 @@ -87,6 +101,16 @@ parameter-sets: no-secret-dependent-branching-claimed: true no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false + - upstream: cupqc-cuda + upstream-id: cuda + supported-platforms: + - architecture: CUDA + operating_systems: + - Linux + - Darwin + no-secret-dependent-branching-claimed: false + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false - name: ML-KEM-1024 claimed-nist-level: 5 claimed-security: IND-CCA2 @@ -120,3 +144,13 @@ parameter-sets: no-secret-dependent-branching-claimed: true no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false + - upstream: cupqc-cuda + upstream-id: cuda + supported-platforms: + - architecture: CUDA + operating_systems: + - Linux + - Darwin + no-secret-dependent-branching-claimed: false + no-secret-dependent-branching-checked-by-valgrind: false + large-stack-usage: false diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py index ccab976b7f..46968fa33c 100755 --- a/scripts/copy_from_upstream/copy_from_upstream.py +++ b/scripts/copy_from_upstream/copy_from_upstream.py @@ -599,14 +599,15 @@ def process_families(instructions, basedir, with_kat, with_generator, with_libja # when provided to the compiler; OQS uses the term ARM_NEON if req['architecture'] == 'arm_8': req['architecture'] = 'ARM64_V8' - if req['architecture'] == 'ARM64_V8' and 'asimd' in req['required_flags']: - req['required_flags'].remove('asimd') - req['required_flags'].append('arm_neon') - if req['architecture'] == 'ARM64_V8' and 'sha3' in req['required_flags']: - req['required_flags'].remove('sha3') - req['required_flags'].append('arm_sha3') - impl['required_flags'] = req['required_flags'] - family['all_required_flags'].update(req['required_flags']) + if 'required_flags' in req: + if req['architecture'] == 'ARM64_V8' and 'asimd' in req['required_flags']: + req['required_flags'].remove('asimd') + req['required_flags'].append('arm_neon') + if req['architecture'] == 'ARM64_V8' and 'sha3' in req['required_flags']: + req['required_flags'].remove('sha3') + req['required_flags'].append('arm_sha3') + impl['required_flags'] = req['required_flags'] + family['all_required_flags'].update(req['required_flags']) except KeyError as ke: if (impl['name'] != family['default_implementation']): print("No required flags found for %s (KeyError %s on impl %s)" % ( diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index f80f0979d5..b2f74f566e 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -38,6 +38,14 @@ upstreams: kem_meta_path: '{pretty_name_full}_META.yml' kem_scheme_path: '.' patches: [pqcrystals-ml_kem.patch] + - + name: cupqc + git_url: https://github.com/praveksharma/cupqc-mlkem.git + git_branch: main + git_commit: adb8454e56979628c07b67eb7d90f9337be6dc30 + kem_meta_path: '{pretty_name_full}_META.yml' + kem_scheme_path: '.' + patches: [] - name: pqcrystals-dilithium git_url: https://github.com/pq-crystals/dilithium.git @@ -166,6 +174,10 @@ kems: - name: ml_kem default_implementation: ref + arch_specific_implementations: + cuda: cuda + arch_specific_upstream_locations: + cuda: cupqc upstream_location: pqcrystals-kyber-standard schemes: - diff --git a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch index 2e7068cd83..d65eea2f30 100644 --- a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch +++ b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch @@ -1,8 +1,8 @@ diff --git a/Kyber1024_META.yml b/ML-KEM-1024_META.yml -similarity index 50% +similarity index 55% rename from Kyber1024_META.yml rename to ML-KEM-1024_META.yml -index baa5ca3..98e0e5c 100644 +index baa5ca3..67243b8 100644 --- a/Kyber1024_META.yml +++ b/ML-KEM-1024_META.yml @@ -1,4 +1,4 @@ @@ -55,29 +55,11 @@ index baa5ca3..98e0e5c 100644 supported_platforms: - architecture: x86_64 operating_systems: -@@ -47,3 +45,17 @@ implementations: - - avx2 - - bmi2 - - popcnt -+ - name: cupqc -+ version: FIPS203 -+ compile_opts: $<$:-rdc=true -dlto -arch=compute_70> -+ signature_keypair: ml_kem_1024_cupqc_keypair -+ signature_enc: ml_kem_1024_cupqc_enc -+ signature_dec: ml_kem_1024_cupqc_dec -+ sources: -+ supported_platforms: -+ - architecture: CUDA -+ operating_systems: -+ - Linux -+ - Darwin -+ required_flags: -+ - dummy diff --git a/Kyber512_META.yml b/ML-KEM-512_META.yml -similarity index 50% +similarity index 55% rename from Kyber512_META.yml rename to ML-KEM-512_META.yml -index b251701..0df6843 100644 +index b251701..18c28b0 100644 --- a/Kyber512_META.yml +++ b/ML-KEM-512_META.yml @@ -1,4 +1,4 @@ @@ -130,30 +112,11 @@ index b251701..0df6843 100644 supported_platforms: - architecture: x86_64 operating_systems: -@@ -47,3 +45,17 @@ implementations: - - avx2 - - bmi2 - - popcnt -+ - name: cupqc -+ version: FIPS203 -+ compile_opts: $<$:-rdc=true -dlto -arch=compute_70> -+ signature_keypair: ml_kem_512_cupqc_keypair -+ signature_enc: ml_kem_512_cupqc_enc -+ signature_dec: ml_kem_512_cupqc_dec -+ sources: -+ supported_platforms: -+ - architecture: CUDA -+ operating_systems: -+ - Linux -+ - Darwin -+ required_flags: -+ - dummy -\ No newline at end of file diff --git a/Kyber768_META.yml b/ML-KEM-768_META.yml -similarity index 50% +similarity index 55% rename from Kyber768_META.yml rename to ML-KEM-768_META.yml -index 7a0cc3d..f5f45f7 100644 +index 7a0cc3d..ccc03c9 100644 --- a/Kyber768_META.yml +++ b/ML-KEM-768_META.yml @@ -1,4 +1,4 @@ @@ -206,24 +169,6 @@ index 7a0cc3d..f5f45f7 100644 supported_platforms: - architecture: x86_64 operating_systems: -@@ -47,3 +45,17 @@ implementations: - - avx2 - - bmi2 - - popcnt -+ - name: cupqc -+ version: FIPS203 -+ compile_opts: $<$:-rdc=true -dlto -arch=compute_70> -+ signature_keypair: ml_kem_768_cupqc_keypair -+ signature_enc: ml_kem_768_cupqc_enc -+ signature_dec: ml_kem_768_cupqc_dec -+ sources: -+ supported_platforms: -+ - architecture: CUDA -+ operating_systems: -+ - Linux -+ - Darwin -+ required_flags: -+ - dummy diff --git a/avx2/indcpa.c b/avx2/indcpa.c index 18b9d08..c4b2b3a 100644 --- a/avx2/indcpa.c diff --git a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt index 72c6c71d2b..004408aa3d 100644 --- a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt +++ b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt @@ -33,10 +33,10 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}{%- if 'alias_scheme' in target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PUBLIC {{ impl['compile_opts'] }}) {%- endif -%} - {%- elif impl['name'] == 'cupqc' %} + {%- elif impl['name'] == 'cuda' %} if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %}) - add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT cupqc_{{ family.replace('_', '-') }}/cupqc_{{ family.replace('_', '-') }}.cu) + add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/cupqc_ml-kem.cu) target_include_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc) set_property(TARGET {{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PROPERTY CUDA_ARCHITECTURES OFF) target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE {{ impl['compile_opts'] }}) @@ -45,7 +45,7 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %}) add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {% for source_file in impl['sources']|sort -%}{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/{{ source_file }}{%- if not loop.last %} {% endif -%}{%- endfor -%}) {%- endif %} - {%- if impl['name'] != 'cupqc' %} + {%- if impl['name'] != 'cuda' %} target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}) target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) {%- if impl['name'] != scheme['default_implementation'] and impl['required_flags'] -%} diff --git a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c index 95eafa703e..630aee1389 100644 --- a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c +++ b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c @@ -93,7 +93,7 @@ extern int {{ scheme['metadata']['default_dec_signature'] }}(uint8_t *ss, const {%- endfor %} {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %} -{% if impl['name'] == 'cupqc'%} +{% if impl['name'] == 'cuda'%} #if defined(OQS_USE_CUPQC) {%- endif %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} @@ -115,7 +115,7 @@ extern int {{ impl['signature_dec'] }}(uint8_t *ss, const uint8_t *ct, const uin extern int PQCLEAN_{{ scheme['pqclean_scheme_c']|upper }}_{{ impl['name']|upper }}_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); {%- endif %} #endif - {%- if impl['name'] == 'cupqc'%} + {%- if impl['name'] == 'cuda'%} #endif /* OQS_USE_CUPQC */ {%- endif %} {%- endfor %} @@ -171,12 +171,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_keypair(uint8_t * {% endfor -%} #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/ {%- endif %} - {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %} -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc) + {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cuda' %} +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) return (OQS_STATUS) {{ impl['signature_keypair'] }}(public_key, secret_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */ +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} */ {%- endfor %} - {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %} + {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cuda') %} {%- if loop.first %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- else %} @@ -250,12 +250,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_encaps(uint8_t *c {% endfor -%} #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/ {%- endif %} - {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %} -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc) + {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cuda' %} +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) return (OQS_STATUS) {{ impl['signature_enc'] }}(ciphertext, shared_secret, public_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */ +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} */ {%- endfor %} - {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %} + {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cuda') %} {%- if loop.first %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- else %} @@ -329,12 +329,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_decaps(uint8_t *s {% endfor -%} #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/ {%- endif %} - {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %} -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc) + {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cuda' %} +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) return (OQS_STATUS) {{ impl['signature_dec'] }}(shared_secret, ciphertext, secret_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */ +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} */ {%- endfor %} - {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %} + {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cuda') %} {%- if loop.first %} #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %} {%- else %} diff --git a/scripts/copy_from_upstream/update_upstream_alg_docs.py b/scripts/copy_from_upstream/update_upstream_alg_docs.py index 33483067e2..ba765b84c1 100755 --- a/scripts/copy_from_upstream/update_upstream_alg_docs.py +++ b/scripts/copy_from_upstream/update_upstream_alg_docs.py @@ -95,8 +95,43 @@ def update_upstream_kem_alg_docs(liboqs_root, kems, upstream_info, write_changes oqs_yaml_path = os.path.join(liboqs_root, 'docs', 'algorithms', 'kem', '{}.yml'.format(kem['name'])) if os.path.isfile(oqs_yaml_path): oqs_yaml = load_yaml(oqs_yaml_path) + + upstream_base_url = ui['git_url'][:-len(".git")] + # upstream is special: We will take the upstream git commit information + # (possibly with added patch comment) as it is what drove the update + + # Need to check if yml is of old format. If so, update to new format + if 'primary-upstream' not in oqs_yaml: + print("Updating format of {}. Please double check ordering of yaml file".format(scheme['pretty_name_full'])) + lhs = oqs_yaml['upstream'] + oqs_yaml['primary-upstream'] = dict() + oqs_yaml['primary-upstream']['spdx-license-identifier'] = oqs_yaml['spdx-license-identifier'] + for i in range(len(oqs_yaml['parameter-sets'])): + for j in range(len(oqs_yaml['parameter-sets'][i]['implementations'])): + oqs_yaml['parameter-sets'][i]['implementations'][j]['upstream'] = 'primary-upstream' else: - continue + lhs = oqs_yaml['primary-upstream']['source'] + oqs_yaml['primary-upstream']['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+patches_done).format(upstream_base_url, ui['git_commit']), "primary-upstream") + if 'upstream' in oqs_yaml: + del oqs_yaml['upstream'] + del oqs_yaml['spdx-license-identifier'] + + if ouis: + for upstream in ouis: + optimized_upstream_base_url = ouis[upstream]['git_url'][:-len(".git")] + optimized_patches_done="" + if 'patches' in ouis[upstream]: + for patchfilename in ouis[upstream]['patches']: + if kem['name'] in patchfilename: + optimized_patches_done=" with copy_from_upstream patches" + if 'optimized-upstreams' in oqs_yaml and upstream in oqs_yaml['optimized-upstreams']: + lhs = oqs_yaml['optimized-upstreams'][upstream]['source'] + else: + lhs = '' + oqs_yaml['optimized-upstreams'] = oqs_yaml.get('optimized-upstreams', dict()) + oqs_yaml['optimized-upstreams'][upstream] = oqs_yaml['optimized-upstreams'].get(upstream, dict()) + git_commit = ouis[upstream]['git_commit'] + oqs_yaml['optimized-upstreams'][upstream]['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+optimized_patches_done).format(optimized_upstream_base_url, git_commit), "optimized-upstreams") # We cannot assume that the ordering of "parameter-sets" # in the OQS YAML files matches that of copy_from_upstream.yml @@ -111,45 +146,6 @@ def update_upstream_kem_alg_docs(liboqs_root, kems, upstream_info, write_changes oqs_yaml['type'] = rhs_if_not_equal(oqs_yaml['type'], upstream_yaml['type'], "type") oqs_yaml['principal-submitters'] = rhs_if_not_equal(oqs_yaml['principal-submitters'], upstream_yaml['principal-submitters'], "principal-submitters") - upstream_base_url = ui['git_url'][:-len(".git")] - # upstream is special: We will take the upstream git commit information - # (possibly with added patch comment) as it is what drove the update - - # Need to check if yml is of old format. If so, update to new format - if 'primary-upstream' not in oqs_yaml: - print("Updating format of {}. Please double check ordering of yaml file".format(scheme['pretty_name_full'])) - lhs = oqs_yaml['upstream'] - oqs_yaml['primary-upstream'] = dict() - oqs_yaml['primary-upstream']['spdx-license-identifier'] = oqs_yaml['spdx-license-identifier'] - for i in range(len(oqs_yaml['parameter-sets'])): - for j in range(len(oqs_yaml['parameter-sets'][i]['implementations'])): - oqs_yaml['parameter-sets'][i]['implementations'][j]['upstream'] = 'primary-upstream' - else: - lhs = oqs_yaml['primary-upstream']['source'] - oqs_yaml['primary-upstream']['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+patches_done).format(upstream_base_url, ui['git_commit']), "primary-upstream") - if 'upstream' in oqs_yaml: - del oqs_yaml['upstream'] - del oqs_yaml['spdx-license-identifier'] - - if ouis: - for upstream in ouis: - optimized_upstream_base_url = ouis[upstream]['git_url'][:-len(".git")] - for patchfilename in ouis[upstream]['patches']: - if kem['name'] in patchfilename: - patches_done=" with copy_from_upstream patches" - patches_done="" - if 'patches' in ouis[upstream]: - for patchfilename in ouis[upstream]['patches']: - if kem['name'] in patchfilename: - patches_done=" with copy_from_upstream patches" - if 'optimized-upstreams' in oqs_yaml and upstream in oqs_yaml['optimized-upstreams']: - lhs = oqs_yaml['optimized-upstreams'][upstream]['source'] - else: - lhs = '' - git_commit = ouis[upstream]['git_commit'] - oqs_yaml['optimized-upstreams'][upstream]['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+patches_done).format(optimized_upstream_base_url, git_commit), "optimized-upstreams") - - if 'auxiliary-submitters' in upstream_yaml: oqs_yaml['auxiliary-submitters'] = rhs_if_not_equal(oqs_yaml['auxiliary-submitters'] if 'auxiliary-submitters' in oqs_yaml else '', upstream_yaml['auxiliary-submitters'], "auxiliary-submitters") @@ -204,7 +200,7 @@ def update_upstream_kem_alg_docs(liboqs_root, kems, upstream_info, write_changes upstream_impl['supported_platforms'][i]['architecture'] = 'ARM64_V8' if 'asimd' in upstream_impl['supported_platforms'][i]['required_flags']: upstream_impl['supported_platforms'][i]['required_flags'].remove('asimd') - if not upstream_impl['supported_platforms'][i]['required_flags']: + if 'required_flags' in upstream_impl['supported_platforms'][i] and not upstream_impl['supported_platforms'][i]['required_flags']: del upstream_impl['supported_platforms'][i]['required_flags'] impl['supported-platforms'] = rhs_if_not_equal(impl['supported-platforms'], upstream_impl['supported_platforms'], "supported-platforms") diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt index 9a6a935ac3..76b53851b1 100644 --- a/src/kem/ml_kem/CMakeLists.txt +++ b/src/kem/ml_kem/CMakeLists.txt @@ -23,12 +23,12 @@ if(OQS_ENABLE_KEM_ml_kem_512_avx2) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() -if(OQS_ENABLE_KEM_ml_kem_512_cupqc) - add_library(ml_kem_512_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu) - target_include_libraries(ml_kem_512_cupqc cupqc) - set_property(TARGET ml_kem_512_cupqc PROPERTY CUDA_ARCHITECTURES OFF) - target_compile_options(ml_kem_512_cupqc PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) - set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) +if(OQS_ENABLE_KEM_ml_kem_512_cuda) + add_library(ml_kem_512_cuda OBJECT cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu) + target_include_libraries(ml_kem_512_cuda cupqc) + set_property(TARGET ml_kem_512_cuda PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options(ml_kem_512_cuda PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) + set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() if(OQS_ENABLE_KEM_ml_kem_768) @@ -49,12 +49,12 @@ if(OQS_ENABLE_KEM_ml_kem_768_avx2) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() -if(OQS_ENABLE_KEM_ml_kem_768_cupqc) - add_library(ml_kem_768_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu) - target_include_libraries(ml_kem_768_cupqc cupqc) - set_property(TARGET ml_kem_768_cupqc PROPERTY CUDA_ARCHITECTURES OFF) - target_compile_options(ml_kem_768_cupqc PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) - set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) +if(OQS_ENABLE_KEM_ml_kem_768_cuda) + add_library(ml_kem_768_cuda OBJECT cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu) + target_include_libraries(ml_kem_768_cuda cupqc) + set_property(TARGET ml_kem_768_cuda PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options(ml_kem_768_cuda PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) + set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() if(OQS_ENABLE_KEM_ml_kem_1024) @@ -75,12 +75,12 @@ if(OQS_ENABLE_KEM_ml_kem_1024_avx2) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() -if(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - add_library(ml_kem_1024_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu) - target_include_libraries(ml_kem_1024_cupqc cupqc) - set_property(TARGET ml_kem_1024_cupqc PROPERTY CUDA_ARCHITECTURES OFF) - target_compile_options(ml_kem_1024_cupqc PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) - set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) +if(OQS_ENABLE_KEM_ml_kem_1024_cuda) + add_library(ml_kem_1024_cuda OBJECT cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu) + target_include_libraries(ml_kem_1024_cuda cupqc) + set_property(TARGET ml_kem_1024_cuda PROPERTY CUDA_ARCHITECTURES OFF) + target_compile_options(ml_kem_1024_cuda PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) + set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) endif() set(ML_KEM_OBJS ${_ML_KEM_OBJS} PARENT_SCOPE) diff --git a/src/kem/ml_kem/cupqc_ml-kem/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu similarity index 100% rename from src/kem/ml_kem/cupqc_ml-kem/cupqc_ml-kem.cu rename to src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu diff --git a/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu new file mode 100644 index 0000000000..2935c2cd6a --- /dev/null +++ b/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu @@ -0,0 +1,198 @@ +/* + * Copyright 2025 Nvidia Corporation + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +**/ + +#include +#include +#include + +using namespace cupqc; + +// Checks the return value from a CUDA API function +#define CUDA_CHECK(err) \ + if (err != cudaSuccess) { failure = true; goto cleanup; } + +template +__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; + MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); +} + +template +int keypair(uint8_t *pk, uint8_t *sk) { + using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_pk = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); + + // Run routine + keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_pk != nullptr) cudaFree(d_pk); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; + MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); +} + +template +int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); + + // Run routine + encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_pk != nullptr) cudaFree(d_pk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { + __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; + MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); +} + +template +int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); + + // Run routine + decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + + return failure ? -1 : 0; +} + +extern "C" { + using KEM_512 = decltype(ML_KEM_512() + Block()); + using KEM_768 = decltype(ML_KEM_768() + Block()); + using KEM_1024 = decltype(ML_KEM_1024() + Block()); + +#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) + int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif + +#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) + int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif + +#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) + int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif +} diff --git a/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu new file mode 100644 index 0000000000..2935c2cd6a --- /dev/null +++ b/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu @@ -0,0 +1,198 @@ +/* + * Copyright 2025 Nvidia Corporation + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +**/ + +#include +#include +#include + +using namespace cupqc; + +// Checks the return value from a CUDA API function +#define CUDA_CHECK(err) \ + if (err != cudaSuccess) { failure = true; goto cleanup; } + +template +__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size]; + MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr); +} + +template +int keypair(uint8_t *pk, uint8_t *sk) { + using MLKEM_Keygen = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_pk = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size)); + + // Run routine + keygen_kernel<<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_pk != nullptr) cudaFree(d_pk); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) { + __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size]; + MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr); +} + +template +int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + using MLKEM_Encaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr, *randombytes=nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + randombytes = get_entropy(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault)); + + // Run routine + encaps_kernel<<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_pk != nullptr) cudaFree(d_pk); + if (workspace != nullptr) destroy_workspace(workspace); + if (randombytes != nullptr) release_entropy(randombytes); + + return failure ? -1 : 0; +} + +template +__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) { + __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size]; + MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr); +} + +template +int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + using MLKEM_Decaps = decltype(MLKEM_Base() + Function()); + + bool failure = false; + uint8_t *workspace = nullptr; + uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr; + + // Allocate device workspaces + try { + workspace = make_workspace(1); + } catch (const std::runtime_error& ex) { + failure = true; + goto cleanup; + } + CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size)); + CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size)); + CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size)); + + // Copy data to GPU + CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault)); + + // Run routine + decaps_kernel<<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace); + + // Copy data back to the host + CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault)); + +cleanup: + // Free device memory + if (d_ct != nullptr) cudaFree(d_ct); + if (d_ss != nullptr) cudaFree(d_ss); + if (d_sk != nullptr) cudaFree(d_sk); + if (workspace != nullptr) destroy_workspace(workspace); + + return failure ? -1 : 0; +} + +extern "C" { + using KEM_512 = decltype(ML_KEM_512() + Block()); + using KEM_768 = decltype(ML_KEM_768() + Block()); + using KEM_1024 = decltype(ML_KEM_1024() + Block()); + +#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) + int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif + +#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) + int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif + +#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) + int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { + return keypair(pk, sk); + } + int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { + return encaps(ct, ss, pk); + } + int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { + return decaps(ss, ct, sk); + } +#endif +} diff --git a/src/kem/ml_kem/kem_ml_kem_1024.c b/src/kem/ml_kem/kem_ml_kem_1024.c index 2d4d46df8b..1e471af58a 100644 --- a/src/kem/ml_kem/kem_ml_kem_1024.c +++ b/src/kem/ml_kem/kem_ml_kem_1024.c @@ -41,17 +41,17 @@ extern int pqcrystals_ml_kem_1024_avx2_dec(uint8_t *ss, const uint8_t *ct, const #endif #if defined(OQS_USE_CUPQC) -#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) -extern int ml_kem_1024_cupqc_keypair(uint8_t *pk, uint8_t *sk); -extern int ml_kem_1024_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); -extern int ml_kem_1024_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#if defined(OQS_ENABLE_KEM_ml_kem_1024_cuda) +extern int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk); +extern int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif #endif /* OQS_USE_CUPQC */ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *secret_key) { -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - return (OQS_STATUS) ml_kem_1024_cupqc_keypair(public_key, secret_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */ +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cuda) + return (OQS_STATUS) cupqc_ml_kem_1024_keypair(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -68,9 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *sec } OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - return (OQS_STATUS) ml_kem_1024_cupqc_enc(ciphertext, shared_secret, public_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */ +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cuda) + return (OQS_STATUS) cupqc_ml_kem_1024_enc(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -87,9 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shar } OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - return (OQS_STATUS) ml_kem_1024_cupqc_dec(shared_secret, ciphertext, secret_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */ +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cuda) + return (OQS_STATUS) cupqc_ml_kem_1024_dec(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { diff --git a/src/kem/ml_kem/kem_ml_kem_512.c b/src/kem/ml_kem/kem_ml_kem_512.c index 5e60018b37..41805f91be 100644 --- a/src/kem/ml_kem/kem_ml_kem_512.c +++ b/src/kem/ml_kem/kem_ml_kem_512.c @@ -41,17 +41,17 @@ extern int pqcrystals_ml_kem_512_avx2_dec(uint8_t *ss, const uint8_t *ct, const #endif #if defined(OQS_USE_CUPQC) -#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) -extern int ml_kem_512_cupqc_keypair(uint8_t *pk, uint8_t *sk); -extern int ml_kem_512_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); -extern int ml_kem_512_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#if defined(OQS_ENABLE_KEM_ml_kem_512_cuda) +extern int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk); +extern int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif #endif /* OQS_USE_CUPQC */ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secret_key) { -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) - return (OQS_STATUS) ml_kem_512_cupqc_keypair(public_key, secret_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */ +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cuda) + return (OQS_STATUS) cupqc_ml_kem_512_keypair(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -68,9 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secr } OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) - return (OQS_STATUS) ml_kem_512_cupqc_enc(ciphertext, shared_secret, public_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */ +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cuda) + return (OQS_STATUS) cupqc_ml_kem_512_enc(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -87,9 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *share } OQS_API OQS_STATUS OQS_KEM_ml_kem_512_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) - return (OQS_STATUS) ml_kem_512_cupqc_dec(shared_secret, ciphertext, secret_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */ +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cuda) + return (OQS_STATUS) cupqc_ml_kem_512_dec(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { diff --git a/src/kem/ml_kem/kem_ml_kem_768.c b/src/kem/ml_kem/kem_ml_kem_768.c index ceb651cedb..11a7421b20 100644 --- a/src/kem/ml_kem/kem_ml_kem_768.c +++ b/src/kem/ml_kem/kem_ml_kem_768.c @@ -41,17 +41,17 @@ extern int pqcrystals_ml_kem_768_avx2_dec(uint8_t *ss, const uint8_t *ct, const #endif #if defined(OQS_USE_CUPQC) -#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) -extern int ml_kem_768_cupqc_keypair(uint8_t *pk, uint8_t *sk); -extern int ml_kem_768_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); -extern int ml_kem_768_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); +#if defined(OQS_ENABLE_KEM_ml_kem_768_cuda) +extern int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk); +extern int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk); +extern int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk); #endif #endif /* OQS_USE_CUPQC */ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secret_key) { -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) - return (OQS_STATUS) ml_kem_768_cupqc_keypair(public_key, secret_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */ +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cuda) + return (OQS_STATUS) cupqc_ml_kem_768_keypair(public_key, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -68,9 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secr } OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) { -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) - return (OQS_STATUS) ml_kem_768_cupqc_enc(ciphertext, shared_secret, public_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */ +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cuda) + return (OQS_STATUS) cupqc_ml_kem_768_enc(ciphertext, shared_secret, public_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { @@ -87,9 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *share } OQS_API OQS_STATUS OQS_KEM_ml_kem_768_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) { -#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) - return (OQS_STATUS) ml_kem_768_cupqc_dec(shared_secret, ciphertext, secret_key); -#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */ +#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cuda) + return (OQS_STATUS) cupqc_ml_kem_768_dec(shared_secret, ciphertext, secret_key); +#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cuda */ #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2) #if defined(OQS_DIST_BUILD) if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) { diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake index 875584f74a..eb21d7b003 100644 --- a/src/oqsconfig.h.cmake +++ b/src/oqsconfig.h.cmake @@ -131,13 +131,13 @@ #cmakedefine OQS_ENABLE_KEM_ML_KEM 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_512 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_512_avx2 1 -#cmakedefine OQS_ENABLE_KEM_ml_kem_512_cupqc 1 +#cmakedefine OQS_ENABLE_KEM_ml_kem_512_cuda 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_768 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_768_avx2 1 -#cmakedefine OQS_ENABLE_KEM_ml_kem_768_cupqc 1 +#cmakedefine OQS_ENABLE_KEM_ml_kem_768_cuda 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024 1 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024_avx2 1 -#cmakedefine OQS_ENABLE_KEM_ml_kem_1024_cupqc 1 +#cmakedefine OQS_ENABLE_KEM_ml_kem_1024_cuda 1 #cmakedefine OQS_ENABLE_SIG_DILITHIUM 1 #cmakedefine OQS_ENABLE_SIG_dilithium_2 1 From cf46255bbe60c36e0885745ef79102d99c382018 Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Fri, 17 Jan 2025 17:45:28 +0100 Subject: [PATCH 06/17] Update licensing information Signed-off-by: Pravek Sharma --- CONFIGURE.md | 2 +- docs/algorithms/kem/ml_kem.md | 2 +- docs/algorithms/kem/ml_kem.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CONFIGURE.md b/CONFIGURE.md index 67fd7d0aab..02a8da2015 100644 --- a/CONFIGURE.md +++ b/CONFIGURE.md @@ -127,7 +127,7 @@ Only has an effect if the system supports `dlopen` and ELF binary format, such a ### OQS_USE_CUPQC -Can be `ON` or `OFF`. When `ON`, use NVIDIA's cuPQC library where able (currently just ML-KEM). When this option is enabled, liboqs may not run correctly on machines that lack supported GPUs. To download cuPQC follow the instructions at (https://developer.nvidia.com/cupqc-download/). Detailed descriptions of the API, requirments, and installation guide are in the cuPQC documentation (https://docs.nvidia.com/cuda/cupqc/index.html). +Can be `ON` or `OFF`. When `ON`, use NVIDIA's cuPQC library where able (currently just ML-KEM). When this option is enabled, liboqs may not run correctly on machines that lack supported GPUs. To download cuPQC follow the instructions at (https://developer.nvidia.com/cupqc-download/). Detailed descriptions of the API, requirements, and installation guide are in the cuPQC documentation (https://docs.nvidia.com/cuda/cupqc/index.html). While the code shipped by liboqs required to use cuPQC is licensed under Apache 2.0 the cuPQC SDK comes with its own license agreement (https://docs.nvidia.com/cuda/cupqc/license.html). **Default**: `OFF` diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md index 2d65aacff2..b0a7e786b3 100644 --- a/docs/algorithms/kem/ml_kem.md +++ b/docs/algorithms/kem/ml_kem.md @@ -12,7 +12,7 @@ - **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches - **cupqc-cuda**: - **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30 - - **Implementation license (SPDX-Identifier)**: https://docs.nvidia.com/cuda/cupqc/license.html + - **Implementation license (SPDX-Identifier)**: Apache-2.0 ## Parameter set summary diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml index 529e4c870c..d48f056080 100644 --- a/docs/algorithms/kem/ml_kem.yml +++ b/docs/algorithms/kem/ml_kem.yml @@ -23,7 +23,7 @@ primary-upstream: optimized-upstreams: cupqc-cuda: source: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30 - spdx-license-identifier: https://docs.nvidia.com/cuda/cupqc/license.html + spdx-license-identifier: Apache-2.0 parameter-sets: - name: ML-KEM-512 claimed-nist-level: 1 From ae3fd37b65497b763a8b83a11b9d7a6b9e46e103 Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Fri, 17 Jan 2025 20:28:05 +0100 Subject: [PATCH 07/17] Update PLATFORMS.md Signed-off-by: Pravek Sharma --- PLATFORMS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/PLATFORMS.md b/PLATFORMS.md index f1b3fc5ebd..544edd9319 100644 --- a/PLATFORMS.md +++ b/PLATFORMS.md @@ -63,3 +63,4 @@ In this policy, the words "must" and "must not" specify absolute requirements th - ppc641e for Ubuntu (Focal) - s390x for Ubuntu (Focal) - loongarch64 for Debian Linux (trixie) +- NVIDIA GPU architectures 70, 75, 80, 86, 89, and 90 with a x86_64 CPU for Linux From cf03b8da4832afdf13838474ce4d9358686c301c Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Mon, 20 Jan 2025 22:50:22 +0100 Subject: [PATCH 08/17] Fix kem_family cmakelists template Signed-off-by: Pravek Sharma --- scripts/copy_from_upstream/src/kem/family/CMakeLists.txt | 2 +- src/kem/ml_kem/CMakeLists.txt | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt index 004408aa3d..bd648d101d 100644 --- a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt +++ b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt @@ -37,7 +37,7 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}{%- if 'alias_scheme' in if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %}) add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/cupqc_ml-kem.cu) - target_include_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc) + target_link_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc) set_property(TARGET {{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PROPERTY CUDA_ARCHITECTURES OFF) target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE {{ impl['compile_opts'] }}) {%- else %} diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt index 76b53851b1..8af79b6d05 100644 --- a/src/kem/ml_kem/CMakeLists.txt +++ b/src/kem/ml_kem/CMakeLists.txt @@ -25,7 +25,7 @@ endif() if(OQS_ENABLE_KEM_ml_kem_512_cuda) add_library(ml_kem_512_cuda OBJECT cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu) - target_include_libraries(ml_kem_512_cuda cupqc) + target_link_libraries(ml_kem_512_cuda cupqc) set_property(TARGET ml_kem_512_cuda PROPERTY CUDA_ARCHITECTURES OFF) target_compile_options(ml_kem_512_cuda PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) @@ -51,7 +51,7 @@ endif() if(OQS_ENABLE_KEM_ml_kem_768_cuda) add_library(ml_kem_768_cuda OBJECT cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu) - target_include_libraries(ml_kem_768_cuda cupqc) + target_link_libraries(ml_kem_768_cuda cupqc) set_property(TARGET ml_kem_768_cuda PROPERTY CUDA_ARCHITECTURES OFF) target_compile_options(ml_kem_768_cuda PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) @@ -77,7 +77,7 @@ endif() if(OQS_ENABLE_KEM_ml_kem_1024_cuda) add_library(ml_kem_1024_cuda OBJECT cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu) - target_include_libraries(ml_kem_1024_cuda cupqc) + target_link_libraries(ml_kem_1024_cuda cupqc) set_property(TARGET ml_kem_1024_cuda PROPERTY CUDA_ARCHITECTURES OFF) target_compile_options(ml_kem_1024_cuda PRIVATE $<$:-rdc=true -dlto -arch=compute_70>) set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $) From 5dc3f495e50dad1c3a12e27ed8ddfd6de460e123 Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Tue, 21 Jan 2025 19:14:23 +0100 Subject: [PATCH 09/17] Run copy_from_upsream.py and pull updated upstream Signed-off-by: Pravek Sharma --- docs/algorithms/kem/ml_kem.md | 2 +- docs/algorithms/kem/ml_kem.yml | 2 +- .../copy_from_upstream/copy_from_upstream.yml | 2 +- .../cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu | 28 +------------------ .../cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu | 28 +------------------ .../cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu | 28 +------------------ 6 files changed, 6 insertions(+), 84 deletions(-) diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md index b0a7e786b3..eeaf299dde 100644 --- a/docs/algorithms/kem/ml_kem.md +++ b/docs/algorithms/kem/ml_kem.md @@ -11,7 +11,7 @@ - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0 - **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches - **cupqc-cuda**: - - **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30 + - **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e - **Implementation license (SPDX-Identifier)**: Apache-2.0 diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml index d48f056080..498617ff45 100644 --- a/docs/algorithms/kem/ml_kem.yml +++ b/docs/algorithms/kem/ml_kem.yml @@ -22,7 +22,7 @@ primary-upstream: spdx-license-identifier: CC0-1.0 or Apache-2.0 optimized-upstreams: cupqc-cuda: - source: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30 + source: https://github.com/praveksharma/cupqc-mlkem/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e spdx-license-identifier: Apache-2.0 parameter-sets: - name: ML-KEM-512 diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index b2f74f566e..23d1f3a22d 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -42,7 +42,7 @@ upstreams: name: cupqc git_url: https://github.com/praveksharma/cupqc-mlkem.git git_branch: main - git_commit: adb8454e56979628c07b67eb7d90f9337be6dc30 + git_commit: b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e kem_meta_path: '{pretty_name_full}_META.yml' kem_scheme_path: '.' patches: [] diff --git a/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu index 2935c2cd6a..188e2f100d 100644 --- a/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu +++ b/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu @@ -156,35 +156,9 @@ cleanup: } extern "C" { - using KEM_512 = decltype(ML_KEM_512() + Block()); - using KEM_768 = decltype(ML_KEM_768() + Block()); using KEM_1024 = decltype(ML_KEM_1024() + Block()); -#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) - int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif - -#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) - int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif - -#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) +#if defined(OQS_ENABLE_KEM_ml_kem_1024_cuda) int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { return keypair(pk, sk); } diff --git a/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu index 2935c2cd6a..48ca5ca31a 100644 --- a/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu +++ b/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu @@ -157,10 +157,8 @@ cleanup: extern "C" { using KEM_512 = decltype(ML_KEM_512() + Block()); - using KEM_768 = decltype(ML_KEM_768() + Block()); - using KEM_1024 = decltype(ML_KEM_1024() + Block()); -#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) +#if defined(OQS_ENABLE_KEM_ml_kem_512_cuda) int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { return keypair(pk, sk); } @@ -171,28 +169,4 @@ extern "C" { return decaps(ss, ct, sk); } #endif - -#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) - int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif - -#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif } diff --git a/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu index 2935c2cd6a..594c1f4c24 100644 --- a/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu +++ b/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu @@ -156,23 +156,9 @@ cleanup: } extern "C" { - using KEM_512 = decltype(ML_KEM_512() + Block()); using KEM_768 = decltype(ML_KEM_768() + Block()); - using KEM_1024 = decltype(ML_KEM_1024() + Block()); -#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc) - int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif - -#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc) +#if defined(OQS_ENABLE_KEM_ml_kem_768_cuda) int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) { return keypair(pk, sk); } @@ -183,16 +169,4 @@ extern "C" { return decaps(ss, ct, sk); } #endif - -#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc) - int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) { - return keypair(pk, sk); - } - int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - return encaps(ct, ss, pk); - } - int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - return decaps(ss, ct, sk); - } -#endif } From a3126fc778881c41f416e0b231e2ce829cb51c91 Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Tue, 21 Jan 2025 23:13:52 +0100 Subject: [PATCH 10/17] Add cupqc build test to basic.yml Signed-off-by: Pravek Sharma --- .github/workflows/basic.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml index 57b6374934..64589aac7d 100644 --- a/.github/workflows/basic.yml +++ b/.github/workflows/basic.yml @@ -89,6 +89,36 @@ jobs: run: ninja gen_docs working-directory: ${{ env.RANDOM_BUILD_DIR }} + cupqc-buildcheck: + name: Check that code build with OQS_USE_CUPQC=ON + needs: [buildcheck] + runs-on: ubuntu-latest + container: openquantumsafe/ci-ubuntu-latest:latest + steps: + - name: Create random build folder + run: tmp_build=$(mktemp -d) && echo "RANDOM_BUILD_DIR=$tmp_build" >> $GITHUB_ENV + - name: + - name: Checkout code + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 + - name: Configure + run: | + cuPQC_DIR=$cuPQC_DIR \ + cmake \ + -B ${{ env.RANDOM_BUILD_DIR }} \ + -GNinja \ + -DOQS_USE_CUPQC=ON \ + -DOQS_STRICT_WARNINGS=ON \ + --warn-uninitialized . > config.log 2>&1 && \ + cat config.log && \ + cmake -LA -N . && \ + ! (grep -i "uninitialized variable" config.log) + - name: Build code + run: ninja + working-directory: ${{ env.RANDOM_BUILD_DIR }} + - name: Build documentation + run: ninja gen_docs + working-directory: ${{ env.RANDOM_BUILD_DIR }} + cppcheck: name: Check C++ linking with example program runs-on: ubuntu-latest From 0fa3031bb9477613b6f449b480f83cc236e15014 Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Tue, 21 Jan 2025 23:31:01 +0100 Subject: [PATCH 11/17] Move cupqc build test from basic.yml to linux.yml Signed-off-by: Pravek Sharma --- .github/workflows/basic.yml | 30 ------------------------------ .github/workflows/linux.yml | 27 +++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml index 64589aac7d..57b6374934 100644 --- a/.github/workflows/basic.yml +++ b/.github/workflows/basic.yml @@ -89,36 +89,6 @@ jobs: run: ninja gen_docs working-directory: ${{ env.RANDOM_BUILD_DIR }} - cupqc-buildcheck: - name: Check that code build with OQS_USE_CUPQC=ON - needs: [buildcheck] - runs-on: ubuntu-latest - container: openquantumsafe/ci-ubuntu-latest:latest - steps: - - name: Create random build folder - run: tmp_build=$(mktemp -d) && echo "RANDOM_BUILD_DIR=$tmp_build" >> $GITHUB_ENV - - name: - - name: Checkout code - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 - - name: Configure - run: | - cuPQC_DIR=$cuPQC_DIR \ - cmake \ - -B ${{ env.RANDOM_BUILD_DIR }} \ - -GNinja \ - -DOQS_USE_CUPQC=ON \ - -DOQS_STRICT_WARNINGS=ON \ - --warn-uninitialized . > config.log 2>&1 && \ - cat config.log && \ - cmake -LA -N . && \ - ! (grep -i "uninitialized variable" config.log) - - name: Build code - run: ninja - working-directory: ${{ env.RANDOM_BUILD_DIR }} - - name: Build documentation - run: ninja gen_docs - working-directory: ${{ env.RANDOM_BUILD_DIR }} - cppcheck: name: Check C++ linking with example program runs-on: ubuntu-latest diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index bb412d17c1..2ac89c43d5 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -188,6 +188,33 @@ jobs: --numprocesses=auto \ --ignore=tests/test_code_conventions.py ${{ matrix.PYTEST_ARGS }}" + cupqc-buildcheck: + name: Check that code builds with OQS_USE_CUPQC=ON + needs: [buildcheck] + runs-on: ubuntu-latest + container: openquantumsafe/ci-ubuntu-latest:latest + steps: + - name: Create random build folder + run: tmp_build=$(mktemp -d) && echo "RANDOM_BUILD_DIR=$tmp_build" >> $GITHUB_ENV + - name: + - name: Checkout code + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 + - name: Configure + run: | + cuPQC_DIR=$cuPQC_DIR \ + cmake \ + -B ${{ env.RANDOM_BUILD_DIR }} \ + -GNinja \ + -DOQS_USE_CUPQC=ON \ + -DOQS_STRICT_WARNINGS=ON \ + --warn-uninitialized . > config.log 2>&1 && \ + cat config.log && \ + cmake -LA -N . && \ + ! (grep -i "uninitialized variable" config.log) + - name: Build code + run: ninja + working-directory: ${{ env.RANDOM_BUILD_DIR }} + linux_cross_compile: runs-on: ubuntu-latest container: openquantumsafe/ci-ubuntu-latest:latest From 512d07115479fbe392c4c31a3ec2a00a8d9d11b1 Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Tue, 21 Jan 2025 23:39:03 +0100 Subject: [PATCH 12/17] Fix error in linux.yml Signed-off-by: Pravek Sharma --- .github/workflows/linux.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 2ac89c43d5..b85d02a055 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -196,7 +196,6 @@ jobs: steps: - name: Create random build folder run: tmp_build=$(mktemp -d) && echo "RANDOM_BUILD_DIR=$tmp_build" >> $GITHUB_ENV - - name: - name: Checkout code uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 - name: Configure From 7f2eb3017e2cae2f4473ee18837f4cf75f50e92b Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Tue, 21 Jan 2025 23:41:06 +0100 Subject: [PATCH 13/17] fixup! Fix error in linux.yml Signed-off-by: Pravek Sharma --- .github/workflows/linux.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index b85d02a055..e0a7daccff 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -190,7 +190,6 @@ jobs: cupqc-buildcheck: name: Check that code builds with OQS_USE_CUPQC=ON - needs: [buildcheck] runs-on: ubuntu-latest container: openquantumsafe/ci-ubuntu-latest:latest steps: From 7fbdd134174e311f2bb5eac1605198147f85d719 Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Thu, 23 Jan 2025 19:48:53 +0100 Subject: [PATCH 14/17] Redo cupqc build check Signed-off-by: Pravek Sharma --- .github/workflows/linux.yml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index e0a7daccff..03d206f0ff 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -193,25 +193,13 @@ jobs: runs-on: ubuntu-latest container: openquantumsafe/ci-ubuntu-latest:latest steps: - - name: Create random build folder - run: tmp_build=$(mktemp -d) && echo "RANDOM_BUILD_DIR=$tmp_build" >> $GITHUB_ENV - name: Checkout code uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 - name: Configure - run: | - cuPQC_DIR=$cuPQC_DIR \ - cmake \ - -B ${{ env.RANDOM_BUILD_DIR }} \ - -GNinja \ - -DOQS_USE_CUPQC=ON \ - -DOQS_STRICT_WARNINGS=ON \ - --warn-uninitialized . > config.log 2>&1 && \ - cat config.log && \ - cmake -LA -N . && \ - ! (grep -i "uninitialized variable" config.log) + run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR cmake -GNinja -DOQS_USE_CUPQC=ON .. && cmake -LA -N .. - name: Build code run: ninja - working-directory: ${{ env.RANDOM_BUILD_DIR }} + working-directory: build linux_cross_compile: runs-on: ubuntu-latest From 083b26792c352d9e002a979ad65a310724b88f52 Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Thu, 23 Jan 2025 20:54:17 +0100 Subject: [PATCH 15/17] Supply default CUDA arch to cupqc-buildcheck configuration stage Signed-off-by: Pravek Sharma --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 03d206f0ff..603ab7466b 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -196,7 +196,7 @@ jobs: - name: Checkout code uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 - name: Configure - run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR cmake -GNinja -DOQS_USE_CUPQC=ON .. && cmake -LA -N .. + run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N .. - name: Build code run: ninja working-directory: build From 1ee99730a67ad14b235cf334b07b213a3321763b Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Thu, 23 Jan 2025 21:22:42 +0100 Subject: [PATCH 16/17] Specify CUDAXX in cupqc-buildcheck Signed-off-by: Pravek Sharma --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 603ab7466b..a12931c1c8 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -196,7 +196,7 @@ jobs: - name: Checkout code uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 - name: Configure - run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N .. + run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR CUDACXX=/usr/local/cuda-12.6/bin/nvcc cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N .. - name: Build code run: ninja working-directory: build From c94b7c73dea4c8af15bcb4e98f6fa70f5557e574 Mon Sep 17 00:00:00 2001 From: Pravek Sharma Date: Thu, 23 Jan 2025 21:51:04 +0100 Subject: [PATCH 17/17] Make cuPQC_DIR explicit in cupqc-buildcheck Signed-off-by: Pravek Sharma --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index a12931c1c8..8059ffc993 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -196,7 +196,7 @@ jobs: - name: Checkout code uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 - name: Configure - run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR CUDACXX=/usr/local/cuda-12.6/bin/nvcc cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N .. + run: mkdir build && cd build && cuPQC_DIR=/cupqc/cupqc/cupqc-pkg-0.2.0/cmake/ CUDACXX=/usr/local/cuda-12.6/bin/nvcc cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N .. - name: Build code run: ninja working-directory: build