From fb2a8e500d970740c7bc2ad4de310f8e911f2dd7 Mon Sep 17 00:00:00 2001
From: Steven Reeves <sreeves@nvidia.com>
Date: Wed, 15 Jan 2025 07:42:12 -0800
Subject: [PATCH 01/17] Adding cuPQC as a backend for ML-KEM.

Signed-off-by: Steven Reeves <sreeves@nvidia.com>
---
 CMakeLists.txt                                |  10 +
 CONFIGURE.md                                  |   8 +
 .../add_enable_by_alg_conditional.fragment    |  12 +
 .../patches/pqcrystals-ml_kem.patch           | 271 +++++++++++++++++-
 .../src/kem/family/CMakeLists.txt             |   9 +
 .../src/kem/family/kem_scheme.c               |  28 +-
 src/CMakeLists.txt                            |   5 +
 src/kem/ml_kem/CMakeLists.txt                 |  24 ++
 src/kem/ml_kem/kem_ml_kem_1024.c              |  17 ++
 src/kem/ml_kem/kem_ml_kem_512.c               |  17 ++
 src/kem/ml_kem/kem_ml_kem_768.c               |  17 ++
 .../cupqc_ml-kem.cu                           | 198 +++++++++++++
 .../cupqc_ml-kem.cu                           | 198 +++++++++++++
 .../cupqc_ml-kem.cu                           | 198 +++++++++++++
 src/oqsconfig.h.cmake                         |   5 +
 15 files changed, 1007 insertions(+), 10 deletions(-)
 create mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu
 create mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu
 create mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 114961ed7f..09667e71ae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -140,6 +140,16 @@ else()
     message(FATAL_ERROR "Unknown or unsupported processor: " ${CMAKE_SYSTEM_PROCESSOR} ". Override by setting OQS_PERMIT_UNSUPPORTED_ARCHITECTURE=ON")
 endif()
 
+if(${OQS_USE_CUPQC})
+    # CMAKE's CUDA language requires CMAKE 3.18
+    cmake_minimum_required (VERSION 3.18)
+    enable_language(CUDA)
+    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+      set(CMAKE_CUDA_ARCHITECTURES 80 90)
+    endif()
+    find_package(cuPQC 0.2.0 REQUIRED)
+endif()
+
 if (NOT ((CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin") AND (ARCH_X86_64 STREQUAL "ON")) AND (OQS_LIBJADE_BUILD STREQUAL "ON"))
     message(FATAL_ERROR "Building liboqs with libjade implementations from libjade is only supported on Linux and Darwin on x86_64.")
 endif()
diff --git a/CONFIGURE.md b/CONFIGURE.md
index 1d00565f31..67fd7d0aab 100644
--- a/CONFIGURE.md
+++ b/CONFIGURE.md
@@ -13,6 +13,7 @@ The following options can be passed to CMake before the build file generation pr
 - [OQS_DIST_BUILD](#OQS_DIST_BUILD)
 - [OQS_USE_CPUFEATURE_INSTRUCTIONS](#OQS_USE_CPUFEATURE_INSTRUCTIONS)
 - [OQS_USE_OPENSSL](#OQS_USE_OPENSSL)
+- [OQS_USE_CUPQC](#OQS_USE_CUPQC)
 - [OQS_OPT_TARGET](#OQS_OPT_TARGET)
 - [OQS_SPEED_USE_ARM_PMU](#OQS_SPEED_USE_ARM_PMU)
 - [USE_SANITIZER](#USE_SANITIZER)
@@ -124,6 +125,13 @@ Dynamically load OpenSSL through `dlopen`. When using liboqs from other cryptogr
 
 Only has an effect if the system supports `dlopen` and ELF binary format, such as Linux or BSD family.
 
+### OQS_USE_CUPQC
+
+Can be `ON` or `OFF`.  When `ON`, use NVIDIA's cuPQC library where able (currently just ML-KEM).  When this option is enabled, liboqs may not run correctly on machines that lack supported GPUs. To download cuPQC follow the instructions at (https://developer.nvidia.com/cupqc-download/). Detailed descriptions of the API, requirments, and installation guide are in the cuPQC documentation (https://docs.nvidia.com/cuda/cupqc/index.html). 
+
+**Default**: `OFF`
+
+
 ## Stateful Hash Based Signatures 
 
 XMSS and LMS are the two supported Hash-Based Signatures schemes.
diff --git a/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment b/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment
index daed5514c0..0830c024fd 100644
--- a/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment
+++ b/scripts/copy_from_upstream/.CMake/alg_support.cmake/add_enable_by_alg_conditional.fragment
@@ -11,6 +11,18 @@ if(OQS_DIST_X86_64_BUILD OR ({% for flag in platform['required_flags'] -%} OQS_U
 {%- endif %}
 endif()
 {% if platform['operating_systems'] %}endif()
+{% endif -%}
+            {%- endfor -%}
+            {%- for platform in impl['supported_platforms'] if platform['architecture'] == 'CUDA' %}
+{% if platform['operating_systems'] %}if(CMAKE_SYSTEM_NAME MATCHES "{{ platform['operating_systems']|join('|') }}")
+{% endif -%}
+if(OQS_USE_CUPQC)
+    cmake_dependent_option(OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['scheme'] }}_{{ impl['name']  }} "" ON "OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['scheme'] }}" OFF)
+{%- if 'alias_scheme' in scheme %}
+    cmake_dependent_option(OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['alias_scheme'] }}_{{ impl['name']  }} "" ON "OQS_ENABLE_KEM_{{ family['name'] }}_{{ scheme['alias_scheme'] }}" OFF)
+{%- endif %}
+endif()
+{% if platform['operating_systems'] %}endif()
 {% endif -%}
             {%- endfor -%}
             {%- for platform in impl['supported_platforms'] if platform['architecture'] == 'ARM64_V8' %}
diff --git a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch
index d65eea2f30..a44b1c232e 100644
--- a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch
+++ b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch
@@ -1,8 +1,8 @@
 diff --git a/Kyber1024_META.yml b/ML-KEM-1024_META.yml
-similarity index 55%
+similarity index 50%
 rename from Kyber1024_META.yml
 rename to ML-KEM-1024_META.yml
-index baa5ca3..67243b8 100644
+index baa5ca3..42f147e 100644
 --- a/Kyber1024_META.yml
 +++ b/ML-KEM-1024_META.yml
 @@ -1,4 +1,4 @@
@@ -55,11 +55,29 @@ index baa5ca3..67243b8 100644
      supported_platforms:
        - architecture: x86_64
          operating_systems:
+@@ -47,3 +45,17 @@ implementations:
+           - avx2
+           - bmi2
+           - popcnt
++  - name: cupqc
++    version: FIPS203
++    compile_opts: $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>
++    signature_keypair: ml_kem_1024_cupqc_keypair
++    signature_enc: ml_kem_1024_cupqc_enc
++    signature_dec: ml_kem_1024_cupqc_dec
++    sources: cupqc_ml-kem.cu
++    supported_platforms:
++      - architecture: CUDA
++        operating_systems:
++          - Linux
++          - Darwin
++        required_flags:
++          - dummy
 diff --git a/Kyber512_META.yml b/ML-KEM-512_META.yml
-similarity index 55%
+similarity index 50%
 rename from Kyber512_META.yml
 rename to ML-KEM-512_META.yml
-index b251701..18c28b0 100644
+index b251701..a74f7ec 100644
 --- a/Kyber512_META.yml
 +++ b/ML-KEM-512_META.yml
 @@ -1,4 +1,4 @@
@@ -112,11 +130,30 @@ index b251701..18c28b0 100644
      supported_platforms:
        - architecture: x86_64
          operating_systems:
+@@ -47,3 +45,17 @@ implementations:
+           - avx2
+           - bmi2
+           - popcnt
++  - name: cupqc
++    version: FIPS203
++    compile_opts: $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>
++    signature_keypair: ml_kem_512_cupqc_keypair
++    signature_enc: ml_kem_512_cupqc_enc
++    signature_dec: ml_kem_512_cupqc_dec
++    sources: cupqc_ml-kem.cu
++    supported_platforms:
++      - architecture: CUDA
++        operating_systems:
++          - Linux
++          - Darwin
++        required_flags:
++          - dummy
+\ No newline at end of file
 diff --git a/Kyber768_META.yml b/ML-KEM-768_META.yml
-similarity index 55%
+similarity index 50%
 rename from Kyber768_META.yml
 rename to ML-KEM-768_META.yml
-index 7a0cc3d..ccc03c9 100644
+index 7a0cc3d..2c0e23f 100644
 --- a/Kyber768_META.yml
 +++ b/ML-KEM-768_META.yml
 @@ -1,4 +1,4 @@
@@ -169,6 +206,24 @@ index 7a0cc3d..ccc03c9 100644
      supported_platforms:
        - architecture: x86_64
          operating_systems:
+@@ -47,3 +45,17 @@ implementations:
+           - avx2
+           - bmi2
+           - popcnt
++  - name: cupqc
++    version: FIPS203
++    compile_opts: $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>
++    signature_keypair: ml_kem_768_cupqc_keypair
++    signature_enc: ml_kem_768_cupqc_enc
++    signature_dec: ml_kem_768_cupqc_dec
++    sources: cupqc_ml-kem.cu
++    supported_platforms:
++      - architecture: CUDA
++        operating_systems:
++          - Linux
++          - Darwin
++        required_flags:
++          - dummy
 diff --git a/avx2/indcpa.c b/avx2/indcpa.c
 index 18b9d08..c4b2b3a 100644
 --- a/avx2/indcpa.c
@@ -356,6 +411,210 @@ index 627b891..e4941f7 100644
                             const uint8_t seed[KYBER_SYMBYTES],
                             uint8_t x,
                             uint8_t y);
+diff --git a/cupqc/cupqc_ml-kem.cu b/cupqc/cupqc_ml-kem.cu
+new file mode 100644
+index 0000000..2935c2c
+--- /dev/null
++++ b/cupqc/cupqc_ml-kem.cu
+@@ -0,0 +1,198 @@
++/*
++ * Copyright 2025 Nvidia Corporation
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at
++ *     http://www.apache.org/licenses/LICENSE-2.0
++ * Unless required by applicable law or agreed to in writing, software
++ * distributed under the License is distributed on an "AS IS" BASIS,
++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++ * See the License for the specific language governing permissions and
++ * limitations under the License.
++**/
++
++#include <cupqc.hpp>
++#include <stdexcept>
++#include <oqs/oqsconfig.h>
++
++using namespace cupqc;
++
++// Checks the return value from a CUDA API function
++#define CUDA_CHECK(err) \
++    if (err != cudaSuccess) { failure = true; goto cleanup; }
++
++template<class MLKEM_Keygen>
++__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) {
++    __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size];
++    MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr);
++}
++
++template<class MLKEM_Base>
++int keypair(uint8_t *pk, uint8_t *sk) {
++    using MLKEM_Keygen = decltype(MLKEM_Base() + Function<function::Keygen>());
++
++    bool failure = false;
++    uint8_t *workspace = nullptr, *randombytes=nullptr;
++    uint8_t *d_pk = nullptr, *d_sk = nullptr;
++
++    // Allocate device workspaces
++    try {
++        workspace   = make_workspace<MLKEM_Keygen>(1);
++        randombytes = get_entropy<MLKEM_Keygen>(1);
++    } catch (const std::runtime_error& ex) {
++        failure = true;
++        goto cleanup;
++    }
++    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size));
++    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size));
++
++    // Run routine
++    keygen_kernel<MLKEM_Keygen><<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes);
++
++    // Copy data back to the host
++    CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault));
++    CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault));
++
++cleanup:
++    // Free device memory
++    if (d_pk != nullptr) cudaFree(d_pk);
++    if (d_sk != nullptr) cudaFree(d_sk);
++    if (workspace != nullptr) destroy_workspace(workspace);
++    if (randombytes != nullptr) release_entropy(randombytes);
++
++    return failure ? -1 : 0;
++}
++
++template<class MLKEM_Encaps>
++__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) {
++    __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size];
++    MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr);
++}
++
++template<class MLKEM_Base>
++int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
++    using MLKEM_Encaps = decltype(MLKEM_Base() + Function<function::Encaps>());
++
++    bool failure = false;
++    uint8_t *workspace = nullptr, *randombytes=nullptr;
++    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr;
++
++    // Allocate device workspaces
++    try {
++        workspace   = make_workspace<MLKEM_Encaps>(1);
++        randombytes = get_entropy<MLKEM_Encaps>(1);
++    } catch (const std::runtime_error& ex) {
++        failure = true;
++        goto cleanup;
++    }
++    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size));
++    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size));
++    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size));
++
++    // Copy data to GPU
++    CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault));
++
++    // Run routine
++    encaps_kernel<MLKEM_Encaps><<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes);
++
++    // Copy data back to the host
++    CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault));
++    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault));
++
++cleanup:
++    // Free device memory
++    if (d_ct != nullptr) cudaFree(d_ct);
++    if (d_ss != nullptr) cudaFree(d_ss);
++    if (d_pk != nullptr) cudaFree(d_pk);
++    if (workspace != nullptr) destroy_workspace(workspace);
++    if (randombytes != nullptr) release_entropy(randombytes);
++
++    return failure ? -1 : 0;
++}
++
++template<class MLKEM_Decaps>
++__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) {
++    __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size];
++    MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr);
++}
++
++template<class MLKEM_Base>
++int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
++    using MLKEM_Decaps = decltype(MLKEM_Base() + Function<function::Decaps>());
++
++    bool failure = false;
++    uint8_t *workspace = nullptr;
++    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr;
++
++    // Allocate device workspaces
++    try {
++        workspace = make_workspace<MLKEM_Decaps>(1);
++    } catch (const std::runtime_error& ex) {
++        failure = true;
++        goto cleanup;
++    }
++    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size));
++    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size));
++    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size));
++
++    // Copy data to GPU
++    CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault));
++    CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault));
++
++    // Run routine
++    decaps_kernel<MLKEM_Decaps><<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace);
++
++    // Copy data back to the host
++    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault));
++
++cleanup:
++    // Free device memory
++    if (d_ct != nullptr) cudaFree(d_ct);
++    if (d_ss != nullptr) cudaFree(d_ss);
++    if (d_sk != nullptr) cudaFree(d_sk);
++    if (workspace != nullptr) destroy_workspace(workspace);
++
++    return failure ? -1 : 0;
++}
++
++extern "C" {
++    using KEM_512  = decltype(ML_KEM_512()  + Block());
++    using KEM_768  = decltype(ML_KEM_768()  + Block());
++    using KEM_1024 = decltype(ML_KEM_1024() + Block());
++
++#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
++    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
++        return keypair<KEM_512>(pk, sk);
++    }
++    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
++        return encaps<KEM_512>(ct, ss, pk);
++    }
++    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
++        return decaps<KEM_512>(ss, ct, sk);
++    }
++#endif
++
++#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
++    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
++        return keypair<KEM_768>(pk, sk);
++    }
++    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
++        return encaps<KEM_768>(ct, ss, pk);
++    }
++    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
++        return decaps<KEM_768>(ss, ct, sk);
++    }
++#endif
++
++#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
++    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
++        return keypair<KEM_1024>(pk, sk);
++    }
++    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
++        return encaps<KEM_1024>(ct, ss, pk);
++    }
++    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
++        return decaps<KEM_1024>(ss, ct, sk);
++    }
++#endif
++}
 diff --git a/ref/indcpa.c b/ref/indcpa.c
 index 9a78c09..726cfa9 100644
 --- a/ref/indcpa.c
diff --git a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
index ca9d41eac0..100ddaab7e 100644
--- a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
+++ b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
@@ -33,11 +33,19 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}{%- if 'alias_scheme' in
     target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PUBLIC {{ impl['compile_opts'] }})
            {%- endif -%}
 
+        {%- elif impl['name'] == 'cupqc' %}
+
+if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %})
+    add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/cupqc_ml-kem.cu)
+    target_include_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc)
+    set_property(TARGET {{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PROPERTY CUDA_ARCHITECTURES OFF)
+    target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE {{ impl['compile_opts'] }})
         {%- else %}
 
 if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %})
     add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {% for source_file in impl['sources']|sort -%}{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/{{ source_file }}{%- if not loop.last %} {% endif -%}{%- endfor -%})
         {%- endif %}
+        {%- if impl['name'] != 'cupqc' %}
     target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }})
     target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
         {%- if impl['name'] != scheme['default_implementation'] and impl['required_flags'] -%}
@@ -60,6 +68,7 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if
         target_compile_definitions({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE old_gas_syntax)
     endif()
     	{%- endif %}
+        {%- endif %}{# cupqc #}
     set(_{{ family|upper }}_OBJS ${_{{ family|upper }}_OBJS} $<TARGET_OBJECTS:{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}>)
 endif()
     {%- endfor -%}
diff --git a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c
index 108078ffcd..cf8a705031 100644
--- a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c
+++ b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c
@@ -93,7 +93,9 @@ extern int {{ scheme['metadata']['default_dec_signature']  }}(uint8_t *ss, const
     {%- endfor %}
 
     {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %}
-
+{% if impl['name'] == 'cupqc'%}
+#if defined(OQS_USE_CUPQC)
+        {%- endif %}
 #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %}
         {%- if impl['signature_keypair'] %}
 extern int {{ impl['signature_keypair'] }}(uint8_t *pk, uint8_t *sk);
@@ -113,6 +115,9 @@ extern int {{ impl['signature_dec'] }}(uint8_t *ss, const uint8_t *ct, const uin
 extern int PQCLEAN_{{ scheme['pqclean_scheme_c']|upper }}_{{ impl['name']|upper }}_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
         {%- endif %}
 #endif
+        {%- if impl['name'] == 'cupqc'%}
+#endif /* OQS_USE_CUPQC */
+        {%- endif %}
     {%- endfor %}
 
 {%- if libjade_implementation is defined and scheme['libjade_implementation'] %}
@@ -166,7 +171,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_keypair(uint8_t *
 {% endfor -%}
 #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/
 {%- endif %}
-    {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %}
+    {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %}
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc)
+    return (OQS_STATUS) {{ impl['signature_keypair'] }}(public_key, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */
+    {%- endfor %}
+    {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %}
     {%- if loop.first %}
 #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %}
     {%- else %}
@@ -240,7 +250,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_encaps(uint8_t *c
 {% endfor -%}
 #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/
 {%- endif %}
-    {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %}
+    {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %}
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc)
+    return (OQS_STATUS) {{ impl['signature_enc'] }}(ciphertext, shared_secret, public_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */
+    {%- endfor %}
+    {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %}
     {%- if loop.first %}
 #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %}
     {%- else %}
@@ -314,7 +329,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_decaps(uint8_t *s
 {% endfor -%}
 #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/
 {%- endif %}
-    {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %}
+    {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %}
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc)
+    return (OQS_STATUS) {{ impl['signature_dec'] }}(shared_secret, ciphertext, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */
+    {%- endfor %}
+    {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %}
     {%- if loop.first %}
 #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %}
     {%- else %}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a6bca7d998..8f0ac14b6b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -99,6 +99,11 @@ if(${OQS_USE_OPENSSL})
     target_link_libraries(oqs-internal PRIVATE ${OPENSSL_CRYPTO_LIBRARY})
   endif()
 endif()
+if(${OQS_USE_CUPQC})
+    set_property(TARGET oqs PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+    target_link_libraries(oqs PRIVATE cupqc)
+    target_link_options(oqs PRIVATE $<DEVICE_LINK: -dlto>)
+endif()
 
 target_include_directories(oqs
                            PUBLIC
diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt
index 14cc9b850d..50d5537d1b 100644
--- a/src/kem/ml_kem/CMakeLists.txt
+++ b/src/kem/ml_kem/CMakeLists.txt
@@ -23,6 +23,14 @@ if(OQS_ENABLE_KEM_ml_kem_512_avx2)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_512_avx2>)
 endif()
 
+if(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+    add_library(ml_kem_512_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu)
+    target_include_libraries(ml_kem_512_cupqc cupqc)
+    set_property(TARGET ml_kem_512_cupqc PROPERTY CUDA_ARCHITECTURES OFF)
+    target_compile_options(ml_kem_512_cupqc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
+    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_512_cupqc>)
+endif()
+
 if(OQS_ENABLE_KEM_ml_kem_768)
     add_library(ml_kem_768_ref OBJECT kem_ml_kem_768.c pqcrystals-kyber-standard_ml-kem-768_ref/cbd.c pqcrystals-kyber-standard_ml-kem-768_ref/indcpa.c pqcrystals-kyber-standard_ml-kem-768_ref/kem.c pqcrystals-kyber-standard_ml-kem-768_ref/ntt.c pqcrystals-kyber-standard_ml-kem-768_ref/poly.c pqcrystals-kyber-standard_ml-kem-768_ref/polyvec.c pqcrystals-kyber-standard_ml-kem-768_ref/reduce.c pqcrystals-kyber-standard_ml-kem-768_ref/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-768_ref/verify.c)
     target_compile_options(ml_kem_768_ref PUBLIC -DKYBER_K=3)
@@ -41,6 +49,14 @@ if(OQS_ENABLE_KEM_ml_kem_768_avx2)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_768_avx2>)
 endif()
 
+if(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+    add_library(ml_kem_768_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu)
+    target_include_libraries(ml_kem_768_cupqc cupqc)
+    set_property(TARGET ml_kem_768_cupqc PROPERTY CUDA_ARCHITECTURES OFF)
+    target_compile_options(ml_kem_768_cupqc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
+    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_768_cupqc>)
+endif()
+
 if(OQS_ENABLE_KEM_ml_kem_1024)
     add_library(ml_kem_1024_ref OBJECT kem_ml_kem_1024.c pqcrystals-kyber-standard_ml-kem-1024_ref/cbd.c pqcrystals-kyber-standard_ml-kem-1024_ref/indcpa.c pqcrystals-kyber-standard_ml-kem-1024_ref/kem.c pqcrystals-kyber-standard_ml-kem-1024_ref/ntt.c pqcrystals-kyber-standard_ml-kem-1024_ref/poly.c pqcrystals-kyber-standard_ml-kem-1024_ref/polyvec.c pqcrystals-kyber-standard_ml-kem-1024_ref/reduce.c pqcrystals-kyber-standard_ml-kem-1024_ref/symmetric-shake.c pqcrystals-kyber-standard_ml-kem-1024_ref/verify.c)
     target_compile_options(ml_kem_1024_ref PUBLIC -DKYBER_K=4)
@@ -59,4 +75,12 @@ if(OQS_ENABLE_KEM_ml_kem_1024_avx2)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_1024_avx2>)
 endif()
 
+if(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+    add_library(ml_kem_1024_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu)
+    target_include_libraries(ml_kem_1024_cupqc cupqc)
+    set_property(TARGET ml_kem_1024_cupqc PROPERTY CUDA_ARCHITECTURES OFF)
+    target_compile_options(ml_kem_1024_cupqc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
+    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_1024_cupqc>)
+endif()
+
 set(ML_KEM_OBJS ${_ML_KEM_OBJS} PARENT_SCOPE)
diff --git a/src/kem/ml_kem/kem_ml_kem_1024.c b/src/kem/ml_kem/kem_ml_kem_1024.c
index bc533aef9e..21f8c84ca4 100644
--- a/src/kem/ml_kem/kem_ml_kem_1024.c
+++ b/src/kem/ml_kem/kem_ml_kem_1024.c
@@ -40,7 +40,18 @@ extern int pqcrystals_ml_kem_1024_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8
 extern int pqcrystals_ml_kem_1024_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
 #endif
 
+#if defined(OQS_USE_CUPQC)
+#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+extern int ml_kem_1024_cupqc_keypair(uint8_t *pk, uint8_t *sk);
+extern int ml_kem_1024_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+extern int ml_kem_1024_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+#endif
+#endif /* OQS_USE_CUPQC */
+
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *secret_key) {
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+    return (OQS_STATUS) ml_kem_1024_cupqc_keypair(public_key, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -57,6 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *sec
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+    return (OQS_STATUS) ml_kem_1024_cupqc_enc(ciphertext, shared_secret, public_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -73,6 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shar
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+    return (OQS_STATUS) ml_kem_1024_cupqc_dec(shared_secret, ciphertext, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
diff --git a/src/kem/ml_kem/kem_ml_kem_512.c b/src/kem/ml_kem/kem_ml_kem_512.c
index f2dcde53d2..e3e05be15b 100644
--- a/src/kem/ml_kem/kem_ml_kem_512.c
+++ b/src/kem/ml_kem/kem_ml_kem_512.c
@@ -40,7 +40,18 @@ extern int pqcrystals_ml_kem_512_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_
 extern int pqcrystals_ml_kem_512_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
 #endif
 
+#if defined(OQS_USE_CUPQC)
+#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+extern int ml_kem_512_cupqc_keypair(uint8_t *pk, uint8_t *sk);
+extern int ml_kem_512_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+extern int ml_kem_512_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+#endif
+#endif /* OQS_USE_CUPQC */
+
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secret_key) {
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+    return (OQS_STATUS) ml_kem_512_cupqc_keypair(public_key, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -57,6 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secr
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+    return (OQS_STATUS) ml_kem_512_cupqc_enc(ciphertext, shared_secret, public_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -73,6 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *share
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+    return (OQS_STATUS) ml_kem_512_cupqc_dec(shared_secret, ciphertext, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
diff --git a/src/kem/ml_kem/kem_ml_kem_768.c b/src/kem/ml_kem/kem_ml_kem_768.c
index 14eb6ba404..a8b0bfdd52 100644
--- a/src/kem/ml_kem/kem_ml_kem_768.c
+++ b/src/kem/ml_kem/kem_ml_kem_768.c
@@ -40,7 +40,18 @@ extern int pqcrystals_ml_kem_768_avx2_enc(uint8_t *ct, uint8_t *ss, const uint8_
 extern int pqcrystals_ml_kem_768_avx2_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
 #endif
 
+#if defined(OQS_USE_CUPQC)
+#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+extern int ml_kem_768_cupqc_keypair(uint8_t *pk, uint8_t *sk);
+extern int ml_kem_768_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+extern int ml_kem_768_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+#endif
+#endif /* OQS_USE_CUPQC */
+
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secret_key) {
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+    return (OQS_STATUS) ml_kem_768_cupqc_keypair(public_key, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -57,6 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secr
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+    return (OQS_STATUS) ml_kem_768_cupqc_enc(ciphertext, shared_secret, public_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -73,6 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *share
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+    return (OQS_STATUS) ml_kem_768_cupqc_dec(shared_secret, ciphertext, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu
new file mode 100644
index 0000000000..2935c2cd6a
--- /dev/null
+++ b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2025 Nvidia Corporation
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+**/
+
+#include <cupqc.hpp>
+#include <stdexcept>
+#include <oqs/oqsconfig.h>
+
+using namespace cupqc;
+
+// Checks the return value from a CUDA API function
+#define CUDA_CHECK(err) \
+    if (err != cudaSuccess) { failure = true; goto cleanup; }
+
+template<class MLKEM_Keygen>
+__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) {
+    __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size];
+    MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int keypair(uint8_t *pk, uint8_t *sk) {
+    using MLKEM_Keygen = decltype(MLKEM_Base() + Function<function::Keygen>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr, *randombytes=nullptr;
+    uint8_t *d_pk = nullptr, *d_sk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace   = make_workspace<MLKEM_Keygen>(1);
+        randombytes = get_entropy<MLKEM_Keygen>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size));
+
+    // Run routine
+    keygen_kernel<MLKEM_Keygen><<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_pk != nullptr) cudaFree(d_pk);
+    if (d_sk != nullptr) cudaFree(d_sk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+    if (randombytes != nullptr) release_entropy(randombytes);
+
+    return failure ? -1 : 0;
+}
+
+template<class MLKEM_Encaps>
+__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) {
+    __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size];
+    MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+    using MLKEM_Encaps = decltype(MLKEM_Base() + Function<function::Encaps>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr, *randombytes=nullptr;
+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace   = make_workspace<MLKEM_Encaps>(1);
+        randombytes = get_entropy<MLKEM_Encaps>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size));
+
+    // Copy data to GPU
+    CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault));
+
+    // Run routine
+    encaps_kernel<MLKEM_Encaps><<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_ct != nullptr) cudaFree(d_ct);
+    if (d_ss != nullptr) cudaFree(d_ss);
+    if (d_pk != nullptr) cudaFree(d_pk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+    if (randombytes != nullptr) release_entropy(randombytes);
+
+    return failure ? -1 : 0;
+}
+
+template<class MLKEM_Decaps>
+__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) {
+    __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size];
+    MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+    using MLKEM_Decaps = decltype(MLKEM_Base() + Function<function::Decaps>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr;
+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace = make_workspace<MLKEM_Decaps>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size));
+
+    // Copy data to GPU
+    CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault));
+
+    // Run routine
+    decaps_kernel<MLKEM_Decaps><<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_ct != nullptr) cudaFree(d_ct);
+    if (d_ss != nullptr) cudaFree(d_ss);
+    if (d_sk != nullptr) cudaFree(d_sk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+
+    return failure ? -1 : 0;
+}
+
+extern "C" {
+    using KEM_512  = decltype(ML_KEM_512()  + Block());
+    using KEM_768  = decltype(ML_KEM_768()  + Block());
+    using KEM_1024 = decltype(ML_KEM_1024() + Block());
+
+#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_512>(pk, sk);
+    }
+    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_512>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_512>(ss, ct, sk);
+    }
+#endif
+
+#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_768>(pk, sk);
+    }
+    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_768>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_768>(ss, ct, sk);
+    }
+#endif
+
+#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_1024>(pk, sk);
+    }
+    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_1024>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_1024>(ss, ct, sk);
+    }
+#endif
+}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu
new file mode 100644
index 0000000000..2935c2cd6a
--- /dev/null
+++ b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2025 Nvidia Corporation
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+**/
+
+#include <cupqc.hpp>
+#include <stdexcept>
+#include <oqs/oqsconfig.h>
+
+using namespace cupqc;
+
+// Checks the return value from a CUDA API function
+#define CUDA_CHECK(err) \
+    if (err != cudaSuccess) { failure = true; goto cleanup; }
+
+template<class MLKEM_Keygen>
+__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) {
+    __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size];
+    MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int keypair(uint8_t *pk, uint8_t *sk) {
+    using MLKEM_Keygen = decltype(MLKEM_Base() + Function<function::Keygen>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr, *randombytes=nullptr;
+    uint8_t *d_pk = nullptr, *d_sk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace   = make_workspace<MLKEM_Keygen>(1);
+        randombytes = get_entropy<MLKEM_Keygen>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size));
+
+    // Run routine
+    keygen_kernel<MLKEM_Keygen><<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_pk != nullptr) cudaFree(d_pk);
+    if (d_sk != nullptr) cudaFree(d_sk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+    if (randombytes != nullptr) release_entropy(randombytes);
+
+    return failure ? -1 : 0;
+}
+
+template<class MLKEM_Encaps>
+__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) {
+    __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size];
+    MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+    using MLKEM_Encaps = decltype(MLKEM_Base() + Function<function::Encaps>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr, *randombytes=nullptr;
+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace   = make_workspace<MLKEM_Encaps>(1);
+        randombytes = get_entropy<MLKEM_Encaps>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size));
+
+    // Copy data to GPU
+    CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault));
+
+    // Run routine
+    encaps_kernel<MLKEM_Encaps><<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_ct != nullptr) cudaFree(d_ct);
+    if (d_ss != nullptr) cudaFree(d_ss);
+    if (d_pk != nullptr) cudaFree(d_pk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+    if (randombytes != nullptr) release_entropy(randombytes);
+
+    return failure ? -1 : 0;
+}
+
+template<class MLKEM_Decaps>
+__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) {
+    __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size];
+    MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+    using MLKEM_Decaps = decltype(MLKEM_Base() + Function<function::Decaps>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr;
+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace = make_workspace<MLKEM_Decaps>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size));
+
+    // Copy data to GPU
+    CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault));
+
+    // Run routine
+    decaps_kernel<MLKEM_Decaps><<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_ct != nullptr) cudaFree(d_ct);
+    if (d_ss != nullptr) cudaFree(d_ss);
+    if (d_sk != nullptr) cudaFree(d_sk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+
+    return failure ? -1 : 0;
+}
+
+extern "C" {
+    using KEM_512  = decltype(ML_KEM_512()  + Block());
+    using KEM_768  = decltype(ML_KEM_768()  + Block());
+    using KEM_1024 = decltype(ML_KEM_1024() + Block());
+
+#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_512>(pk, sk);
+    }
+    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_512>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_512>(ss, ct, sk);
+    }
+#endif
+
+#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_768>(pk, sk);
+    }
+    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_768>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_768>(ss, ct, sk);
+    }
+#endif
+
+#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_1024>(pk, sk);
+    }
+    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_1024>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_1024>(ss, ct, sk);
+    }
+#endif
+}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu
new file mode 100644
index 0000000000..2935c2cd6a
--- /dev/null
+++ b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2025 Nvidia Corporation
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+**/
+
+#include <cupqc.hpp>
+#include <stdexcept>
+#include <oqs/oqsconfig.h>
+
+using namespace cupqc;
+
+// Checks the return value from a CUDA API function
+#define CUDA_CHECK(err) \
+    if (err != cudaSuccess) { failure = true; goto cleanup; }
+
+template<class MLKEM_Keygen>
+__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) {
+    __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size];
+    MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int keypair(uint8_t *pk, uint8_t *sk) {
+    using MLKEM_Keygen = decltype(MLKEM_Base() + Function<function::Keygen>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr, *randombytes=nullptr;
+    uint8_t *d_pk = nullptr, *d_sk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace   = make_workspace<MLKEM_Keygen>(1);
+        randombytes = get_entropy<MLKEM_Keygen>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size));
+
+    // Run routine
+    keygen_kernel<MLKEM_Keygen><<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_pk != nullptr) cudaFree(d_pk);
+    if (d_sk != nullptr) cudaFree(d_sk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+    if (randombytes != nullptr) release_entropy(randombytes);
+
+    return failure ? -1 : 0;
+}
+
+template<class MLKEM_Encaps>
+__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) {
+    __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size];
+    MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+    using MLKEM_Encaps = decltype(MLKEM_Base() + Function<function::Encaps>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr, *randombytes=nullptr;
+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace   = make_workspace<MLKEM_Encaps>(1);
+        randombytes = get_entropy<MLKEM_Encaps>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size));
+
+    // Copy data to GPU
+    CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault));
+
+    // Run routine
+    encaps_kernel<MLKEM_Encaps><<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_ct != nullptr) cudaFree(d_ct);
+    if (d_ss != nullptr) cudaFree(d_ss);
+    if (d_pk != nullptr) cudaFree(d_pk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+    if (randombytes != nullptr) release_entropy(randombytes);
+
+    return failure ? -1 : 0;
+}
+
+template<class MLKEM_Decaps>
+__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) {
+    __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size];
+    MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+    using MLKEM_Decaps = decltype(MLKEM_Base() + Function<function::Decaps>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr;
+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace = make_workspace<MLKEM_Decaps>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size));
+
+    // Copy data to GPU
+    CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault));
+
+    // Run routine
+    decaps_kernel<MLKEM_Decaps><<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_ct != nullptr) cudaFree(d_ct);
+    if (d_ss != nullptr) cudaFree(d_ss);
+    if (d_sk != nullptr) cudaFree(d_sk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+
+    return failure ? -1 : 0;
+}
+
+extern "C" {
+    using KEM_512  = decltype(ML_KEM_512()  + Block());
+    using KEM_768  = decltype(ML_KEM_768()  + Block());
+    using KEM_1024 = decltype(ML_KEM_1024() + Block());
+
+#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_512>(pk, sk);
+    }
+    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_512>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_512>(ss, ct, sk);
+    }
+#endif
+
+#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_768>(pk, sk);
+    }
+    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_768>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_768>(ss, ct, sk);
+    }
+#endif
+
+#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_1024>(pk, sk);
+    }
+    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_1024>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_1024>(ss, ct, sk);
+    }
+#endif
+}
diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake
index 967c35e64e..875584f74a 100644
--- a/src/oqsconfig.h.cmake
+++ b/src/oqsconfig.h.cmake
@@ -69,6 +69,8 @@
 
 #cmakedefine OQS_ENABLE_SHA3_xkcp_low_avx2 1
 
+#cmakedefine01 OQS_USE_CUPQC
+
 #cmakedefine OQS_ENABLE_KEM_BIKE 1
 #cmakedefine OQS_ENABLE_KEM_bike_l1 1
 #cmakedefine OQS_ENABLE_KEM_bike_l3 1
@@ -129,10 +131,13 @@
 #cmakedefine OQS_ENABLE_KEM_ML_KEM 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_512 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_512_avx2 1
+#cmakedefine OQS_ENABLE_KEM_ml_kem_512_cupqc 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_768 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_768_avx2 1
+#cmakedefine OQS_ENABLE_KEM_ml_kem_768_cupqc 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024_avx2 1
+#cmakedefine OQS_ENABLE_KEM_ml_kem_1024_cupqc 1
 
 #cmakedefine OQS_ENABLE_SIG_DILITHIUM 1
 #cmakedefine OQS_ENABLE_SIG_dilithium_2 1

From 2cb806739053d3de363a1ea7cda5e8b55d1f28c6 Mon Sep 17 00:00:00 2001
From: Steven Reeves <sreeves@nvidia.com>
Date: Wed, 15 Jan 2025 09:04:49 -0800
Subject: [PATCH 02/17] Fixing transposition error that left out OQS_USE_CUPQC
 in CMake system.

Signed-off-by: Steven Reeves <sreeves@nvidia.com>
---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 09667e71ae..1b4c2b1af6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,7 @@ option(OQS_LIBJADE_BUILD "Enable formally verified implementation of supported a
 option(OQS_PERMIT_UNSUPPORTED_ARCHITECTURE "Permit compilation on an an unsupported architecture." OFF)
 option(OQS_STRICT_WARNINGS "Enable all compiler warnings." OFF)
 option(OQS_EMBEDDED_BUILD "Compile liboqs for an Embedded environment without a full standard library." OFF)
+option(OQS_USE_CUPQC "Utilize cuPQC as the backend for supported PQC algorithms." OFF)
 
 # Libfuzzer isn't supported on gcc
 if('${CMAKE_C_COMPILER_ID}' STREQUAL 'Clang')

From 8c076753658109f79b19df908a7bbfa11f5543dd Mon Sep 17 00:00:00 2001
From: Steven Reeves <sreeves@nvidia.com>
Date: Wed, 15 Jan 2025 11:56:24 -0800
Subject: [PATCH 03/17] Add CMake dependent options for cupqc. Fixed formatting
 in kem_ml_kem_####.c and kem/family/kem_scheme.c

Signed-off-by: Steven Reeves <sreeves@nvidia.com>
---
 .CMake/alg_support.cmake                       | 18 ++++++++++++++++++
 .../src/kem/family/kem_scheme.c                |  6 +++---
 src/kem/ml_kem/kem_ml_kem_1024.c               |  6 +++---
 src/kem/ml_kem/kem_ml_kem_512.c                |  6 +++---
 src/kem/ml_kem/kem_ml_kem_768.c                |  6 +++---
 5 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake
index 9afa6e4b15..de22dbceb9 100644
--- a/.CMake/alg_support.cmake
+++ b/.CMake/alg_support.cmake
@@ -338,18 +338,36 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCT
 endif()
 endif()
 
+if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
+if(OQS_USE_CUPQC)
+    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_512_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_512" OFF)
+endif()
+endif()
+
 if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
 if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS))
     cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_avx2 "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF)
 endif()
 endif()
 
+if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
+if(OQS_USE_CUPQC)
+    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF)
+endif()
+endif()
+
 if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
 if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_BMI2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS))
     cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_avx2 "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF)
 endif()
 endif()
 
+if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
+if(OQS_USE_CUPQC)
+    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF)
+endif()
+endif()
+
 
 if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
 if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS AND OQS_USE_POPCNT_INSTRUCTIONS))
diff --git a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c
index cf8a705031..95eafa703e 100644
--- a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c
+++ b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c
@@ -173,7 +173,7 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_keypair(uint8_t *
 {%- endif %}
     {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %}
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc)
-    return (OQS_STATUS) {{ impl['signature_keypair'] }}(public_key, secret_key);
+	return (OQS_STATUS) {{ impl['signature_keypair'] }}(public_key, secret_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */
     {%- endfor %}
     {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %}
@@ -252,7 +252,7 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_encaps(uint8_t *c
 {%- endif %}
     {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %}
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc)
-    return (OQS_STATUS) {{ impl['signature_enc'] }}(ciphertext, shared_secret, public_key);
+	return (OQS_STATUS) {{ impl['signature_enc'] }}(ciphertext, shared_secret, public_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */
     {%- endfor %}
     {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %}
@@ -331,7 +331,7 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_decaps(uint8_t *s
 {%- endif %}
     {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %}
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc)
-    return (OQS_STATUS) {{ impl['signature_dec'] }}(shared_secret, ciphertext, secret_key);
+	return (OQS_STATUS) {{ impl['signature_dec'] }}(shared_secret, ciphertext, secret_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */
     {%- endfor %}
     {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %}
diff --git a/src/kem/ml_kem/kem_ml_kem_1024.c b/src/kem/ml_kem/kem_ml_kem_1024.c
index 21f8c84ca4..2d4d46df8b 100644
--- a/src/kem/ml_kem/kem_ml_kem_1024.c
+++ b/src/kem/ml_kem/kem_ml_kem_1024.c
@@ -50,7 +50,7 @@ extern int ml_kem_1024_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *secret_key) {
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-    return (OQS_STATUS) ml_kem_1024_cupqc_keypair(public_key, secret_key);
+	return (OQS_STATUS) ml_kem_1024_cupqc_keypair(public_key, secret_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
 #if defined(OQS_DIST_BUILD)
@@ -69,7 +69,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *sec
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-    return (OQS_STATUS) ml_kem_1024_cupqc_enc(ciphertext, shared_secret, public_key);
+	return (OQS_STATUS) ml_kem_1024_cupqc_enc(ciphertext, shared_secret, public_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
 #if defined(OQS_DIST_BUILD)
@@ -88,7 +88,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shar
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-    return (OQS_STATUS) ml_kem_1024_cupqc_dec(shared_secret, ciphertext, secret_key);
+	return (OQS_STATUS) ml_kem_1024_cupqc_dec(shared_secret, ciphertext, secret_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
 #if defined(OQS_DIST_BUILD)
diff --git a/src/kem/ml_kem/kem_ml_kem_512.c b/src/kem/ml_kem/kem_ml_kem_512.c
index e3e05be15b..5e60018b37 100644
--- a/src/kem/ml_kem/kem_ml_kem_512.c
+++ b/src/kem/ml_kem/kem_ml_kem_512.c
@@ -50,7 +50,7 @@ extern int ml_kem_512_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *s
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secret_key) {
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-    return (OQS_STATUS) ml_kem_512_cupqc_keypair(public_key, secret_key);
+	return (OQS_STATUS) ml_kem_512_cupqc_keypair(public_key, secret_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
 #if defined(OQS_DIST_BUILD)
@@ -69,7 +69,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secr
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-    return (OQS_STATUS) ml_kem_512_cupqc_enc(ciphertext, shared_secret, public_key);
+	return (OQS_STATUS) ml_kem_512_cupqc_enc(ciphertext, shared_secret, public_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
 #if defined(OQS_DIST_BUILD)
@@ -88,7 +88,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *share
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-    return (OQS_STATUS) ml_kem_512_cupqc_dec(shared_secret, ciphertext, secret_key);
+	return (OQS_STATUS) ml_kem_512_cupqc_dec(shared_secret, ciphertext, secret_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
 #if defined(OQS_DIST_BUILD)
diff --git a/src/kem/ml_kem/kem_ml_kem_768.c b/src/kem/ml_kem/kem_ml_kem_768.c
index a8b0bfdd52..ceb651cedb 100644
--- a/src/kem/ml_kem/kem_ml_kem_768.c
+++ b/src/kem/ml_kem/kem_ml_kem_768.c
@@ -50,7 +50,7 @@ extern int ml_kem_768_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *s
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secret_key) {
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-    return (OQS_STATUS) ml_kem_768_cupqc_keypair(public_key, secret_key);
+	return (OQS_STATUS) ml_kem_768_cupqc_keypair(public_key, secret_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
 #if defined(OQS_DIST_BUILD)
@@ -69,7 +69,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secr
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-    return (OQS_STATUS) ml_kem_768_cupqc_enc(ciphertext, shared_secret, public_key);
+	return (OQS_STATUS) ml_kem_768_cupqc_enc(ciphertext, shared_secret, public_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
 #if defined(OQS_DIST_BUILD)
@@ -88,7 +88,7 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *share
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
 #if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-    return (OQS_STATUS) ml_kem_768_cupqc_dec(shared_secret, ciphertext, secret_key);
+	return (OQS_STATUS) ml_kem_768_cupqc_dec(shared_secret, ciphertext, secret_key);
 #endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */
 #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
 #if defined(OQS_DIST_BUILD)

From 7562c855141f216519313f8973b6f0a590b7a16e Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Wed, 15 Jan 2025 23:42:54 +0100
Subject: [PATCH 04/17] Move cupqc_ml-kem source files to correctly named dir

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 .../copy_from_upstream/copy_from_upstream.py  |  17 +-
 .../patches/pqcrystals-ml_kem.patch           | 216 +-----------------
 .../src/kem/family/CMakeLists.txt             |   2 +-
 src/kem/ml_kem/CMakeLists.txt                 |   6 +-
 .../cupqc_ml-kem.cu                           |   0
 .../cupqc_ml-kem.cu                           | 198 ----------------
 .../cupqc_ml-kem.cu                           | 198 ----------------
 7 files changed, 19 insertions(+), 618 deletions(-)
 rename src/kem/ml_kem/{pqcrystals-kyber-standard_ml-kem-1024_cupqc => cupqc_ml-kem}/cupqc_ml-kem.cu (100%)
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu
 delete mode 100644 src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu

diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py
index 400ecc57a0..ccab976b7f 100755
--- a/scripts/copy_from_upstream/copy_from_upstream.py
+++ b/scripts/copy_from_upstream/copy_from_upstream.py
@@ -495,14 +495,15 @@ def handle_implementation(impl, family, scheme, dst_basedir):
         else:
             # determine list of files to copy:
             if 'sources' in i:
-                srcs = i['sources'].split(" ")
-                for s in srcs:
-                    # Copy recursively only in case of directories not with plain files to avoid copying over symbolic links
-                    if os.path.isfile(os.path.join(origfolder, s)):
-                        subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
-                    else:
-                        subprocess.run(
-                            ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
+                if i['sources']:
+                    srcs = i['sources'].split(" ")
+                    for s in srcs:
+                        # Copy recursively only in case of directories not with plain files to avoid copying over symbolic links
+                        if os.path.isfile(os.path.join(origfolder, s)):
+                            subprocess.run(['cp', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
+                        else:
+                            subprocess.run(
+                                ['cp', '-r', os.path.join(origfolder, s), os.path.join(srcfolder, os.path.basename(s))])
             else:
                 subprocess.run(['cp', '-pr', os.path.join(origfolder, '.'), srcfolder])
                 # raise Exception("Malformed YML file: No sources listed to copy. Check upstream YML file." )
diff --git a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch
index a44b1c232e..2e7068cd83 100644
--- a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch
+++ b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch
@@ -2,7 +2,7 @@ diff --git a/Kyber1024_META.yml b/ML-KEM-1024_META.yml
 similarity index 50%
 rename from Kyber1024_META.yml
 rename to ML-KEM-1024_META.yml
-index baa5ca3..42f147e 100644
+index baa5ca3..98e0e5c 100644
 --- a/Kyber1024_META.yml
 +++ b/ML-KEM-1024_META.yml
 @@ -1,4 +1,4 @@
@@ -65,7 +65,7 @@ index baa5ca3..42f147e 100644
 +    signature_keypair: ml_kem_1024_cupqc_keypair
 +    signature_enc: ml_kem_1024_cupqc_enc
 +    signature_dec: ml_kem_1024_cupqc_dec
-+    sources: cupqc_ml-kem.cu
++    sources:
 +    supported_platforms:
 +      - architecture: CUDA
 +        operating_systems:
@@ -77,7 +77,7 @@ diff --git a/Kyber512_META.yml b/ML-KEM-512_META.yml
 similarity index 50%
 rename from Kyber512_META.yml
 rename to ML-KEM-512_META.yml
-index b251701..a74f7ec 100644
+index b251701..0df6843 100644
 --- a/Kyber512_META.yml
 +++ b/ML-KEM-512_META.yml
 @@ -1,4 +1,4 @@
@@ -140,7 +140,7 @@ index b251701..a74f7ec 100644
 +    signature_keypair: ml_kem_512_cupqc_keypair
 +    signature_enc: ml_kem_512_cupqc_enc
 +    signature_dec: ml_kem_512_cupqc_dec
-+    sources: cupqc_ml-kem.cu
++    sources:
 +    supported_platforms:
 +      - architecture: CUDA
 +        operating_systems:
@@ -153,7 +153,7 @@ diff --git a/Kyber768_META.yml b/ML-KEM-768_META.yml
 similarity index 50%
 rename from Kyber768_META.yml
 rename to ML-KEM-768_META.yml
-index 7a0cc3d..2c0e23f 100644
+index 7a0cc3d..f5f45f7 100644
 --- a/Kyber768_META.yml
 +++ b/ML-KEM-768_META.yml
 @@ -1,4 +1,4 @@
@@ -216,7 +216,7 @@ index 7a0cc3d..2c0e23f 100644
 +    signature_keypair: ml_kem_768_cupqc_keypair
 +    signature_enc: ml_kem_768_cupqc_enc
 +    signature_dec: ml_kem_768_cupqc_dec
-+    sources: cupqc_ml-kem.cu
++    sources:
 +    supported_platforms:
 +      - architecture: CUDA
 +        operating_systems:
@@ -411,210 +411,6 @@ index 627b891..e4941f7 100644
                             const uint8_t seed[KYBER_SYMBYTES],
                             uint8_t x,
                             uint8_t y);
-diff --git a/cupqc/cupqc_ml-kem.cu b/cupqc/cupqc_ml-kem.cu
-new file mode 100644
-index 0000000..2935c2c
---- /dev/null
-+++ b/cupqc/cupqc_ml-kem.cu
-@@ -0,0 +1,198 @@
-+/*
-+ * Copyright 2025 Nvidia Corporation
-+ * Licensed under the Apache License, Version 2.0 (the "License");
-+ * you may not use this file except in compliance with the License.
-+ * You may obtain a copy of the License at
-+ *     http://www.apache.org/licenses/LICENSE-2.0
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+ * See the License for the specific language governing permissions and
-+ * limitations under the License.
-+**/
-+
-+#include <cupqc.hpp>
-+#include <stdexcept>
-+#include <oqs/oqsconfig.h>
-+
-+using namespace cupqc;
-+
-+// Checks the return value from a CUDA API function
-+#define CUDA_CHECK(err) \
-+    if (err != cudaSuccess) { failure = true; goto cleanup; }
-+
-+template<class MLKEM_Keygen>
-+__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) {
-+    __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size];
-+    MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr);
-+}
-+
-+template<class MLKEM_Base>
-+int keypair(uint8_t *pk, uint8_t *sk) {
-+    using MLKEM_Keygen = decltype(MLKEM_Base() + Function<function::Keygen>());
-+
-+    bool failure = false;
-+    uint8_t *workspace = nullptr, *randombytes=nullptr;
-+    uint8_t *d_pk = nullptr, *d_sk = nullptr;
-+
-+    // Allocate device workspaces
-+    try {
-+        workspace   = make_workspace<MLKEM_Keygen>(1);
-+        randombytes = get_entropy<MLKEM_Keygen>(1);
-+    } catch (const std::runtime_error& ex) {
-+        failure = true;
-+        goto cleanup;
-+    }
-+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size));
-+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size));
-+
-+    // Run routine
-+    keygen_kernel<MLKEM_Keygen><<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes);
-+
-+    // Copy data back to the host
-+    CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault));
-+    CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault));
-+
-+cleanup:
-+    // Free device memory
-+    if (d_pk != nullptr) cudaFree(d_pk);
-+    if (d_sk != nullptr) cudaFree(d_sk);
-+    if (workspace != nullptr) destroy_workspace(workspace);
-+    if (randombytes != nullptr) release_entropy(randombytes);
-+
-+    return failure ? -1 : 0;
-+}
-+
-+template<class MLKEM_Encaps>
-+__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) {
-+    __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size];
-+    MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr);
-+}
-+
-+template<class MLKEM_Base>
-+int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-+    using MLKEM_Encaps = decltype(MLKEM_Base() + Function<function::Encaps>());
-+
-+    bool failure = false;
-+    uint8_t *workspace = nullptr, *randombytes=nullptr;
-+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr;
-+
-+    // Allocate device workspaces
-+    try {
-+        workspace   = make_workspace<MLKEM_Encaps>(1);
-+        randombytes = get_entropy<MLKEM_Encaps>(1);
-+    } catch (const std::runtime_error& ex) {
-+        failure = true;
-+        goto cleanup;
-+    }
-+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size));
-+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size));
-+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size));
-+
-+    // Copy data to GPU
-+    CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault));
-+
-+    // Run routine
-+    encaps_kernel<MLKEM_Encaps><<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes);
-+
-+    // Copy data back to the host
-+    CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault));
-+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault));
-+
-+cleanup:
-+    // Free device memory
-+    if (d_ct != nullptr) cudaFree(d_ct);
-+    if (d_ss != nullptr) cudaFree(d_ss);
-+    if (d_pk != nullptr) cudaFree(d_pk);
-+    if (workspace != nullptr) destroy_workspace(workspace);
-+    if (randombytes != nullptr) release_entropy(randombytes);
-+
-+    return failure ? -1 : 0;
-+}
-+
-+template<class MLKEM_Decaps>
-+__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) {
-+    __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size];
-+    MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr);
-+}
-+
-+template<class MLKEM_Base>
-+int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-+    using MLKEM_Decaps = decltype(MLKEM_Base() + Function<function::Decaps>());
-+
-+    bool failure = false;
-+    uint8_t *workspace = nullptr;
-+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr;
-+
-+    // Allocate device workspaces
-+    try {
-+        workspace = make_workspace<MLKEM_Decaps>(1);
-+    } catch (const std::runtime_error& ex) {
-+        failure = true;
-+        goto cleanup;
-+    }
-+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size));
-+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size));
-+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size));
-+
-+    // Copy data to GPU
-+    CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault));
-+    CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault));
-+
-+    // Run routine
-+    decaps_kernel<MLKEM_Decaps><<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace);
-+
-+    // Copy data back to the host
-+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault));
-+
-+cleanup:
-+    // Free device memory
-+    if (d_ct != nullptr) cudaFree(d_ct);
-+    if (d_ss != nullptr) cudaFree(d_ss);
-+    if (d_sk != nullptr) cudaFree(d_sk);
-+    if (workspace != nullptr) destroy_workspace(workspace);
-+
-+    return failure ? -1 : 0;
-+}
-+
-+extern "C" {
-+    using KEM_512  = decltype(ML_KEM_512()  + Block());
-+    using KEM_768  = decltype(ML_KEM_768()  + Block());
-+    using KEM_1024 = decltype(ML_KEM_1024() + Block());
-+
-+#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-+    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
-+        return keypair<KEM_512>(pk, sk);
-+    }
-+    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-+        return encaps<KEM_512>(ct, ss, pk);
-+    }
-+    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-+        return decaps<KEM_512>(ss, ct, sk);
-+    }
-+#endif
-+
-+#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-+    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
-+        return keypair<KEM_768>(pk, sk);
-+    }
-+    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-+        return encaps<KEM_768>(ct, ss, pk);
-+    }
-+    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-+        return decaps<KEM_768>(ss, ct, sk);
-+    }
-+#endif
-+
-+#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-+    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
-+        return keypair<KEM_1024>(pk, sk);
-+    }
-+    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-+        return encaps<KEM_1024>(ct, ss, pk);
-+    }
-+    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-+        return decaps<KEM_1024>(ss, ct, sk);
-+    }
-+#endif
-+}
 diff --git a/ref/indcpa.c b/ref/indcpa.c
 index 9a78c09..726cfa9 100644
 --- a/ref/indcpa.c
diff --git a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
index 100ddaab7e..72c6c71d2b 100644
--- a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
+++ b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
@@ -36,7 +36,7 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}{%- if 'alias_scheme' in
         {%- elif impl['name'] == 'cupqc' %}
 
 if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %})
-    add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/cupqc_ml-kem.cu)
+    add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT cupqc_{{ family.replace('_', '-') }}/cupqc_{{ family.replace('_', '-') }}.cu)
     target_include_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc)
     set_property(TARGET {{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PROPERTY CUDA_ARCHITECTURES OFF)
     target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE {{ impl['compile_opts'] }})
diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt
index 50d5537d1b..9a6a935ac3 100644
--- a/src/kem/ml_kem/CMakeLists.txt
+++ b/src/kem/ml_kem/CMakeLists.txt
@@ -24,7 +24,7 @@ if(OQS_ENABLE_KEM_ml_kem_512_avx2)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-    add_library(ml_kem_512_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu)
+    add_library(ml_kem_512_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu)
     target_include_libraries(ml_kem_512_cupqc cupqc)
     set_property(TARGET ml_kem_512_cupqc PROPERTY CUDA_ARCHITECTURES OFF)
     target_compile_options(ml_kem_512_cupqc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
@@ -50,7 +50,7 @@ if(OQS_ENABLE_KEM_ml_kem_768_avx2)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-    add_library(ml_kem_768_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu)
+    add_library(ml_kem_768_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu)
     target_include_libraries(ml_kem_768_cupqc cupqc)
     set_property(TARGET ml_kem_768_cupqc PROPERTY CUDA_ARCHITECTURES OFF)
     target_compile_options(ml_kem_768_cupqc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
@@ -76,7 +76,7 @@ if(OQS_ENABLE_KEM_ml_kem_1024_avx2)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-    add_library(ml_kem_1024_cupqc OBJECT pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu)
+    add_library(ml_kem_1024_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu)
     target_include_libraries(ml_kem_1024_cupqc cupqc)
     set_property(TARGET ml_kem_1024_cupqc PROPERTY CUDA_ARCHITECTURES OFF)
     target_compile_options(ml_kem_1024_cupqc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem/cupqc_ml-kem.cu
similarity index 100%
rename from src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-1024_cupqc/cupqc_ml-kem.cu
rename to src/kem/ml_kem/cupqc_ml-kem/cupqc_ml-kem.cu
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu
deleted file mode 100644
index 2935c2cd6a..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-512_cupqc/cupqc_ml-kem.cu
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright 2025 Nvidia Corporation
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *     http://www.apache.org/licenses/LICENSE-2.0
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-**/
-
-#include <cupqc.hpp>
-#include <stdexcept>
-#include <oqs/oqsconfig.h>
-
-using namespace cupqc;
-
-// Checks the return value from a CUDA API function
-#define CUDA_CHECK(err) \
-    if (err != cudaSuccess) { failure = true; goto cleanup; }
-
-template<class MLKEM_Keygen>
-__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) {
-    __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size];
-    MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr);
-}
-
-template<class MLKEM_Base>
-int keypair(uint8_t *pk, uint8_t *sk) {
-    using MLKEM_Keygen = decltype(MLKEM_Base() + Function<function::Keygen>());
-
-    bool failure = false;
-    uint8_t *workspace = nullptr, *randombytes=nullptr;
-    uint8_t *d_pk = nullptr, *d_sk = nullptr;
-
-    // Allocate device workspaces
-    try {
-        workspace   = make_workspace<MLKEM_Keygen>(1);
-        randombytes = get_entropy<MLKEM_Keygen>(1);
-    } catch (const std::runtime_error& ex) {
-        failure = true;
-        goto cleanup;
-    }
-    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size));
-    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size));
-
-    // Run routine
-    keygen_kernel<MLKEM_Keygen><<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes);
-
-    // Copy data back to the host
-    CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault));
-    CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault));
-
-cleanup:
-    // Free device memory
-    if (d_pk != nullptr) cudaFree(d_pk);
-    if (d_sk != nullptr) cudaFree(d_sk);
-    if (workspace != nullptr) destroy_workspace(workspace);
-    if (randombytes != nullptr) release_entropy(randombytes);
-
-    return failure ? -1 : 0;
-}
-
-template<class MLKEM_Encaps>
-__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) {
-    __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size];
-    MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr);
-}
-
-template<class MLKEM_Base>
-int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-    using MLKEM_Encaps = decltype(MLKEM_Base() + Function<function::Encaps>());
-
-    bool failure = false;
-    uint8_t *workspace = nullptr, *randombytes=nullptr;
-    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr;
-
-    // Allocate device workspaces
-    try {
-        workspace   = make_workspace<MLKEM_Encaps>(1);
-        randombytes = get_entropy<MLKEM_Encaps>(1);
-    } catch (const std::runtime_error& ex) {
-        failure = true;
-        goto cleanup;
-    }
-    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size));
-    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size));
-    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size));
-
-    // Copy data to GPU
-    CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault));
-
-    // Run routine
-    encaps_kernel<MLKEM_Encaps><<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes);
-
-    // Copy data back to the host
-    CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault));
-    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault));
-
-cleanup:
-    // Free device memory
-    if (d_ct != nullptr) cudaFree(d_ct);
-    if (d_ss != nullptr) cudaFree(d_ss);
-    if (d_pk != nullptr) cudaFree(d_pk);
-    if (workspace != nullptr) destroy_workspace(workspace);
-    if (randombytes != nullptr) release_entropy(randombytes);
-
-    return failure ? -1 : 0;
-}
-
-template<class MLKEM_Decaps>
-__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) {
-    __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size];
-    MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr);
-}
-
-template<class MLKEM_Base>
-int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-    using MLKEM_Decaps = decltype(MLKEM_Base() + Function<function::Decaps>());
-
-    bool failure = false;
-    uint8_t *workspace = nullptr;
-    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr;
-
-    // Allocate device workspaces
-    try {
-        workspace = make_workspace<MLKEM_Decaps>(1);
-    } catch (const std::runtime_error& ex) {
-        failure = true;
-        goto cleanup;
-    }
-    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size));
-    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size));
-    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size));
-
-    // Copy data to GPU
-    CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault));
-    CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault));
-
-    // Run routine
-    decaps_kernel<MLKEM_Decaps><<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace);
-
-    // Copy data back to the host
-    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault));
-
-cleanup:
-    // Free device memory
-    if (d_ct != nullptr) cudaFree(d_ct);
-    if (d_ss != nullptr) cudaFree(d_ss);
-    if (d_sk != nullptr) cudaFree(d_sk);
-    if (workspace != nullptr) destroy_workspace(workspace);
-
-    return failure ? -1 : 0;
-}
-
-extern "C" {
-    using KEM_512  = decltype(ML_KEM_512()  + Block());
-    using KEM_768  = decltype(ML_KEM_768()  + Block());
-    using KEM_1024 = decltype(ML_KEM_1024() + Block());
-
-#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_512>(pk, sk);
-    }
-    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_512>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_512>(ss, ct, sk);
-    }
-#endif
-
-#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_768>(pk, sk);
-    }
-    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_768>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_768>(ss, ct, sk);
-    }
-#endif
-
-#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_1024>(pk, sk);
-    }
-    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_1024>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_1024>(ss, ct, sk);
-    }
-#endif
-}
diff --git a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu b/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu
deleted file mode 100644
index 2935c2cd6a..0000000000
--- a/src/kem/ml_kem/pqcrystals-kyber-standard_ml-kem-768_cupqc/cupqc_ml-kem.cu
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright 2025 Nvidia Corporation
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *     http://www.apache.org/licenses/LICENSE-2.0
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
-**/
-
-#include <cupqc.hpp>
-#include <stdexcept>
-#include <oqs/oqsconfig.h>
-
-using namespace cupqc;
-
-// Checks the return value from a CUDA API function
-#define CUDA_CHECK(err) \
-    if (err != cudaSuccess) { failure = true; goto cleanup; }
-
-template<class MLKEM_Keygen>
-__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) {
-    __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size];
-    MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr);
-}
-
-template<class MLKEM_Base>
-int keypair(uint8_t *pk, uint8_t *sk) {
-    using MLKEM_Keygen = decltype(MLKEM_Base() + Function<function::Keygen>());
-
-    bool failure = false;
-    uint8_t *workspace = nullptr, *randombytes=nullptr;
-    uint8_t *d_pk = nullptr, *d_sk = nullptr;
-
-    // Allocate device workspaces
-    try {
-        workspace   = make_workspace<MLKEM_Keygen>(1);
-        randombytes = get_entropy<MLKEM_Keygen>(1);
-    } catch (const std::runtime_error& ex) {
-        failure = true;
-        goto cleanup;
-    }
-    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size));
-    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size));
-
-    // Run routine
-    keygen_kernel<MLKEM_Keygen><<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes);
-
-    // Copy data back to the host
-    CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault));
-    CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault));
-
-cleanup:
-    // Free device memory
-    if (d_pk != nullptr) cudaFree(d_pk);
-    if (d_sk != nullptr) cudaFree(d_sk);
-    if (workspace != nullptr) destroy_workspace(workspace);
-    if (randombytes != nullptr) release_entropy(randombytes);
-
-    return failure ? -1 : 0;
-}
-
-template<class MLKEM_Encaps>
-__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) {
-    __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size];
-    MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr);
-}
-
-template<class MLKEM_Base>
-int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-    using MLKEM_Encaps = decltype(MLKEM_Base() + Function<function::Encaps>());
-
-    bool failure = false;
-    uint8_t *workspace = nullptr, *randombytes=nullptr;
-    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr;
-
-    // Allocate device workspaces
-    try {
-        workspace   = make_workspace<MLKEM_Encaps>(1);
-        randombytes = get_entropy<MLKEM_Encaps>(1);
-    } catch (const std::runtime_error& ex) {
-        failure = true;
-        goto cleanup;
-    }
-    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size));
-    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size));
-    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size));
-
-    // Copy data to GPU
-    CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault));
-
-    // Run routine
-    encaps_kernel<MLKEM_Encaps><<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes);
-
-    // Copy data back to the host
-    CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault));
-    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault));
-
-cleanup:
-    // Free device memory
-    if (d_ct != nullptr) cudaFree(d_ct);
-    if (d_ss != nullptr) cudaFree(d_ss);
-    if (d_pk != nullptr) cudaFree(d_pk);
-    if (workspace != nullptr) destroy_workspace(workspace);
-    if (randombytes != nullptr) release_entropy(randombytes);
-
-    return failure ? -1 : 0;
-}
-
-template<class MLKEM_Decaps>
-__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) {
-    __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size];
-    MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr);
-}
-
-template<class MLKEM_Base>
-int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-    using MLKEM_Decaps = decltype(MLKEM_Base() + Function<function::Decaps>());
-
-    bool failure = false;
-    uint8_t *workspace = nullptr;
-    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr;
-
-    // Allocate device workspaces
-    try {
-        workspace = make_workspace<MLKEM_Decaps>(1);
-    } catch (const std::runtime_error& ex) {
-        failure = true;
-        goto cleanup;
-    }
-    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size));
-    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size));
-    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size));
-
-    // Copy data to GPU
-    CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault));
-    CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault));
-
-    // Run routine
-    decaps_kernel<MLKEM_Decaps><<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace);
-
-    // Copy data back to the host
-    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault));
-
-cleanup:
-    // Free device memory
-    if (d_ct != nullptr) cudaFree(d_ct);
-    if (d_ss != nullptr) cudaFree(d_ss);
-    if (d_sk != nullptr) cudaFree(d_sk);
-    if (workspace != nullptr) destroy_workspace(workspace);
-
-    return failure ? -1 : 0;
-}
-
-extern "C" {
-    using KEM_512  = decltype(ML_KEM_512()  + Block());
-    using KEM_768  = decltype(ML_KEM_768()  + Block());
-    using KEM_1024 = decltype(ML_KEM_1024() + Block());
-
-#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_512>(pk, sk);
-    }
-    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_512>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_512>(ss, ct, sk);
-    }
-#endif
-
-#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_768>(pk, sk);
-    }
-    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_768>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_768>(ss, ct, sk);
-    }
-#endif
-
-#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_1024>(pk, sk);
-    }
-    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_1024>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_1024>(ss, ct, sk);
-    }
-#endif
-}

From 73ba48f8e5864bdceebb98f89d0fc93353cb4eee Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Fri, 17 Jan 2025 01:21:43 +0100
Subject: [PATCH 05/17] Stop piggybacking on pqcrystals-kyber-standard and move
 cupqc_ml-kem metadata to separate upstream repo

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 .CMake/alg_support.cmake                      |   6 +-
 docs/algorithms/kem/ml_kem.md                 |   7 +
 docs/algorithms/kem/ml_kem.yml                |  34 +++
 .../copy_from_upstream/copy_from_upstream.py  |  17 +-
 .../copy_from_upstream/copy_from_upstream.yml |  12 ++
 .../patches/pqcrystals-ml_kem.patch           |  67 +-----
 .../src/kem/family/CMakeLists.txt             |   6 +-
 .../src/kem/family/kem_scheme.c               |  28 +--
 .../update_upstream_alg_docs.py               |  78 ++++---
 src/kem/ml_kem/CMakeLists.txt                 |  36 ++--
 .../cupqc_ml-kem.cu                           |   0
 .../cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu     | 198 ++++++++++++++++++
 .../cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu     | 198 ++++++++++++++++++
 src/kem/ml_kem/kem_ml_kem_1024.c              |  26 +--
 src/kem/ml_kem/kem_ml_kem_512.c               |  26 +--
 src/kem/ml_kem/kem_ml_kem_768.c               |  26 +--
 src/oqsconfig.h.cmake                         |   6 +-
 17 files changed, 581 insertions(+), 190 deletions(-)
 rename src/kem/ml_kem/{cupqc_ml-kem => cupqc_ml-kem-1024_cuda}/cupqc_ml-kem.cu (100%)
 create mode 100644 src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu
 create mode 100644 src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu

diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake
index de22dbceb9..96677676ed 100644
--- a/.CMake/alg_support.cmake
+++ b/.CMake/alg_support.cmake
@@ -340,7 +340,7 @@ endif()
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
 if(OQS_USE_CUPQC)
-    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_512_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_512" OFF)
+    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_512_cuda "" ON "OQS_ENABLE_KEM_ml_kem_512" OFF)
 endif()
 endif()
 
@@ -352,7 +352,7 @@ endif()
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
 if(OQS_USE_CUPQC)
-    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF)
+    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_768_cuda "" ON "OQS_ENABLE_KEM_ml_kem_768" OFF)
 endif()
 endif()
 
@@ -364,7 +364,7 @@ endif()
 
 if(CMAKE_SYSTEM_NAME MATCHES "Linux|Darwin")
 if(OQS_USE_CUPQC)
-    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_cupqc "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF)
+    cmake_dependent_option(OQS_ENABLE_KEM_ml_kem_1024_cuda "" ON "OQS_ENABLE_KEM_ml_kem_1024" OFF)
 endif()
 endif()
 
diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md
index d1806517ba..2d65aacff2 100644
--- a/docs/algorithms/kem/ml_kem.md
+++ b/docs/algorithms/kem/ml_kem.md
@@ -9,6 +9,10 @@
 - **Primary Source**<a name="primary-source"></a>:
   - **Source**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches
   - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0
+- **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches
+  - **cupqc-cuda**:<a name="cupqc-cuda"></a>
+      - **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30
+      - **Implementation license (SPDX-Identifier)**: https://docs.nvidia.com/cuda/cupqc/license.html
 
 
 ## Parameter set summary
@@ -25,6 +29,7 @@
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
 | [Primary Source](#primary-source) | ref                      | All                         | All                             | None                    | True                               | True                                           | False                 |
 | [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,BMI2,POPCNT        | True                               | True                                           | False                 |
+|     [cupqc-cuda](#cupqc-cuda)     | cuda                     | CUDA                        | Linux,Darwin                    | None                    | False                              | False                                          | False                 |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -36,6 +41,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
 | [Primary Source](#primary-source) | ref                      | All                         | All                             | None                    | True                               | True                                           | False                |
 | [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,BMI2,POPCNT        | True                               | True                                           | False                |
+|     [cupqc-cuda](#cupqc-cuda)     | cuda                     | CUDA                        | Linux,Darwin                    | None                    | False                              | False                                          | False                |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
@@ -45,6 +51,7 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 |:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
 | [Primary Source](#primary-source) | ref                      | All                         | All                             | None                    | True                               | True                                           | False                |
 | [Primary Source](#primary-source) | avx2                     | x86\_64                     | Linux,Darwin                    | AVX2,BMI2,POPCNT        | True                               | True                                           | False                |
+|     [cupqc-cuda](#cupqc-cuda)     | cuda                     | CUDA                        | Linux,Darwin                    | None                    | False                              | False                                          | False                |
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml
index 81ef2b6c4a..529e4c870c 100644
--- a/docs/algorithms/kem/ml_kem.yml
+++ b/docs/algorithms/kem/ml_kem.yml
@@ -20,6 +20,10 @@ primary-upstream:
   source: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd
     with copy_from_upstream patches
   spdx-license-identifier: CC0-1.0 or Apache-2.0
+optimized-upstreams:
+  cupqc-cuda:
+    source: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30
+    spdx-license-identifier: https://docs.nvidia.com/cuda/cupqc/license.html
 parameter-sets:
 - name: ML-KEM-512
   claimed-nist-level: 1
@@ -54,6 +58,16 @@ parameter-sets:
     no-secret-dependent-branching-claimed: true
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: false
+  - upstream: cupqc-cuda
+    upstream-id: cuda
+    supported-platforms:
+    - architecture: CUDA
+      operating_systems:
+      - Linux
+      - Darwin
+    no-secret-dependent-branching-claimed: false
+    no-secret-dependent-branching-checked-by-valgrind: false
+    large-stack-usage: false
 - name: ML-KEM-768
   claimed-nist-level: 3
   claimed-security: IND-CCA2
@@ -87,6 +101,16 @@ parameter-sets:
     no-secret-dependent-branching-claimed: true
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: false
+  - upstream: cupqc-cuda
+    upstream-id: cuda
+    supported-platforms:
+    - architecture: CUDA
+      operating_systems:
+      - Linux
+      - Darwin
+    no-secret-dependent-branching-claimed: false
+    no-secret-dependent-branching-checked-by-valgrind: false
+    large-stack-usage: false
 - name: ML-KEM-1024
   claimed-nist-level: 5
   claimed-security: IND-CCA2
@@ -120,3 +144,13 @@ parameter-sets:
     no-secret-dependent-branching-claimed: true
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: false
+  - upstream: cupqc-cuda
+    upstream-id: cuda
+    supported-platforms:
+    - architecture: CUDA
+      operating_systems:
+      - Linux
+      - Darwin
+    no-secret-dependent-branching-claimed: false
+    no-secret-dependent-branching-checked-by-valgrind: false
+    large-stack-usage: false
diff --git a/scripts/copy_from_upstream/copy_from_upstream.py b/scripts/copy_from_upstream/copy_from_upstream.py
index ccab976b7f..46968fa33c 100755
--- a/scripts/copy_from_upstream/copy_from_upstream.py
+++ b/scripts/copy_from_upstream/copy_from_upstream.py
@@ -599,14 +599,15 @@ def process_families(instructions, basedir, with_kat, with_generator, with_libja
                             # when provided to the compiler; OQS uses the term ARM_NEON
                             if req['architecture'] == 'arm_8':
                                 req['architecture'] = 'ARM64_V8'
-                            if req['architecture'] == 'ARM64_V8' and 'asimd' in req['required_flags']:
-                                req['required_flags'].remove('asimd')
-                                req['required_flags'].append('arm_neon')
-                            if req['architecture'] == 'ARM64_V8' and 'sha3' in req['required_flags']:
-                                req['required_flags'].remove('sha3')
-                                req['required_flags'].append('arm_sha3')
-                            impl['required_flags'] = req['required_flags']
-                            family['all_required_flags'].update(req['required_flags'])
+                            if 'required_flags' in req:
+                                if req['architecture'] == 'ARM64_V8' and 'asimd' in req['required_flags']:
+                                    req['required_flags'].remove('asimd')
+                                    req['required_flags'].append('arm_neon')
+                                if req['architecture'] == 'ARM64_V8' and 'sha3' in req['required_flags']:
+                                    req['required_flags'].remove('sha3')
+                                    req['required_flags'].append('arm_sha3')
+                                impl['required_flags'] = req['required_flags']
+                                family['all_required_flags'].update(req['required_flags'])
                     except KeyError as ke:
                         if (impl['name'] != family['default_implementation']):
                             print("No required flags found for %s (KeyError %s on impl %s)" % (
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index f80f0979d5..b2f74f566e 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -38,6 +38,14 @@ upstreams:
     kem_meta_path: '{pretty_name_full}_META.yml'
     kem_scheme_path: '.'
     patches: [pqcrystals-ml_kem.patch]
+  - 
+    name: cupqc
+    git_url: https://github.com/praveksharma/cupqc-mlkem.git
+    git_branch: main
+    git_commit: adb8454e56979628c07b67eb7d90f9337be6dc30
+    kem_meta_path: '{pretty_name_full}_META.yml'
+    kem_scheme_path: '.'
+    patches: []
   -
     name: pqcrystals-dilithium
     git_url: https://github.com/pq-crystals/dilithium.git
@@ -166,6 +174,10 @@ kems:
   -
     name: ml_kem
     default_implementation: ref
+    arch_specific_implementations:
+                                      cuda: cuda
+    arch_specific_upstream_locations:
+                                      cuda: cupqc
     upstream_location: pqcrystals-kyber-standard
     schemes:
       -
diff --git a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch
index 2e7068cd83..d65eea2f30 100644
--- a/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch
+++ b/scripts/copy_from_upstream/patches/pqcrystals-ml_kem.patch
@@ -1,8 +1,8 @@
 diff --git a/Kyber1024_META.yml b/ML-KEM-1024_META.yml
-similarity index 50%
+similarity index 55%
 rename from Kyber1024_META.yml
 rename to ML-KEM-1024_META.yml
-index baa5ca3..98e0e5c 100644
+index baa5ca3..67243b8 100644
 --- a/Kyber1024_META.yml
 +++ b/ML-KEM-1024_META.yml
 @@ -1,4 +1,4 @@
@@ -55,29 +55,11 @@ index baa5ca3..98e0e5c 100644
      supported_platforms:
        - architecture: x86_64
          operating_systems:
-@@ -47,3 +45,17 @@ implementations:
-           - avx2
-           - bmi2
-           - popcnt
-+  - name: cupqc
-+    version: FIPS203
-+    compile_opts: $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>
-+    signature_keypair: ml_kem_1024_cupqc_keypair
-+    signature_enc: ml_kem_1024_cupqc_enc
-+    signature_dec: ml_kem_1024_cupqc_dec
-+    sources:
-+    supported_platforms:
-+      - architecture: CUDA
-+        operating_systems:
-+          - Linux
-+          - Darwin
-+        required_flags:
-+          - dummy
 diff --git a/Kyber512_META.yml b/ML-KEM-512_META.yml
-similarity index 50%
+similarity index 55%
 rename from Kyber512_META.yml
 rename to ML-KEM-512_META.yml
-index b251701..0df6843 100644
+index b251701..18c28b0 100644
 --- a/Kyber512_META.yml
 +++ b/ML-KEM-512_META.yml
 @@ -1,4 +1,4 @@
@@ -130,30 +112,11 @@ index b251701..0df6843 100644
      supported_platforms:
        - architecture: x86_64
          operating_systems:
-@@ -47,3 +45,17 @@ implementations:
-           - avx2
-           - bmi2
-           - popcnt
-+  - name: cupqc
-+    version: FIPS203
-+    compile_opts: $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>
-+    signature_keypair: ml_kem_512_cupqc_keypair
-+    signature_enc: ml_kem_512_cupqc_enc
-+    signature_dec: ml_kem_512_cupqc_dec
-+    sources:
-+    supported_platforms:
-+      - architecture: CUDA
-+        operating_systems:
-+          - Linux
-+          - Darwin
-+        required_flags:
-+          - dummy
-\ No newline at end of file
 diff --git a/Kyber768_META.yml b/ML-KEM-768_META.yml
-similarity index 50%
+similarity index 55%
 rename from Kyber768_META.yml
 rename to ML-KEM-768_META.yml
-index 7a0cc3d..f5f45f7 100644
+index 7a0cc3d..ccc03c9 100644
 --- a/Kyber768_META.yml
 +++ b/ML-KEM-768_META.yml
 @@ -1,4 +1,4 @@
@@ -206,24 +169,6 @@ index 7a0cc3d..f5f45f7 100644
      supported_platforms:
        - architecture: x86_64
          operating_systems:
-@@ -47,3 +45,17 @@ implementations:
-           - avx2
-           - bmi2
-           - popcnt
-+  - name: cupqc
-+    version: FIPS203
-+    compile_opts: $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>
-+    signature_keypair: ml_kem_768_cupqc_keypair
-+    signature_enc: ml_kem_768_cupqc_enc
-+    signature_dec: ml_kem_768_cupqc_dec
-+    sources:
-+    supported_platforms:
-+      - architecture: CUDA
-+        operating_systems:
-+          - Linux
-+          - Darwin
-+        required_flags:
-+          - dummy
 diff --git a/avx2/indcpa.c b/avx2/indcpa.c
 index 18b9d08..c4b2b3a 100644
 --- a/avx2/indcpa.c
diff --git a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
index 72c6c71d2b..004408aa3d 100644
--- a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
+++ b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
@@ -33,10 +33,10 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}{%- if 'alias_scheme' in
     target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PUBLIC {{ impl['compile_opts'] }})
            {%- endif -%}
 
-        {%- elif impl['name'] == 'cupqc' %}
+        {%- elif impl['name'] == 'cuda' %}
 
 if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %})
-    add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT cupqc_{{ family.replace('_', '-') }}/cupqc_{{ family.replace('_', '-') }}.cu)
+    add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/cupqc_ml-kem.cu)
     target_include_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc)
     set_property(TARGET {{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PROPERTY CUDA_ARCHITECTURES OFF)
     target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE {{ impl['compile_opts'] }})
@@ -45,7 +45,7 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if
 if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %})
     add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {% for source_file in impl['sources']|sort -%}{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/{{ source_file }}{%- if not loop.last %} {% endif -%}{%- endfor -%})
         {%- endif %}
-        {%- if impl['name'] != 'cupqc' %}
+        {%- if impl['name'] != 'cuda' %}
     target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${CMAKE_CURRENT_LIST_DIR}/{{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }})
     target_include_directories({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
         {%- if impl['name'] != scheme['default_implementation'] and impl['required_flags'] -%}
diff --git a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c
index 95eafa703e..630aee1389 100644
--- a/scripts/copy_from_upstream/src/kem/family/kem_scheme.c
+++ b/scripts/copy_from_upstream/src/kem/family/kem_scheme.c
@@ -93,7 +93,7 @@ extern int {{ scheme['metadata']['default_dec_signature']  }}(uint8_t *ss, const
     {%- endfor %}
 
     {%- for impl in scheme['metadata']['implementations'] if impl['name'] != scheme['default_implementation'] %}
-{% if impl['name'] == 'cupqc'%}
+{% if impl['name'] == 'cuda'%}
 #if defined(OQS_USE_CUPQC)
         {%- endif %}
 #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %}
@@ -115,7 +115,7 @@ extern int {{ impl['signature_dec'] }}(uint8_t *ss, const uint8_t *ct, const uin
 extern int PQCLEAN_{{ scheme['pqclean_scheme_c']|upper }}_{{ impl['name']|upper }}_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
         {%- endif %}
 #endif
-        {%- if impl['name'] == 'cupqc'%}
+        {%- if impl['name'] == 'cuda'%}
 #endif /* OQS_USE_CUPQC */
         {%- endif %}
     {%- endfor %}
@@ -171,12 +171,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_keypair(uint8_t *
 {% endfor -%}
 #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/
 {%- endif %}
-    {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %}
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc)
+    {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cuda' %}
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }})
 	return (OQS_STATUS) {{ impl['signature_keypair'] }}(public_key, secret_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} */
     {%- endfor %}
-    {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %}
+    {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cuda') %}
     {%- if loop.first %}
 #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %}
     {%- else %}
@@ -250,12 +250,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_encaps(uint8_t *c
 {% endfor -%}
 #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/
 {%- endif %}
-    {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %}
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc)
+    {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cuda' %}
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }})
 	return (OQS_STATUS) {{ impl['signature_enc'] }}(ciphertext, shared_secret, public_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} */
     {%- endfor %}
-    {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %}
+    {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cuda') %}
     {%- if loop.first %}
 #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %}
     {%- else %}
@@ -329,12 +329,12 @@ OQS_API OQS_STATUS OQS_KEM_{{ family }}_{{ scheme['scheme'] }}_decaps(uint8_t *s
 {% endfor -%}
 #else /*OQS_LIBJADE_BUILD && (OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['scheme'] }} {%- if 'alias_scheme' in scheme %} || OQS_ENABLE_LIBJADE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}{%- endif %})*/
 {%- endif %}
-    {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cupqc' %}
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc)
+    {%- for impl in scheme['metadata']['implementations'] if impl['name'] == 'cuda' %}
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }})
 	return (OQS_STATUS) {{ impl['signature_dec'] }}(shared_secret, ciphertext, secret_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_cupqc */
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} */
     {%- endfor %}
-    {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cupqc') %}
+    {%- for impl in scheme['metadata']['implementations'] if (impl['name'] != scheme['default_implementation'] and impl['name'] != 'cuda') %}
     {%- if loop.first %}
 #if defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }}) {%- if 'alias_scheme' in scheme %} || defined(OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}){%- endif %}
     {%- else %}
diff --git a/scripts/copy_from_upstream/update_upstream_alg_docs.py b/scripts/copy_from_upstream/update_upstream_alg_docs.py
index 33483067e2..ba765b84c1 100755
--- a/scripts/copy_from_upstream/update_upstream_alg_docs.py
+++ b/scripts/copy_from_upstream/update_upstream_alg_docs.py
@@ -95,8 +95,43 @@ def update_upstream_kem_alg_docs(liboqs_root, kems, upstream_info, write_changes
             oqs_yaml_path = os.path.join(liboqs_root, 'docs', 'algorithms', 'kem', '{}.yml'.format(kem['name']))
             if os.path.isfile(oqs_yaml_path):
                oqs_yaml = load_yaml(oqs_yaml_path)
+
+            upstream_base_url = ui['git_url'][:-len(".git")]
+            # upstream is special: We will take the upstream git commit information 
+            # (possibly with added patch comment) as it is what drove the update
+
+            # Need to check if yml is of old format. If so, update to new format
+            if 'primary-upstream' not in oqs_yaml:
+                print("Updating format of {}. Please double check ordering of yaml file".format(scheme['pretty_name_full']))
+                lhs = oqs_yaml['upstream']
+                oqs_yaml['primary-upstream'] = dict()
+                oqs_yaml['primary-upstream']['spdx-license-identifier'] = oqs_yaml['spdx-license-identifier']
+                for i in range(len(oqs_yaml['parameter-sets'])):
+                    for j in range(len(oqs_yaml['parameter-sets'][i]['implementations'])):
+                        oqs_yaml['parameter-sets'][i]['implementations'][j]['upstream'] = 'primary-upstream'
             else:
-                continue
+                lhs = oqs_yaml['primary-upstream']['source']
+            oqs_yaml['primary-upstream']['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+patches_done).format(upstream_base_url, ui['git_commit']), "primary-upstream")
+            if 'upstream' in oqs_yaml:
+                del oqs_yaml['upstream']
+                del oqs_yaml['spdx-license-identifier']
+            
+            if ouis:
+                for upstream in ouis:
+                    optimized_upstream_base_url = ouis[upstream]['git_url'][:-len(".git")]
+                    optimized_patches_done=""
+                    if 'patches' in ouis[upstream]:
+                        for patchfilename in ouis[upstream]['patches']:
+                            if kem['name'] in patchfilename:
+                                optimized_patches_done=" with copy_from_upstream patches"
+                    if 'optimized-upstreams' in oqs_yaml and upstream in oqs_yaml['optimized-upstreams']:
+                        lhs = oqs_yaml['optimized-upstreams'][upstream]['source']
+                    else:
+                        lhs = ''
+                        oqs_yaml['optimized-upstreams'] = oqs_yaml.get('optimized-upstreams', dict())
+                        oqs_yaml['optimized-upstreams'][upstream] = oqs_yaml['optimized-upstreams'].get(upstream, dict())
+                    git_commit = ouis[upstream]['git_commit']
+                    oqs_yaml['optimized-upstreams'][upstream]['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+optimized_patches_done).format(optimized_upstream_base_url, git_commit), "optimized-upstreams")
 
             # We cannot assume that the ordering of "parameter-sets"
             # in the OQS YAML files matches that of copy_from_upstream.yml
@@ -111,45 +146,6 @@ def update_upstream_kem_alg_docs(liboqs_root, kems, upstream_info, write_changes
                 oqs_yaml['type'] = rhs_if_not_equal(oqs_yaml['type'], upstream_yaml['type'], "type")
                 oqs_yaml['principal-submitters'] = rhs_if_not_equal(oqs_yaml['principal-submitters'], upstream_yaml['principal-submitters'], "principal-submitters")
 
-                upstream_base_url = ui['git_url'][:-len(".git")]
-                # upstream is special: We will take the upstream git commit information 
-                # (possibly with added patch comment) as it is what drove the update
-
-                # Need to check if yml is of old format. If so, update to new format
-                if 'primary-upstream' not in oqs_yaml:
-                    print("Updating format of {}. Please double check ordering of yaml file".format(scheme['pretty_name_full']))
-                    lhs = oqs_yaml['upstream']
-                    oqs_yaml['primary-upstream'] = dict()
-                    oqs_yaml['primary-upstream']['spdx-license-identifier'] = oqs_yaml['spdx-license-identifier']
-                    for i in range(len(oqs_yaml['parameter-sets'])):
-                        for j in range(len(oqs_yaml['parameter-sets'][i]['implementations'])):
-                            oqs_yaml['parameter-sets'][i]['implementations'][j]['upstream'] = 'primary-upstream'
-                else:
-                    lhs = oqs_yaml['primary-upstream']['source']
-                oqs_yaml['primary-upstream']['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+patches_done).format(upstream_base_url, ui['git_commit']), "primary-upstream")
-                if 'upstream' in oqs_yaml:
-                    del oqs_yaml['upstream']
-                    del oqs_yaml['spdx-license-identifier']
-
-                if ouis:
-                    for upstream in ouis:
-                        optimized_upstream_base_url = ouis[upstream]['git_url'][:-len(".git")]
-                        for patchfilename in ouis[upstream]['patches']:
-                            if kem['name'] in patchfilename:
-                                patches_done=" with copy_from_upstream patches"
-                        patches_done=""
-                        if 'patches' in ouis[upstream]:
-                            for patchfilename in ouis[upstream]['patches']:
-                                if kem['name'] in patchfilename:
-                                    patches_done=" with copy_from_upstream patches"
-                        if 'optimized-upstreams' in oqs_yaml and upstream in oqs_yaml['optimized-upstreams']:
-                            lhs = oqs_yaml['optimized-upstreams'][upstream]['source']
-                        else:
-                            lhs = ''
-                        git_commit = ouis[upstream]['git_commit']
-                        oqs_yaml['optimized-upstreams'][upstream]['source'] = rhs_if_not_equal(lhs, ("{}/commit/{}"+patches_done).format(optimized_upstream_base_url, git_commit), "optimized-upstreams")
-
-
                 if 'auxiliary-submitters' in upstream_yaml:
                         oqs_yaml['auxiliary-submitters'] = rhs_if_not_equal(oqs_yaml['auxiliary-submitters'] if 'auxiliary-submitters' in oqs_yaml else '', upstream_yaml['auxiliary-submitters'], "auxiliary-submitters")
 
@@ -204,7 +200,7 @@ def update_upstream_kem_alg_docs(liboqs_root, kems, upstream_info, write_changes
                                     upstream_impl['supported_platforms'][i]['architecture'] = 'ARM64_V8'
                                     if 'asimd' in upstream_impl['supported_platforms'][i]['required_flags']:
                                         upstream_impl['supported_platforms'][i]['required_flags'].remove('asimd')
-                                if not upstream_impl['supported_platforms'][i]['required_flags']:
+                                if 'required_flags' in upstream_impl['supported_platforms'][i] and not upstream_impl['supported_platforms'][i]['required_flags']:
                                     del upstream_impl['supported_platforms'][i]['required_flags']
 
                             impl['supported-platforms'] = rhs_if_not_equal(impl['supported-platforms'], upstream_impl['supported_platforms'], "supported-platforms")
diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt
index 9a6a935ac3..76b53851b1 100644
--- a/src/kem/ml_kem/CMakeLists.txt
+++ b/src/kem/ml_kem/CMakeLists.txt
@@ -23,12 +23,12 @@ if(OQS_ENABLE_KEM_ml_kem_512_avx2)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_512_avx2>)
 endif()
 
-if(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-    add_library(ml_kem_512_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu)
-    target_include_libraries(ml_kem_512_cupqc cupqc)
-    set_property(TARGET ml_kem_512_cupqc PROPERTY CUDA_ARCHITECTURES OFF)
-    target_compile_options(ml_kem_512_cupqc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
-    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_512_cupqc>)
+if(OQS_ENABLE_KEM_ml_kem_512_cuda)
+    add_library(ml_kem_512_cuda OBJECT cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu)
+    target_include_libraries(ml_kem_512_cuda cupqc)
+    set_property(TARGET ml_kem_512_cuda PROPERTY CUDA_ARCHITECTURES OFF)
+    target_compile_options(ml_kem_512_cuda PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
+    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_512_cuda>)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_768)
@@ -49,12 +49,12 @@ if(OQS_ENABLE_KEM_ml_kem_768_avx2)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_768_avx2>)
 endif()
 
-if(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-    add_library(ml_kem_768_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu)
-    target_include_libraries(ml_kem_768_cupqc cupqc)
-    set_property(TARGET ml_kem_768_cupqc PROPERTY CUDA_ARCHITECTURES OFF)
-    target_compile_options(ml_kem_768_cupqc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
-    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_768_cupqc>)
+if(OQS_ENABLE_KEM_ml_kem_768_cuda)
+    add_library(ml_kem_768_cuda OBJECT cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu)
+    target_include_libraries(ml_kem_768_cuda cupqc)
+    set_property(TARGET ml_kem_768_cuda PROPERTY CUDA_ARCHITECTURES OFF)
+    target_compile_options(ml_kem_768_cuda PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
+    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_768_cuda>)
 endif()
 
 if(OQS_ENABLE_KEM_ml_kem_1024)
@@ -75,12 +75,12 @@ if(OQS_ENABLE_KEM_ml_kem_1024_avx2)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_1024_avx2>)
 endif()
 
-if(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-    add_library(ml_kem_1024_cupqc OBJECT cupqc_ml-kem/cupqc_ml-kem.cu)
-    target_include_libraries(ml_kem_1024_cupqc cupqc)
-    set_property(TARGET ml_kem_1024_cupqc PROPERTY CUDA_ARCHITECTURES OFF)
-    target_compile_options(ml_kem_1024_cupqc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
-    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_1024_cupqc>)
+if(OQS_ENABLE_KEM_ml_kem_1024_cuda)
+    add_library(ml_kem_1024_cuda OBJECT cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu)
+    target_include_libraries(ml_kem_1024_cuda cupqc)
+    set_property(TARGET ml_kem_1024_cuda PROPERTY CUDA_ARCHITECTURES OFF)
+    target_compile_options(ml_kem_1024_cuda PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
+    set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_1024_cuda>)
 endif()
 
 set(ML_KEM_OBJS ${_ML_KEM_OBJS} PARENT_SCOPE)
diff --git a/src/kem/ml_kem/cupqc_ml-kem/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu
similarity index 100%
rename from src/kem/ml_kem/cupqc_ml-kem/cupqc_ml-kem.cu
rename to src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu
diff --git a/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu
new file mode 100644
index 0000000000..2935c2cd6a
--- /dev/null
+++ b/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2025 Nvidia Corporation
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+**/
+
+#include <cupqc.hpp>
+#include <stdexcept>
+#include <oqs/oqsconfig.h>
+
+using namespace cupqc;
+
+// Checks the return value from a CUDA API function
+#define CUDA_CHECK(err) \
+    if (err != cudaSuccess) { failure = true; goto cleanup; }
+
+template<class MLKEM_Keygen>
+__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) {
+    __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size];
+    MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int keypair(uint8_t *pk, uint8_t *sk) {
+    using MLKEM_Keygen = decltype(MLKEM_Base() + Function<function::Keygen>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr, *randombytes=nullptr;
+    uint8_t *d_pk = nullptr, *d_sk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace   = make_workspace<MLKEM_Keygen>(1);
+        randombytes = get_entropy<MLKEM_Keygen>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size));
+
+    // Run routine
+    keygen_kernel<MLKEM_Keygen><<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_pk != nullptr) cudaFree(d_pk);
+    if (d_sk != nullptr) cudaFree(d_sk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+    if (randombytes != nullptr) release_entropy(randombytes);
+
+    return failure ? -1 : 0;
+}
+
+template<class MLKEM_Encaps>
+__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) {
+    __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size];
+    MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+    using MLKEM_Encaps = decltype(MLKEM_Base() + Function<function::Encaps>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr, *randombytes=nullptr;
+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace   = make_workspace<MLKEM_Encaps>(1);
+        randombytes = get_entropy<MLKEM_Encaps>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size));
+
+    // Copy data to GPU
+    CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault));
+
+    // Run routine
+    encaps_kernel<MLKEM_Encaps><<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_ct != nullptr) cudaFree(d_ct);
+    if (d_ss != nullptr) cudaFree(d_ss);
+    if (d_pk != nullptr) cudaFree(d_pk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+    if (randombytes != nullptr) release_entropy(randombytes);
+
+    return failure ? -1 : 0;
+}
+
+template<class MLKEM_Decaps>
+__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) {
+    __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size];
+    MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+    using MLKEM_Decaps = decltype(MLKEM_Base() + Function<function::Decaps>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr;
+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace = make_workspace<MLKEM_Decaps>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size));
+
+    // Copy data to GPU
+    CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault));
+
+    // Run routine
+    decaps_kernel<MLKEM_Decaps><<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_ct != nullptr) cudaFree(d_ct);
+    if (d_ss != nullptr) cudaFree(d_ss);
+    if (d_sk != nullptr) cudaFree(d_sk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+
+    return failure ? -1 : 0;
+}
+
+extern "C" {
+    using KEM_512  = decltype(ML_KEM_512()  + Block());
+    using KEM_768  = decltype(ML_KEM_768()  + Block());
+    using KEM_1024 = decltype(ML_KEM_1024() + Block());
+
+#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_512>(pk, sk);
+    }
+    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_512>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_512>(ss, ct, sk);
+    }
+#endif
+
+#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_768>(pk, sk);
+    }
+    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_768>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_768>(ss, ct, sk);
+    }
+#endif
+
+#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_1024>(pk, sk);
+    }
+    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_1024>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_1024>(ss, ct, sk);
+    }
+#endif
+}
diff --git a/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu
new file mode 100644
index 0000000000..2935c2cd6a
--- /dev/null
+++ b/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2025 Nvidia Corporation
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+**/
+
+#include <cupqc.hpp>
+#include <stdexcept>
+#include <oqs/oqsconfig.h>
+
+using namespace cupqc;
+
+// Checks the return value from a CUDA API function
+#define CUDA_CHECK(err) \
+    if (err != cudaSuccess) { failure = true; goto cleanup; }
+
+template<class MLKEM_Keygen>
+__global__ void keygen_kernel(uint8_t *pk, uint8_t *sk, uint8_t *workspace, uint8_t *randombytes) {
+    __shared__ uint8_t smem_ptr[MLKEM_Keygen::shared_memory_size];
+    MLKEM_Keygen().execute(pk, sk, randombytes, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int keypair(uint8_t *pk, uint8_t *sk) {
+    using MLKEM_Keygen = decltype(MLKEM_Base() + Function<function::Keygen>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr, *randombytes=nullptr;
+    uint8_t *d_pk = nullptr, *d_sk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace   = make_workspace<MLKEM_Keygen>(1);
+        randombytes = get_entropy<MLKEM_Keygen>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Keygen::public_key_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Keygen::secret_key_size));
+
+    // Run routine
+    keygen_kernel<MLKEM_Keygen><<<1, MLKEM_Keygen::BlockDim>>>(d_pk, d_sk, workspace, randombytes);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(pk, d_pk, MLKEM_Keygen::public_key_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(sk, d_sk, MLKEM_Keygen::secret_key_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_pk != nullptr) cudaFree(d_pk);
+    if (d_sk != nullptr) cudaFree(d_sk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+    if (randombytes != nullptr) release_entropy(randombytes);
+
+    return failure ? -1 : 0;
+}
+
+template<class MLKEM_Encaps>
+__global__ void encaps_kernel(uint8_t *ct, uint8_t *ss, const uint8_t *pk, uint8_t *workspace, uint8_t *randombytes) {
+    __shared__ uint8_t smem_ptr[MLKEM_Encaps::shared_memory_size];
+    MLKEM_Encaps().execute(ct, ss, pk, randombytes, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int encaps(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+    using MLKEM_Encaps = decltype(MLKEM_Base() + Function<function::Encaps>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr, *randombytes=nullptr;
+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_pk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace   = make_workspace<MLKEM_Encaps>(1);
+        randombytes = get_entropy<MLKEM_Encaps>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Encaps::ciphertext_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Encaps::shared_secret_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_pk, MLKEM_Encaps::public_key_size));
+
+    // Copy data to GPU
+    CUDA_CHECK(cudaMemcpy(d_pk, pk, MLKEM_Encaps::public_key_size, cudaMemcpyDefault));
+
+    // Run routine
+    encaps_kernel<MLKEM_Encaps><<<1, MLKEM_Encaps::BlockDim>>>(d_ct, d_ss, d_pk, workspace, randombytes);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(ct, d_ct, MLKEM_Encaps::ciphertext_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Encaps::shared_secret_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_ct != nullptr) cudaFree(d_ct);
+    if (d_ss != nullptr) cudaFree(d_ss);
+    if (d_pk != nullptr) cudaFree(d_pk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+    if (randombytes != nullptr) release_entropy(randombytes);
+
+    return failure ? -1 : 0;
+}
+
+template<class MLKEM_Decaps>
+__global__ void decaps_kernel(uint8_t *ss, const uint8_t *ct, const uint8_t *sk, uint8_t *workspace) {
+    __shared__ uint8_t smem_ptr[MLKEM_Decaps::shared_memory_size];
+    MLKEM_Decaps().execute(ss, ct, sk, workspace, smem_ptr);
+}
+
+template<class MLKEM_Base>
+int decaps(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+    using MLKEM_Decaps = decltype(MLKEM_Base() + Function<function::Decaps>());
+
+    bool failure = false;
+    uint8_t *workspace = nullptr;
+    uint8_t *d_ct = nullptr, *d_ss = nullptr, *d_sk = nullptr;
+
+    // Allocate device workspaces
+    try {
+        workspace = make_workspace<MLKEM_Decaps>(1);
+    } catch (const std::runtime_error& ex) {
+        failure = true;
+        goto cleanup;
+    }
+    CUDA_CHECK(cudaMalloc((void**)&d_ct, MLKEM_Decaps::ciphertext_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_ss, MLKEM_Decaps::shared_secret_size));
+    CUDA_CHECK(cudaMalloc((void**)&d_sk, MLKEM_Decaps::secret_key_size));
+
+    // Copy data to GPU
+    CUDA_CHECK(cudaMemcpy(d_sk, sk, MLKEM_Decaps::secret_key_size, cudaMemcpyDefault));
+    CUDA_CHECK(cudaMemcpy(d_ct, ct, MLKEM_Decaps::ciphertext_size, cudaMemcpyDefault));
+
+    // Run routine
+    decaps_kernel<MLKEM_Decaps><<<1, MLKEM_Decaps::BlockDim>>>(d_ss, d_ct, d_sk, workspace);
+
+    // Copy data back to the host
+    CUDA_CHECK(cudaMemcpy(ss, d_ss, MLKEM_Decaps::shared_secret_size, cudaMemcpyDefault));
+
+cleanup:
+    // Free device memory
+    if (d_ct != nullptr) cudaFree(d_ct);
+    if (d_ss != nullptr) cudaFree(d_ss);
+    if (d_sk != nullptr) cudaFree(d_sk);
+    if (workspace != nullptr) destroy_workspace(workspace);
+
+    return failure ? -1 : 0;
+}
+
+extern "C" {
+    using KEM_512  = decltype(ML_KEM_512()  + Block());
+    using KEM_768  = decltype(ML_KEM_768()  + Block());
+    using KEM_1024 = decltype(ML_KEM_1024() + Block());
+
+#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_512>(pk, sk);
+    }
+    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_512>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_512>(ss, ct, sk);
+    }
+#endif
+
+#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_768>(pk, sk);
+    }
+    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_768>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_768>(ss, ct, sk);
+    }
+#endif
+
+#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
+        return keypair<KEM_1024>(pk, sk);
+    }
+    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
+        return encaps<KEM_1024>(ct, ss, pk);
+    }
+    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
+        return decaps<KEM_1024>(ss, ct, sk);
+    }
+#endif
+}
diff --git a/src/kem/ml_kem/kem_ml_kem_1024.c b/src/kem/ml_kem/kem_ml_kem_1024.c
index 2d4d46df8b..1e471af58a 100644
--- a/src/kem/ml_kem/kem_ml_kem_1024.c
+++ b/src/kem/ml_kem/kem_ml_kem_1024.c
@@ -41,17 +41,17 @@ extern int pqcrystals_ml_kem_1024_avx2_dec(uint8_t *ss, const uint8_t *ct, const
 #endif
 
 #if defined(OQS_USE_CUPQC)
-#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-extern int ml_kem_1024_cupqc_keypair(uint8_t *pk, uint8_t *sk);
-extern int ml_kem_1024_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-extern int ml_kem_1024_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+#if defined(OQS_ENABLE_KEM_ml_kem_1024_cuda)
+extern int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk);
+extern int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+extern int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
 #endif
 #endif /* OQS_USE_CUPQC */
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *secret_key) {
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-	return (OQS_STATUS) ml_kem_1024_cupqc_keypair(public_key, secret_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cuda)
+	return (OQS_STATUS) cupqc_ml_kem_1024_keypair(public_key, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cuda */
 #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -68,9 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_keypair(uint8_t *public_key, uint8_t *sec
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-	return (OQS_STATUS) ml_kem_1024_cupqc_enc(ciphertext, shared_secret, public_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cuda)
+	return (OQS_STATUS) cupqc_ml_kem_1024_enc(ciphertext, shared_secret, public_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cuda */
 #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -87,9 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_encaps(uint8_t *ciphertext, uint8_t *shar
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_1024_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-	return (OQS_STATUS) ml_kem_1024_cupqc_dec(shared_secret, ciphertext, secret_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cupqc */
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_1024_cuda)
+	return (OQS_STATUS) cupqc_ml_kem_1024_dec(shared_secret, ciphertext, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_1024_cuda */
 #if defined(OQS_ENABLE_KEM_ml_kem_1024_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
diff --git a/src/kem/ml_kem/kem_ml_kem_512.c b/src/kem/ml_kem/kem_ml_kem_512.c
index 5e60018b37..41805f91be 100644
--- a/src/kem/ml_kem/kem_ml_kem_512.c
+++ b/src/kem/ml_kem/kem_ml_kem_512.c
@@ -41,17 +41,17 @@ extern int pqcrystals_ml_kem_512_avx2_dec(uint8_t *ss, const uint8_t *ct, const
 #endif
 
 #if defined(OQS_USE_CUPQC)
-#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-extern int ml_kem_512_cupqc_keypair(uint8_t *pk, uint8_t *sk);
-extern int ml_kem_512_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-extern int ml_kem_512_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+#if defined(OQS_ENABLE_KEM_ml_kem_512_cuda)
+extern int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk);
+extern int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+extern int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
 #endif
 #endif /* OQS_USE_CUPQC */
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secret_key) {
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-	return (OQS_STATUS) ml_kem_512_cupqc_keypair(public_key, secret_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cuda)
+	return (OQS_STATUS) cupqc_ml_kem_512_keypair(public_key, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cuda */
 #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -68,9 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_keypair(uint8_t *public_key, uint8_t *secr
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-	return (OQS_STATUS) ml_kem_512_cupqc_enc(ciphertext, shared_secret, public_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cuda)
+	return (OQS_STATUS) cupqc_ml_kem_512_enc(ciphertext, shared_secret, public_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cuda */
 #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -87,9 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_512_encaps(uint8_t *ciphertext, uint8_t *share
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_512_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-	return (OQS_STATUS) ml_kem_512_cupqc_dec(shared_secret, ciphertext, secret_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cupqc */
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_512_cuda)
+	return (OQS_STATUS) cupqc_ml_kem_512_dec(shared_secret, ciphertext, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_512_cuda */
 #if defined(OQS_ENABLE_KEM_ml_kem_512_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
diff --git a/src/kem/ml_kem/kem_ml_kem_768.c b/src/kem/ml_kem/kem_ml_kem_768.c
index ceb651cedb..11a7421b20 100644
--- a/src/kem/ml_kem/kem_ml_kem_768.c
+++ b/src/kem/ml_kem/kem_ml_kem_768.c
@@ -41,17 +41,17 @@ extern int pqcrystals_ml_kem_768_avx2_dec(uint8_t *ss, const uint8_t *ct, const
 #endif
 
 #if defined(OQS_USE_CUPQC)
-#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-extern int ml_kem_768_cupqc_keypair(uint8_t *pk, uint8_t *sk);
-extern int ml_kem_768_cupqc_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
-extern int ml_kem_768_cupqc_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
+#if defined(OQS_ENABLE_KEM_ml_kem_768_cuda)
+extern int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk);
+extern int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk);
+extern int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk);
 #endif
 #endif /* OQS_USE_CUPQC */
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secret_key) {
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-	return (OQS_STATUS) ml_kem_768_cupqc_keypair(public_key, secret_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cuda)
+	return (OQS_STATUS) cupqc_ml_kem_768_keypair(public_key, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cuda */
 #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -68,9 +68,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_keypair(uint8_t *public_key, uint8_t *secr
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key) {
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-	return (OQS_STATUS) ml_kem_768_cupqc_enc(ciphertext, shared_secret, public_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cuda)
+	return (OQS_STATUS) cupqc_ml_kem_768_enc(ciphertext, shared_secret, public_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cuda */
 #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
@@ -87,9 +87,9 @@ OQS_API OQS_STATUS OQS_KEM_ml_kem_768_encaps(uint8_t *ciphertext, uint8_t *share
 }
 
 OQS_API OQS_STATUS OQS_KEM_ml_kem_768_decaps(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key) {
-#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-	return (OQS_STATUS) ml_kem_768_cupqc_dec(shared_secret, ciphertext, secret_key);
-#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cupqc */
+#if defined(OQS_USE_CUPQC) && defined(OQS_ENABLE_KEM_ml_kem_768_cuda)
+	return (OQS_STATUS) cupqc_ml_kem_768_dec(shared_secret, ciphertext, secret_key);
+#endif /* OQS_USE_CUPQC && OQS_ENABLE_KEM_ml_kem_768_cuda */
 #if defined(OQS_ENABLE_KEM_ml_kem_768_avx2)
 #if defined(OQS_DIST_BUILD)
 	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2) && OQS_CPU_has_extension(OQS_CPU_EXT_BMI2) && OQS_CPU_has_extension(OQS_CPU_EXT_POPCNT)) {
diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake
index 875584f74a..eb21d7b003 100644
--- a/src/oqsconfig.h.cmake
+++ b/src/oqsconfig.h.cmake
@@ -131,13 +131,13 @@
 #cmakedefine OQS_ENABLE_KEM_ML_KEM 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_512 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_512_avx2 1
-#cmakedefine OQS_ENABLE_KEM_ml_kem_512_cupqc 1
+#cmakedefine OQS_ENABLE_KEM_ml_kem_512_cuda 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_768 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_768_avx2 1
-#cmakedefine OQS_ENABLE_KEM_ml_kem_768_cupqc 1
+#cmakedefine OQS_ENABLE_KEM_ml_kem_768_cuda 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024 1
 #cmakedefine OQS_ENABLE_KEM_ml_kem_1024_avx2 1
-#cmakedefine OQS_ENABLE_KEM_ml_kem_1024_cupqc 1
+#cmakedefine OQS_ENABLE_KEM_ml_kem_1024_cuda 1
 
 #cmakedefine OQS_ENABLE_SIG_DILITHIUM 1
 #cmakedefine OQS_ENABLE_SIG_dilithium_2 1

From cf46255bbe60c36e0885745ef79102d99c382018 Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Fri, 17 Jan 2025 17:45:28 +0100
Subject: [PATCH 06/17] Update licensing information

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 CONFIGURE.md                   | 2 +-
 docs/algorithms/kem/ml_kem.md  | 2 +-
 docs/algorithms/kem/ml_kem.yml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CONFIGURE.md b/CONFIGURE.md
index 67fd7d0aab..02a8da2015 100644
--- a/CONFIGURE.md
+++ b/CONFIGURE.md
@@ -127,7 +127,7 @@ Only has an effect if the system supports `dlopen` and ELF binary format, such a
 
 ### OQS_USE_CUPQC
 
-Can be `ON` or `OFF`.  When `ON`, use NVIDIA's cuPQC library where able (currently just ML-KEM).  When this option is enabled, liboqs may not run correctly on machines that lack supported GPUs. To download cuPQC follow the instructions at (https://developer.nvidia.com/cupqc-download/). Detailed descriptions of the API, requirments, and installation guide are in the cuPQC documentation (https://docs.nvidia.com/cuda/cupqc/index.html). 
+Can be `ON` or `OFF`.  When `ON`, use NVIDIA's cuPQC library where able (currently just ML-KEM).  When this option is enabled, liboqs may not run correctly on machines that lack supported GPUs. To download cuPQC follow the instructions at (https://developer.nvidia.com/cupqc-download/). Detailed descriptions of the API, requirements, and installation guide are in the cuPQC documentation (https://docs.nvidia.com/cuda/cupqc/index.html). While the code shipped by liboqs required to use cuPQC is licensed under Apache 2.0 the cuPQC SDK comes with its own license agreement (https://docs.nvidia.com/cuda/cupqc/license.html). 
 
 **Default**: `OFF`
 
diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md
index 2d65aacff2..b0a7e786b3 100644
--- a/docs/algorithms/kem/ml_kem.md
+++ b/docs/algorithms/kem/ml_kem.md
@@ -12,7 +12,7 @@
 - **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches
   - **cupqc-cuda**:<a name="cupqc-cuda"></a>
       - **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30
-      - **Implementation license (SPDX-Identifier)**: https://docs.nvidia.com/cuda/cupqc/license.html
+      - **Implementation license (SPDX-Identifier)**: Apache-2.0
 
 
 ## Parameter set summary
diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml
index 529e4c870c..d48f056080 100644
--- a/docs/algorithms/kem/ml_kem.yml
+++ b/docs/algorithms/kem/ml_kem.yml
@@ -23,7 +23,7 @@ primary-upstream:
 optimized-upstreams:
   cupqc-cuda:
     source: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30
-    spdx-license-identifier: https://docs.nvidia.com/cuda/cupqc/license.html
+    spdx-license-identifier: Apache-2.0
 parameter-sets:
 - name: ML-KEM-512
   claimed-nist-level: 1

From ae3fd37b65497b763a8b83a11b9d7a6b9e46e103 Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Fri, 17 Jan 2025 20:28:05 +0100
Subject: [PATCH 07/17] Update PLATFORMS.md

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 PLATFORMS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/PLATFORMS.md b/PLATFORMS.md
index f1b3fc5ebd..544edd9319 100644
--- a/PLATFORMS.md
+++ b/PLATFORMS.md
@@ -63,3 +63,4 @@ In this policy, the words "must" and "must not" specify absolute requirements th
 - ppc641e for Ubuntu (Focal)
 - s390x for Ubuntu (Focal)
 - loongarch64 for Debian Linux (trixie)
+- NVIDIA GPU architectures 70, 75, 80, 86, 89, and 90 with a x86_64 CPU for Linux

From cf03b8da4832afdf13838474ce4d9358686c301c Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Mon, 20 Jan 2025 22:50:22 +0100
Subject: [PATCH 08/17] Fix kem_family cmakelists template

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 scripts/copy_from_upstream/src/kem/family/CMakeLists.txt | 2 +-
 src/kem/ml_kem/CMakeLists.txt                            | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
index 004408aa3d..bd648d101d 100644
--- a/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
+++ b/scripts/copy_from_upstream/src/kem/family/CMakeLists.txt
@@ -37,7 +37,7 @@ if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}{%- if 'alias_scheme' in
 
 if(OQS_ENABLE_KEM_{{ family }}_{{ scheme['scheme_c'] }}_{{ impl['name'] }}{%- if 'alias_scheme' in scheme %} OR OQS_ENABLE_KEM_{{ family }}_{{ scheme['alias_scheme'] }}_{{ impl['name'] }}{%- endif %})
     add_library({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} OBJECT {{ impl['upstream']['name'] }}_{{ scheme['pqclean_scheme'] }}_{{ impl['name'] }}/cupqc_ml-kem.cu)
-    target_include_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc)
+    target_link_libraries({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} cupqc)
     set_property(TARGET {{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PROPERTY CUDA_ARCHITECTURES OFF)
     target_compile_options({{ family }}_{{ scheme['scheme'] }}_{{ impl['name'] }} PRIVATE {{ impl['compile_opts'] }})
         {%- else %}
diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt
index 76b53851b1..8af79b6d05 100644
--- a/src/kem/ml_kem/CMakeLists.txt
+++ b/src/kem/ml_kem/CMakeLists.txt
@@ -25,7 +25,7 @@ endif()
 
 if(OQS_ENABLE_KEM_ml_kem_512_cuda)
     add_library(ml_kem_512_cuda OBJECT cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu)
-    target_include_libraries(ml_kem_512_cuda cupqc)
+    target_link_libraries(ml_kem_512_cuda cupqc)
     set_property(TARGET ml_kem_512_cuda PROPERTY CUDA_ARCHITECTURES OFF)
     target_compile_options(ml_kem_512_cuda PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_512_cuda>)
@@ -51,7 +51,7 @@ endif()
 
 if(OQS_ENABLE_KEM_ml_kem_768_cuda)
     add_library(ml_kem_768_cuda OBJECT cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu)
-    target_include_libraries(ml_kem_768_cuda cupqc)
+    target_link_libraries(ml_kem_768_cuda cupqc)
     set_property(TARGET ml_kem_768_cuda PROPERTY CUDA_ARCHITECTURES OFF)
     target_compile_options(ml_kem_768_cuda PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_768_cuda>)
@@ -77,7 +77,7 @@ endif()
 
 if(OQS_ENABLE_KEM_ml_kem_1024_cuda)
     add_library(ml_kem_1024_cuda OBJECT cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu)
-    target_include_libraries(ml_kem_1024_cuda cupqc)
+    target_link_libraries(ml_kem_1024_cuda cupqc)
     set_property(TARGET ml_kem_1024_cuda PROPERTY CUDA_ARCHITECTURES OFF)
     target_compile_options(ml_kem_1024_cuda PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true -dlto -arch=compute_70>)
     set(_ML_KEM_OBJS ${_ML_KEM_OBJS} $<TARGET_OBJECTS:ml_kem_1024_cuda>)

From 5dc3f495e50dad1c3a12e27ed8ddfd6de460e123 Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Tue, 21 Jan 2025 19:14:23 +0100
Subject: [PATCH 09/17] Run copy_from_upsream.py and pull updated upstream

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 docs/algorithms/kem/ml_kem.md                 |  2 +-
 docs/algorithms/kem/ml_kem.yml                |  2 +-
 .../copy_from_upstream/copy_from_upstream.yml |  2 +-
 .../cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu    | 28 +------------------
 .../cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu     | 28 +------------------
 .../cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu     | 28 +------------------
 6 files changed, 6 insertions(+), 84 deletions(-)

diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md
index b0a7e786b3..eeaf299dde 100644
--- a/docs/algorithms/kem/ml_kem.md
+++ b/docs/algorithms/kem/ml_kem.md
@@ -11,7 +11,7 @@
   - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0
 - **Optimized Implementation sources**: https://github.com/pq-crystals/kyber/commit/10b478fc3cc4ff6215eb0b6a11bd758bf0929cbd with copy_from_upstream patches
   - **cupqc-cuda**:<a name="cupqc-cuda"></a>
-      - **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30
+      - **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e
       - **Implementation license (SPDX-Identifier)**: Apache-2.0
 
 
diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml
index d48f056080..498617ff45 100644
--- a/docs/algorithms/kem/ml_kem.yml
+++ b/docs/algorithms/kem/ml_kem.yml
@@ -22,7 +22,7 @@ primary-upstream:
   spdx-license-identifier: CC0-1.0 or Apache-2.0
 optimized-upstreams:
   cupqc-cuda:
-    source: https://github.com/praveksharma/cupqc-mlkem/commit/adb8454e56979628c07b67eb7d90f9337be6dc30
+    source: https://github.com/praveksharma/cupqc-mlkem/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e
     spdx-license-identifier: Apache-2.0
 parameter-sets:
 - name: ML-KEM-512
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index b2f74f566e..23d1f3a22d 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -42,7 +42,7 @@ upstreams:
     name: cupqc
     git_url: https://github.com/praveksharma/cupqc-mlkem.git
     git_branch: main
-    git_commit: adb8454e56979628c07b67eb7d90f9337be6dc30
+    git_commit: b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e
     kem_meta_path: '{pretty_name_full}_META.yml'
     kem_scheme_path: '.'
     patches: []
diff --git a/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu
index 2935c2cd6a..188e2f100d 100644
--- a/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu
+++ b/src/kem/ml_kem/cupqc_ml-kem-1024_cuda/cupqc_ml-kem.cu
@@ -156,35 +156,9 @@ cleanup:
 }
 
 extern "C" {
-    using KEM_512  = decltype(ML_KEM_512()  + Block());
-    using KEM_768  = decltype(ML_KEM_768()  + Block());
     using KEM_1024 = decltype(ML_KEM_1024() + Block());
 
-#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_512>(pk, sk);
-    }
-    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_512>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_512>(ss, ct, sk);
-    }
-#endif
-
-#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_768>(pk, sk);
-    }
-    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_768>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_768>(ss, ct, sk);
-    }
-#endif
-
-#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
+#if defined(OQS_ENABLE_KEM_ml_kem_1024_cuda)
     int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
         return keypair<KEM_1024>(pk, sk);
     }
diff --git a/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu
index 2935c2cd6a..48ca5ca31a 100644
--- a/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu
+++ b/src/kem/ml_kem/cupqc_ml-kem-512_cuda/cupqc_ml-kem.cu
@@ -157,10 +157,8 @@ cleanup:
 
 extern "C" {
     using KEM_512  = decltype(ML_KEM_512()  + Block());
-    using KEM_768  = decltype(ML_KEM_768()  + Block());
-    using KEM_1024 = decltype(ML_KEM_1024() + Block());
 
-#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
+#if defined(OQS_ENABLE_KEM_ml_kem_512_cuda)
     int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
         return keypair<KEM_512>(pk, sk);
     }
@@ -171,28 +169,4 @@ extern "C" {
         return decaps<KEM_512>(ss, ct, sk);
     }
 #endif
-
-#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
-    int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_768>(pk, sk);
-    }
-    int cupqc_ml_kem_768_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_768>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_768_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_768>(ss, ct, sk);
-    }
-#endif
-
-#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_1024>(pk, sk);
-    }
-    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_1024>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_1024>(ss, ct, sk);
-    }
-#endif
 }
diff --git a/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu b/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu
index 2935c2cd6a..594c1f4c24 100644
--- a/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu
+++ b/src/kem/ml_kem/cupqc_ml-kem-768_cuda/cupqc_ml-kem.cu
@@ -156,23 +156,9 @@ cleanup:
 }
 
 extern "C" {
-    using KEM_512  = decltype(ML_KEM_512()  + Block());
     using KEM_768  = decltype(ML_KEM_768()  + Block());
-    using KEM_1024 = decltype(ML_KEM_1024() + Block());
 
-#if defined(OQS_ENABLE_KEM_ml_kem_512_cupqc)
-    int cupqc_ml_kem_512_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_512>(pk, sk);
-    }
-    int cupqc_ml_kem_512_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_512>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_512_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_512>(ss, ct, sk);
-    }
-#endif
-
-#if defined(OQS_ENABLE_KEM_ml_kem_768_cupqc)
+#if defined(OQS_ENABLE_KEM_ml_kem_768_cuda)
     int cupqc_ml_kem_768_keypair(uint8_t *pk, uint8_t *sk) {
         return keypair<KEM_768>(pk, sk);
     }
@@ -183,16 +169,4 @@ extern "C" {
         return decaps<KEM_768>(ss, ct, sk);
     }
 #endif
-
-#if defined(OQS_ENABLE_KEM_ml_kem_1024_cupqc)
-    int cupqc_ml_kem_1024_keypair(uint8_t *pk, uint8_t *sk) {
-        return keypair<KEM_1024>(pk, sk);
-    }
-    int cupqc_ml_kem_1024_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) {
-        return encaps<KEM_1024>(ct, ss, pk);
-    }
-    int cupqc_ml_kem_1024_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) {
-        return decaps<KEM_1024>(ss, ct, sk);
-    }
-#endif
 }

From a3126fc778881c41f416e0b231e2ce829cb51c91 Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Tue, 21 Jan 2025 23:13:52 +0100
Subject: [PATCH 10/17] Add cupqc build test to basic.yml

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 .github/workflows/basic.yml | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml
index 57b6374934..64589aac7d 100644
--- a/.github/workflows/basic.yml
+++ b/.github/workflows/basic.yml
@@ -89,6 +89,36 @@ jobs:
         run: ninja gen_docs
         working-directory: ${{ env.RANDOM_BUILD_DIR }}
 
+  cupqc-buildcheck:
+    name: Check that code build with OQS_USE_CUPQC=ON
+    needs: [buildcheck]
+    runs-on: ubuntu-latest
+    container: openquantumsafe/ci-ubuntu-latest:latest
+    steps:
+      - name: Create random build folder
+        run: tmp_build=$(mktemp -d) && echo "RANDOM_BUILD_DIR=$tmp_build" >> $GITHUB_ENV
+      - name: 
+      - name: Checkout code
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4
+      - name: Configure
+        run: |
+          cuPQC_DIR=$cuPQC_DIR \
+            cmake \
+            -B ${{ env.RANDOM_BUILD_DIR }} \
+            -GNinja \
+            -DOQS_USE_CUPQC=ON \
+            -DOQS_STRICT_WARNINGS=ON \
+            --warn-uninitialized . > config.log 2>&1 && \
+          cat config.log && \
+          cmake -LA -N . && \
+          ! (grep -i "uninitialized variable" config.log)
+      - name: Build code
+        run: ninja
+        working-directory: ${{ env.RANDOM_BUILD_DIR }}
+      - name: Build documentation
+        run: ninja gen_docs
+        working-directory: ${{ env.RANDOM_BUILD_DIR }}
+
   cppcheck:
     name: Check C++ linking with example program
     runs-on: ubuntu-latest

From 0fa3031bb9477613b6f449b480f83cc236e15014 Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Tue, 21 Jan 2025 23:31:01 +0100
Subject: [PATCH 11/17] Move cupqc build test from basic.yml to linux.yml

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 .github/workflows/basic.yml | 30 ------------------------------
 .github/workflows/linux.yml | 27 +++++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml
index 64589aac7d..57b6374934 100644
--- a/.github/workflows/basic.yml
+++ b/.github/workflows/basic.yml
@@ -89,36 +89,6 @@ jobs:
         run: ninja gen_docs
         working-directory: ${{ env.RANDOM_BUILD_DIR }}
 
-  cupqc-buildcheck:
-    name: Check that code build with OQS_USE_CUPQC=ON
-    needs: [buildcheck]
-    runs-on: ubuntu-latest
-    container: openquantumsafe/ci-ubuntu-latest:latest
-    steps:
-      - name: Create random build folder
-        run: tmp_build=$(mktemp -d) && echo "RANDOM_BUILD_DIR=$tmp_build" >> $GITHUB_ENV
-      - name: 
-      - name: Checkout code
-        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4
-      - name: Configure
-        run: |
-          cuPQC_DIR=$cuPQC_DIR \
-            cmake \
-            -B ${{ env.RANDOM_BUILD_DIR }} \
-            -GNinja \
-            -DOQS_USE_CUPQC=ON \
-            -DOQS_STRICT_WARNINGS=ON \
-            --warn-uninitialized . > config.log 2>&1 && \
-          cat config.log && \
-          cmake -LA -N . && \
-          ! (grep -i "uninitialized variable" config.log)
-      - name: Build code
-        run: ninja
-        working-directory: ${{ env.RANDOM_BUILD_DIR }}
-      - name: Build documentation
-        run: ninja gen_docs
-        working-directory: ${{ env.RANDOM_BUILD_DIR }}
-
   cppcheck:
     name: Check C++ linking with example program
     runs-on: ubuntu-latest
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index bb412d17c1..2ac89c43d5 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -188,6 +188,33 @@ jobs:
                                                 --numprocesses=auto \
                                                 --ignore=tests/test_code_conventions.py ${{ matrix.PYTEST_ARGS }}"
 
+  cupqc-buildcheck:
+    name: Check that code builds with OQS_USE_CUPQC=ON
+    needs: [buildcheck]
+    runs-on: ubuntu-latest
+    container: openquantumsafe/ci-ubuntu-latest:latest
+    steps:
+      - name: Create random build folder
+        run: tmp_build=$(mktemp -d) && echo "RANDOM_BUILD_DIR=$tmp_build" >> $GITHUB_ENV
+      - name: 
+      - name: Checkout code
+        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4
+      - name: Configure
+        run: |
+          cuPQC_DIR=$cuPQC_DIR \
+            cmake \
+            -B ${{ env.RANDOM_BUILD_DIR }} \
+            -GNinja \
+            -DOQS_USE_CUPQC=ON \
+            -DOQS_STRICT_WARNINGS=ON \
+            --warn-uninitialized . > config.log 2>&1 && \
+          cat config.log && \
+          cmake -LA -N . && \
+          ! (grep -i "uninitialized variable" config.log)
+      - name: Build code
+        run: ninja
+        working-directory: ${{ env.RANDOM_BUILD_DIR }}
+
   linux_cross_compile:
     runs-on: ubuntu-latest
     container: openquantumsafe/ci-ubuntu-latest:latest

From 512d07115479fbe392c4c31a3ec2a00a8d9d11b1 Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Tue, 21 Jan 2025 23:39:03 +0100
Subject: [PATCH 12/17] Fix error in linux.yml

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 .github/workflows/linux.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 2ac89c43d5..b85d02a055 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -196,7 +196,6 @@ jobs:
     steps:
       - name: Create random build folder
         run: tmp_build=$(mktemp -d) && echo "RANDOM_BUILD_DIR=$tmp_build" >> $GITHUB_ENV
-      - name: 
       - name: Checkout code
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4
       - name: Configure

From 7f2eb3017e2cae2f4473ee18837f4cf75f50e92b Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Tue, 21 Jan 2025 23:41:06 +0100
Subject: [PATCH 13/17] fixup! Fix error in linux.yml

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 .github/workflows/linux.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index b85d02a055..e0a7daccff 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -190,7 +190,6 @@ jobs:
 
   cupqc-buildcheck:
     name: Check that code builds with OQS_USE_CUPQC=ON
-    needs: [buildcheck]
     runs-on: ubuntu-latest
     container: openquantumsafe/ci-ubuntu-latest:latest
     steps:

From 7fbdd134174e311f2bb5eac1605198147f85d719 Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Thu, 23 Jan 2025 19:48:53 +0100
Subject: [PATCH 14/17] Redo cupqc build check

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 .github/workflows/linux.yml | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index e0a7daccff..03d206f0ff 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -193,25 +193,13 @@ jobs:
     runs-on: ubuntu-latest
     container: openquantumsafe/ci-ubuntu-latest:latest
     steps:
-      - name: Create random build folder
-        run: tmp_build=$(mktemp -d) && echo "RANDOM_BUILD_DIR=$tmp_build" >> $GITHUB_ENV
       - name: Checkout code
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4
       - name: Configure
-        run: |
-          cuPQC_DIR=$cuPQC_DIR \
-            cmake \
-            -B ${{ env.RANDOM_BUILD_DIR }} \
-            -GNinja \
-            -DOQS_USE_CUPQC=ON \
-            -DOQS_STRICT_WARNINGS=ON \
-            --warn-uninitialized . > config.log 2>&1 && \
-          cat config.log && \
-          cmake -LA -N . && \
-          ! (grep -i "uninitialized variable" config.log)
+        run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR cmake -GNinja -DOQS_USE_CUPQC=ON .. && cmake -LA -N ..
       - name: Build code
         run: ninja
-        working-directory: ${{ env.RANDOM_BUILD_DIR }}
+        working-directory: build
 
   linux_cross_compile:
     runs-on: ubuntu-latest

From 083b26792c352d9e002a979ad65a310724b88f52 Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Thu, 23 Jan 2025 20:54:17 +0100
Subject: [PATCH 15/17] Supply default CUDA arch to cupqc-buildcheck
 configuration stage

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 .github/workflows/linux.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 03d206f0ff..603ab7466b 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -196,7 +196,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4
       - name: Configure
-        run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR cmake -GNinja -DOQS_USE_CUPQC=ON .. && cmake -LA -N ..
+        run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N ..
       - name: Build code
         run: ninja
         working-directory: build

From 1ee99730a67ad14b235cf334b07b213a3321763b Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Thu, 23 Jan 2025 21:22:42 +0100
Subject: [PATCH 16/17] Specify CUDAXX in cupqc-buildcheck

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 .github/workflows/linux.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 603ab7466b..a12931c1c8 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -196,7 +196,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4
       - name: Configure
-        run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N ..
+        run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR CUDACXX=/usr/local/cuda-12.6/bin/nvcc cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N ..
       - name: Build code
         run: ninja
         working-directory: build

From c94b7c73dea4c8af15bcb4e98f6fa70f5557e574 Mon Sep 17 00:00:00 2001
From: Pravek Sharma <sharmapravek@gmail.com>
Date: Thu, 23 Jan 2025 21:51:04 +0100
Subject: [PATCH 17/17] Make cuPQC_DIR explicit in cupqc-buildcheck

Signed-off-by: Pravek Sharma <sharmapravek@gmail.com>
---
 .github/workflows/linux.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index a12931c1c8..8059ffc993 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -196,7 +196,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4
       - name: Configure
-        run: mkdir build && cd build && cuPQC_DIR=$cuPQC_DIR CUDACXX=/usr/local/cuda-12.6/bin/nvcc cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N ..
+        run: mkdir build && cd build && cuPQC_DIR=/cupqc/cupqc/cupqc-pkg-0.2.0/cmake/ CUDACXX=/usr/local/cuda-12.6/bin/nvcc cmake -GNinja -DOQS_USE_CUPQC=ON -DCMAKE_CUDA_ARCHITECTURES=80 .. && cmake -LA -N ..
       - name: Build code
         run: ninja
         working-directory: build