ORCA asynchronous sampling #2205

Merged
merged 30 commits into main from async_sampling on Oct 8, 2024

Commits (30)
738dc4b
Refactor orca qpu
Omar-ORCA Aug 22, 2024
ee1518d
Refactor
Omar-ORCA Aug 27, 2024
895cba1
Refactor
Omar-ORCA Aug 28, 2024
ae09f9d
Merge branch 'NVIDIA:main' into async_sampling
Omar-ORCA Aug 28, 2024
e6cf082
Python bindings
Omar-ORCA Aug 28, 2024
bb815a2
Add bearer token
Omar-ORCA Sep 11, 2024
8c7c90d
Merge branch 'main' into async_sampling
Omar-ORCA Sep 11, 2024
a6b149c
Merge branch 'NVIDIA:main' into async_sampling
Omar-ORCA Sep 13, 2024
f340222
Wait for samples
Omar-ORCA Sep 13, 2024
e83928b
* Code formatting
khalatepradnya Sep 16, 2024
721162b
* Fix documentation errors
khalatepradnya Sep 18, 2024
9377997
* Addressing review comments
khalatepradnya Sep 19, 2024
66f409f
Merge branch 'main' into async_sampling
Omar-ORCA Sep 20, 2024
4c364c0
Merge branch 'NVIDIA:main' into async_sampling
Omar-ORCA Sep 24, 2024
ef06e36
Add `qpu_id` parameter to sample and sample_async
Omar-ORCA Sep 27, 2024
5ad8013
Merge branch 'main' into async_sampling
Omar-ORCA Sep 27, 2024
669754a
Throw exception instead of returning a dummy payload
Omar-ORCA Sep 27, 2024
339d7d8
Typo in Executor.cpp
Omar-ORCA Sep 27, 2024
58b209b
Merge branch 'main' into async_sampling
khalatepradnya Sep 30, 2024
e336abd
* C++ source code formatting
khalatepradnya Sep 30, 2024
96679b4
* Doxygen fixes
khalatepradnya Sep 30, 2024
efca875
* Missing doxygen for the newly added asynchronous sampling.
khalatepradnya Sep 30, 2024
06b888c
Merge branch 'main' into async_sampling
khalatepradnya Oct 1, 2024
4e199f9
* Doxygen fix for the async sample result
khalatepradnya Oct 1, 2024
a385d6c
Merge branch 'main' into async_sampling
khalatepradnya Oct 3, 2024
9b979f2
* Addressing review comment
khalatepradnya Oct 4, 2024
229a8f9
* Spell check
khalatepradnya Oct 4, 2024
8d58d11
Merge branch 'main' into async_sampling
khalatepradnya Oct 7, 2024
b8b2de1
* Restoring file
khalatepradnya Oct 7, 2024
9526c3e
Merge branch 'main' into async_sampling
khalatepradnya Oct 7, 2024
12 changes: 9 additions & 3 deletions docs/sphinx/api/languages/cpp_api.rst
@@ -44,6 +44,8 @@ Common
.. doxygenclass:: cudaq::async_result
:members:

.. doxygentypedef:: async_sample_result


.. doxygenstruct:: cudaq::ExecutionResult
:members:
@@ -168,7 +170,9 @@ Platform

.. doxygenclass:: cudaq::BaseRemoteSimulatorQPU

.. doxygenclass:: cudaq::BaseNvcfSimulatorQPU
.. doxygenclass:: cudaq::BaseNvcfSimulatorQPU

.. doxygenclass:: cudaq::OrcaRemoteRESTQPU

.. doxygenclass:: cudaq::quantum_platform
:members:
@@ -231,5 +235,7 @@ Namespaces
.. doxygennamespace:: cudaq::orca
:desc-only:

.. doxygenfunction:: cudaq::orca::sample(std::vector<std::size_t> &input_state, std::vector<std::size_t> &loop_lengths, std::vector<double> &bs_angles, int n_samples = 10000)
.. doxygenfunction:: cudaq::orca::sample(std::vector<std::size_t> &input_state, std::vector<std::size_t> &loop_lengths, std::vector<double> &bs_angles, std::vector<double> &ps_angles, int n_samples = 10000)
.. doxygenfunction:: cudaq::orca::sample(std::vector<std::size_t> &input_state, std::vector<std::size_t> &loop_lengths, std::vector<double> &bs_angles, int n_samples = 10000, std::size_t qpu_id = 0)
.. doxygenfunction:: cudaq::orca::sample(std::vector<std::size_t> &input_state, std::vector<std::size_t> &loop_lengths, std::vector<double> &bs_angles, std::vector<double> &ps_angles, int n_samples = 10000, std::size_t qpu_id = 0)
.. doxygenfunction:: cudaq::orca::sample_async(std::vector<std::size_t> &input_state, std::vector<std::size_t> &loop_lengths, std::vector<double> &bs_angles, int n_samples = 10000, std::size_t qpu_id = 0)
.. doxygenfunction:: cudaq::orca::sample_async(std::vector<std::size_t> &input_state, std::vector<std::size_t> &loop_lengths, std::vector<double> &bs_angles, std::vector<double> &ps_angles, int n_samples = 10000, std::size_t qpu_id = 0)
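For orientation, a minimal C++ sketch of how the newly documented sample_async overload might be called. The vector contents below are illustrative placeholders, not values from this PR, and the required number of beam splitter angles depends on the target interferometer configuration.

#include "cudaq/orca.h"
#include "cudaq.h"
#include <vector>

int main() {
  // Illustrative TBI parameters only; size them for your interferometer.
  std::vector<std::size_t> input_state{1, 0, 1, 0};
  std::vector<std::size_t> loop_lengths{1};
  std::vector<double> bs_angles{0.7854, 0.5236, 0.3927};
  int n_samples{10000};

  // Submit asynchronously to QPU 0, then block only when the result is needed.
  auto async_results = cudaq::orca::sample_async(input_state, loop_lengths,
                                                 bs_angles, n_samples,
                                                 /*qpu_id=*/0);
  auto counts = async_results.get();
  counts.dump();
  return 0;
}
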
39 changes: 35 additions & 4 deletions docs/sphinx/examples/cpp/providers/orca.cpp
@@ -8,6 +8,12 @@
#include "cudaq/orca.h"
#include "cudaq.h"

#include <fstream>
#include <iostream>

#include <chrono>
#include <thread>

// define helper function to generate linear spaced vectors
template <typename T>
void linear_spaced_vector(std::vector<T> &xs, T min, T max, std::size_t N) {
@@ -20,6 +26,8 @@ void linear_spaced_vector(std::vector<T> &xs, T min, T max, std::size_t N) {
}

int main() {
using namespace std::this_thread; // sleep_for, sleep_until
using namespace std::chrono_literals; // `ns`, `us`, `ms`, `s`, `h`, etc.

// A time-bin boson sampling experiment: An input state of 4 indistinguishable
// photons mixed with 4 vacuum states across 8 time bins (modes) enter the
@@ -60,11 +68,15 @@ int main() {
// we can also set number of requested samples
int n_samples{10000};

// Submit to ORCA synchronously (e.g., wait for the job result to be returned
// before proceeding with the rest of the execution).
// Submit to ORCA synchronously (e.g., wait for the job result to be
// returned before proceeding with the rest of the execution).
std::cout << "Submitting to ORCA Server synchronously" << std::endl;
auto counts =
cudaq::orca::sample(input_state, loop_lengths, bs_angles, n_samples);

// Print the results
counts.dump();

// If the system includes phase shifters, the phase shifter angles can be
// included in the call

@@ -73,8 +85,27 @@
// ps_angles, n_samples);
// ```

// Print the results
counts.dump();
// Alternatively we can submit to ORCA asynchronously (e.g., continue
// executing code in the file until the job has been returned).
std::cout << "Submitting to ORCA Server asynchronously" << std::endl;
auto async_results = cudaq::orca::sample_async(input_state, loop_lengths,
bs_angles, n_samples);

// Can write the future to file:
{
std::ofstream out("saveMe.json");
out << async_results;
}

// Then come back and read it in later.
cudaq::async_result<cudaq::sample_result> readIn;
std::ifstream in("saveMe.json");
in >> readIn;

sleep_for(200ms); // wait for the job to be processed
// Get the results of the read in future.
auto async_counts = readIn.get();
async_counts.dump();

return 0;
}
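As a side note on the example above: persisting the future to "saveMe.json" is optional. A minimal alternative, using only calls already shown in this example, is to block on the returned handle directly; get() waits until the ORCA job completes, so no sleep or file round-trip is needed.

  // Minimal sketch: block on the asynchronous handle directly instead of
  // serializing it to disk (same variables as in the example above).
  auto async_results = cudaq::orca::sample_async(input_state, loop_lengths,
                                                 bs_angles, n_samples);
  auto counts_direct = async_results.get(); // blocks until the job finishes
  counts_direct.dump();
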
32 changes: 32 additions & 0 deletions docs/sphinx/examples/python/providers/orca.py
@@ -1,4 +1,5 @@
import cudaq
import time

import numpy as np
import os
@@ -45,9 +46,11 @@
# we can also set number of requested samples
n_samples = 10000

# Option A:
# By using the synchronous `cudaq.orca.sample`, the execution of
# any remaining classical code in the file will occur only
# after the job has been returned from ORCA Server.
print("Submitting to ORCA Server synchronously")
counts = cudaq.orca.sample(input_state, loop_lengths, bs_angles, n_samples)

# If the system includes phase shifters, the phase shifter angles can be
@@ -59,3 +62,32 @@

# Print the results
print(counts)

# Option B:
# By using the asynchronous `cudaq.orca.sample_async`, the remaining
# classical code will be executed while the job is being handled
# by Orca. This is ideal when submitting via a queue over
# the cloud.
print("Submitting to ORCA Server asynchronously")
async_results = cudaq.orca.sample_async(input_state, loop_lengths, bs_angles,
n_samples)
# ... more classical code to run ...

# We can either retrieve the results later in the program with
# ```
# async_counts = async_results.get()
# ```
# or we can also write the job reference (`async_results`) to
# a file and load it later or from a different process.
file = open("future.txt", "w")
file.write(str(async_results))
file.close()

# We can later read the file content and retrieve the job
# information and results.
time.sleep(0.2) # wait for the job to be processed
same_file = open("future.txt", "r")
retrieved_async_results = cudaq.AsyncSampleResult(str(same_file.read()))

counts = retrieved_async_results.get()
print(counts)
10 changes: 10 additions & 0 deletions docs/sphinx/using/backends/hardware.rst
@@ -312,6 +312,16 @@ configuration.

export ORCA_ACCESS_URL="https://<ORCA API Server>"


Sometimes the requests to the PT-1 require an authentication token. This token can be set as an
environment variable named ``ORCA_AUTH_TOKEN``. For example, if the token is :code:`AbCdEf123456`,
you can set the environment variable as follows:

.. code:: bash

export ORCA_AUTH_TOKEN="AbCdEf123456"


Submission from C++
`````````````````````````

5 changes: 4 additions & 1 deletion python/extension/CMakeLists.txt
@@ -70,10 +70,13 @@ declare_mlir_python_extension(CUDAQuantumPythonSources.Extension
../runtime/utils/PyRemoteSimulatorQPU.cpp
../runtime/utils/PyRestRemoteClient.cpp
../utils/LinkedLibraryHolder.cpp
../../runtime/common/ArgumentConversion.cpp
../../runtime/cudaq/platform/common/QuantumExecutionQueue.cpp
../../runtime/cudaq/platform/default/rest_server/RemoteRuntimeClient.cpp
../../runtime/cudaq/platform/orca/OrcaExecutor.cpp
../../runtime/cudaq/platform/orca/OrcaQPU.cpp
../../runtime/common/ArgumentConversion.cpp
../../runtime/cudaq/platform/orca/OrcaRemoteRESTQPU.cpp
../../runtime/cudaq/platform/orca/OrcaServerHelper.cpp

EMBED_CAPI_LINK_LIBS
CUDAQuantumMLIRCAPI
32 changes: 26 additions & 6 deletions python/extension/CUDAQuantumExtension.cpp
@@ -166,20 +166,41 @@ PYBIND11_MODULE(_quakeDialects, m) {
orcaSubmodule.def(
"sample",
py::overload_cast<std::vector<std::size_t> &, std::vector<std::size_t> &,
std::vector<double> &, std::vector<double> &, int>(
&cudaq::orca::sample),
std::vector<double> &, std::vector<double> &, int,
std::size_t>(&cudaq::orca::sample),
"Performs Time Bin Interferometer (TBI) boson sampling experiments on "
"ORCA's backends",
py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"),
py::arg("ps_angles") = nullptr, py::arg("n_samples") = 10000);
py::arg("ps_angles"), py::arg("n_samples") = 10000,
py::arg("qpu_id") = 0);
orcaSubmodule.def(
"sample",
py::overload_cast<std::vector<std::size_t> &, std::vector<std::size_t> &,
std::vector<double> &, int>(&cudaq::orca::sample),
std::vector<double> &, int, std::size_t>(
&cudaq::orca::sample),
"Performs Time Bin Interferometer (TBI) boson sampling experiments on "
"ORCA's backends",
py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"),
py::arg("n_samples") = 10000, py::arg("qpu_id") = 0);
orcaSubmodule.def(
"sample_async",
py::overload_cast<std::vector<std::size_t> &, std::vector<std::size_t> &,
std::vector<double> &, std::vector<double> &, int,
std::size_t>(&cudaq::orca::sample_async),
"Performs Time Bin Interferometer (TBI) boson sampling experiments on "
"ORCA's backends",
py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"),
py::arg("n_samples") = 10000);
py::arg("ps_angles"), py::arg("n_samples") = 10000,
py::arg("qpu_id") = 0);
orcaSubmodule.def(
"sample_async",
py::overload_cast<std::vector<std::size_t> &, std::vector<std::size_t> &,
std::vector<double> &, int, std::size_t>(
&cudaq::orca::sample_async),
"Performs Time Bin Interferometer (TBI) boson sampling experiments on "
"ORCA's backends",
py::arg("input_state"), py::arg("loop_lengths"), py::arg("bs_angles"),
py::arg("n_samples") = 10000, py::arg("qpu_id") = 0);

auto photonicsSubmodule = cudaqRuntime.def_submodule("photonics");
photonicsSubmodule.def(
@@ -217,7 +238,6 @@ PYBIND11_MODULE(_quakeDialects, m) {
cudaq::getExecutionManager()->returnQudit(cudaq::QuditInfo(level, id));
},
"Release a qudit of given id.", py::arg("level"), py::arg("id"));

cudaqRuntime.def("cloneModule",
[](MlirModule mod) { return wrap(unwrap(mod).clone()); });
cudaqRuntime.def("isTerminator", [](MlirOperation op) {
15 changes: 8 additions & 7 deletions runtime/cudaq/platform/orca/CMakeLists.txt
@@ -8,8 +8,14 @@

set(LIBRARY_NAME cudaq-orca-qpu)
message(STATUS "Building ORCA REST QPU.")
set(ORCA_SRC
OrcaExecutor.cpp
OrcaQPU.cpp
OrcaRemoteRESTQPU.cpp
OrcaServerHelper.cpp
)

add_library(${LIBRARY_NAME} SHARED OrcaQPU.cpp)
add_library(${LIBRARY_NAME} SHARED ${ORCA_SRC})

target_include_directories(${LIBRARY_NAME} PRIVATE .
PUBLIC
@@ -30,9 +36,4 @@ target_link_libraries(${LIBRARY_NAME}
install(TARGETS ${LIBRARY_NAME} DESTINATION lib)
install(TARGETS ${LIBRARY_NAME} EXPORT cudaq-orca-qpu-targets DESTINATION lib)

# install(EXPORT cudaq-orca-qpu-targets
# FILE CUDAQQPUOrcaTargets.cmake
# NAMESPACE cudaq::orca::
# DESTINATION lib/cmake/cudaq)

add_target_config(orca)
add_target_config(orca)
47 changes: 47 additions & 0 deletions runtime/cudaq/platform/orca/OrcaExecutor.cpp
@@ -0,0 +1,47 @@
/*******************************************************************************
* Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/

#include "OrcaExecutor.h"
#include "OrcaServerHelper.h"
#include "common/Logger.h"

namespace cudaq {

details::future OrcaExecutor::execute(cudaq::orca::TBIParameters params,
const std::string &kernelName) {
auto orcaServerHelper = dynamic_cast<OrcaServerHelper *>(serverHelper);
assert(orcaServerHelper);
orcaServerHelper->setShots(shots);
cudaq::info("Executor creating job to execute with the {} helper.",
orcaServerHelper->name());
// Create the Job Payload, composed of job post path, headers,
// and the job json messages themselves
auto [jobPostPath, headers, jobs] = orcaServerHelper->createJob(params);
auto job = jobs[0];
auto config = orcaServerHelper->getConfig();
std::vector<cudaq::details::future::Job> ids;
cudaq::info("Job created, posting to {}", jobPostPath);
// Post it, get the response
auto response = client.post(jobPostPath, "", job, headers);
cudaq::info("Job posted, response was {}", response.dump());
// Add the job id and the job name.
auto job_id = orcaServerHelper->extractJobId(response);
if (job_id.empty()) {
nlohmann::json tmp(job.at("job_id"));
orcaServerHelper->constructGetJobPath(tmp[0]);
job_id = tmp[0].at("job_id");
}
ids.emplace_back(job_id, kernelName);
config["output_names." + job_id] = kernelName;

config.insert({"shots", std::to_string(shots)});
std::string name = orcaServerHelper->name();
return cudaq::details::future(ids, name, config);
}

} // namespace cudaq
26 changes: 26 additions & 0 deletions runtime/cudaq/platform/orca/OrcaExecutor.h
@@ -0,0 +1,26 @@
/*******************************************************************************
* Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. *
* All rights reserved. *
* *
* This source code and the accompanying materials are made available under *
* the terms of the Apache License 2.0 which accompanies this distribution. *
******************************************************************************/

#pragma once

#include "common/Executor.h"
#include "orca_qpu.h"

namespace cudaq {

/// @brief The Executor subclass for ORCA target which has a distinct sampling
/// API.
class OrcaExecutor : public Executor {
public:
/// @brief Execute the provided ORCA quantum parameters and return a future
/// object. The caller can make this synchronous by just immediately calling
/// .get().
details::future execute(cudaq::orca::TBIParameters params,
const std::string &kernelName);
};
} // namespace cudaq