Support for adaptive quantum kernels on (certain) quantum hardware backends #741

Merged: 21 commits, Oct 16, 2023

Commits
58da0ff  Allow OQC email and password to be undefined for local emulation (bmhowe23, Oct 3, 2023)
66ca0a2  Actual changes to support Adaptive Profile (bmhowe23, Oct 4, 2023)
e3ac941  Add named blocks to work around PyQIR limitation (bmhowe23, Oct 4, 2023)
97dfb88  Properly set qubitMeasurementFeedback for array-saved measurements (bmhowe23, Oct 4, 2023)
780a86d  Add new adaptive profile tests and update old ones (bmhowe23, Oct 1, 2023)
a17e453  Create __quantum__qis__read_result__body stub in NVQIR (bmhowe23, Oct 4, 2023)
19ac07c  Add qir_test_cond_for_loop-6.cpp (XFAIL), which shows a compiler issue (bmhowe23, Oct 5, 2023)
5b804e9  Merge branch 'main' into pr-adaptive-profile (bmhowe23, Oct 5, 2023)
5df7f24  Merge branch 'main' into pr-adaptive-profile (bmhowe23, Oct 6, 2023)
f15c5c7  Merge branch 'main' into pr-adaptive-profile (bmhowe23, Oct 6, 2023)
f1e6886  Comments and function rename (bmhowe23, Oct 6, 2023)
41f712d  Merge branch 'main' into pr-adaptive-profile (bmhowe23, Oct 10, 2023)
1f08eee  Update PyQIR comment (bmhowe23, Oct 10, 2023)
d90e40e  Use FileCheck instead of asserts in new tests (bmhowe23, Oct 10, 2023)
d57dbe2  Add reference to new issue for __quantum__qis__read_result__body (bmhowe23, Oct 10, 2023)
8d695ec  Merge branch 'main' into pr-adaptive-profile (bmhowe23, Oct 11, 2023)
5c5104d  Merge branch 'main' into pr-adaptive-profile (bmhowe23, Oct 11, 2023)
3c3d32f  Merge branch 'main' into pr-adaptive-profile (schweitzpgi, Oct 12, 2023)
c6a947a  Update tests for latest merge (bmhowe23, Oct 12, 2023)
a362068  More test updates (bmhowe23, Oct 12, 2023)
9f94f1b  Merge branch 'main' into pr-adaptive-profile (bmhowe23, Oct 16, 2023)
Files changed
11 changes: 11 additions & 0 deletions include/cudaq/Optimizer/CodeGen/Peephole.h
@@ -55,6 +55,17 @@ inline mlir::Value createMeasureCall(mlir::PatternRewriter &builder,
return {};
}

inline mlir::Value createReadResultCall(mlir::PatternRewriter &builder,
mlir::Location loc,
mlir::OpResult result) {
auto i1Ty = mlir::IntegerType::get(builder.getContext(), 1);
return builder
.create<mlir::LLVM::CallOp>(loc, mlir::TypeRange{i1Ty},
cudaq::opt::QIRReadResultBody,
mlir::ArrayRef<mlir::Value>{result})
.getResult();
}

namespace {
#include "cudaq/Optimizer/CodeGen/Peephole.inc"
}
26 changes: 26 additions & 0 deletions include/cudaq/Optimizer/CodeGen/Peephole.td
@@ -148,4 +148,30 @@ def MeasureToRegisterCallConv : Pat<
(CreateMeasureCall $call, $args),
[(IsaMeasureToRegisterCall:$callee), (IsaIntToPtrOperand $args)]>;

//===----------------------------------------------------------------------===//

def HasI1PtrType : Constraint<CPred<
"$_self.getType() == cudaq::opt::factory::getPointerType("
" mlir::IntegerType::get($_self.getContext(), 1))">>;

def HasResultType : Constraint<CPred<
"$_self.getType() == cudaq::opt::getResultType($_self.getContext())">>;

def IsaIntAttr : Constraint<CPred<"$_self.isa<mlir::IntegerAttr>()">>;

def CreateReadResultCall : NativeCodeCall<
"createReadResultCall($_builder, $_loc, $0)">;

// %1 = llvm.constant 1
// %2 = llvm.inttoptr %1 : i64 -> Result*
// %3 = llvm.bitcast %2 : Result* -> i1*
// %4 = llvm.load %3
// ─────────────────────────────────────
// %4 = call @read_result %2
def LoadMeasureResult : Pat<
(LLVM_LoadOp:$load (LLVM_BitcastOp:$bitcast (LLVM_IntToPtrOp:$cast
(LLVM_ConstantOp $attr))), $_, $_, $_, $_, $_, $_),
(CreateReadResultCall $cast),
[(HasI1PtrType:$bitcast), (HasResultType:$cast), (IsaIntAttr:$attr)]>;

#endif
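For readers less familiar with MLIR's DRR syntax, a hand-written rewrite pattern that does roughly what LoadMeasureResult and the createReadResultCall helper accomplish together might look like the sketch below. It is illustrative only and is not part of this PR: the type constraints (HasI1PtrType, HasResultType, IsaIntAttr) are only hinted at in a comment, the accessor names assume the MLIR LLVM dialect of this PR's timeframe, and the helper from Peephole.h is assumed to be in scope.

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/PatternMatch.h"

// Sketch only: replace `load (bitcast (inttoptr (constant)))` with a call
// to __quantum__qis__read_result__body on the Result* value.
struct LoadMeasureResultSketch
    : public mlir::OpRewritePattern<mlir::LLVM::LoadOp> {
  using OpRewritePattern::OpRewritePattern;

  mlir::LogicalResult
  matchAndRewrite(mlir::LLVM::LoadOp load,
                  mlir::PatternRewriter &rewriter) const override {
    auto bitcast = load.getAddr().getDefiningOp<mlir::LLVM::BitcastOp>();
    if (!bitcast)
      return mlir::failure();
    auto toPtr = bitcast.getArg().getDefiningOp<mlir::LLVM::IntToPtrOp>();
    if (!toPtr || !toPtr.getArg().getDefiningOp<mlir::LLVM::ConstantOp>())
      return mlir::failure();
    // The real pattern additionally checks that the bitcast yields i1* and
    // the inttoptr yields Result* before rewriting.
    rewriter.replaceOp(load, createReadResultCall(rewriter, load.getLoc(),
                                                  toPtr->getResult(0)));
    return mlir::success();
  }
};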
2 changes: 2 additions & 0 deletions include/cudaq/Optimizer/CodeGen/QIRFunctionNames.h
@@ -22,6 +22,8 @@ constexpr static const char QIRMeasureToRegister[] =

constexpr static const char QIRCnot[] = "__quantum__qis__cnot";
constexpr static const char QIRCphase[] = "__quantum__qis__cphase";
constexpr static const char QIRReadResultBody[] =
"__quantum__qis__read_result__body";

constexpr static const char NVQIRInvokeWithControlBits[] =
"invokeWithControlQubits";
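The new constant names the QIR function __quantum__qis__read_result__body, which takes a Result* and returns an i1 (matching the signature registered in QIRProfilePreparationPass below, and the NVQIR stub added by commit a17e453). A minimal sketch of such a stub, with an assumed Result alias and a placeholder body, not the PR's actual implementation:

#include <cstdint>

// Assumption: the Result handle is treated here as an opaque 64-bit value;
// NVQIR defines its own representation.
using Result = std::uint64_t;

extern "C" bool __quantum__qis__read_result__body(Result *result) {
  // Placeholder body: a real implementation returns the measurement value
  // most recently recorded for this result handle.
  (void)result;
  return false;
}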
48 changes: 23 additions & 25 deletions lib/Optimizer/CodeGen/LowerToQIRProfile.cpp
@@ -65,12 +65,11 @@ namespace {
struct FunctionAnalysisData {
std::size_t nQubits = 0;
std::size_t nResults = 0;
// Use std::map to keep these sorted in ascending order.
// map[qb] --> [result,regName]
std::map<std::size_t, std::pair<std::size_t, StringAttr>> resultPtrValues;
// Additionally store by result to prevent collisions on a single qubit having
// Store by result to prevent collisions on a single qubit having
// multiple measurements (Adaptive Profile)
// map[result] --> [qb,regName]
// Use std::map to keep these sorted in ascending order. While this isn't
// required, it makes viewing the QIR easier.
std::map<std::size_t, std::pair<std::size_t, std::string>> resultQubitVals;
DenseMap<Operation *, std::size_t> allocationOffsets;
};
@@ -164,26 +163,18 @@ struct FunctionProfileAnalysis {
}
if (optQb) {
auto qb = *optQb;
auto iter = data.resultPtrValues.find(qb);
auto *ctx = callOp.getContext();
auto intTy = IntegerType::get(ctx, 64);
if (iter == data.resultPtrValues.end()) {
auto resIdx = IntegerAttr::get(intTy, data.nResults);
callOp->setAttr(resultIndexName, resIdx);
auto regName = [&]() -> StringAttr {
if (auto nameAttr = callOp->getAttr("registerName")
.dyn_cast_or_null<StringAttr>())
return nameAttr;
return {};
}();
data.resultQubitVals.insert(std::make_pair(
data.nResults, std::make_pair(qb, regName.data())));
data.resultPtrValues.insert(
std::make_pair(qb, std::make_pair(data.nResults++, regName)));
} else {
auto resIdx = IntegerAttr::get(intTy, iter->second.first);
callOp->setAttr(resultIndexName, resIdx);
}
auto resIdx = IntegerAttr::get(intTy, data.nResults);
callOp->setAttr(resultIndexName, resIdx);
auto regName = [&]() -> StringAttr {
if (auto nameAttr = callOp->getAttr("registerName")
.dyn_cast_or_null<StringAttr>())
return nameAttr;
return {};
}();
data.resultQubitVals.insert(std::make_pair(
data.nResults++, std::make_pair(qb, regName.data())));
} else {
callOp.emitError("could not trace offset value");
}
@@ -237,13 +228,13 @@ struct AddFuncAttribute : public OpRewritePattern<LLVM::LLVMFuncOp> {
auto resultTy = cudaq::opt::getResultType(rewriter.getContext());
auto i64Ty = rewriter.getI64Type();
auto module = op->getParentOfType<ModuleOp>();
for (auto &iv : info.resultPtrValues) {
for (auto &iv : info.resultQubitVals) {
auto &rec = iv.second;
Value idx = builder.create<LLVM::ConstantOp>(loc, i64Ty, rec.first);
Value idx = builder.create<LLVM::ConstantOp>(loc, i64Ty, iv.first);
Value ptr = builder.create<LLVM::IntToPtrOp>(loc, resultTy, idx);
auto regName = [&]() -> Value {
auto charPtrTy = cudaq::opt::getCharPointerType(builder.getContext());
if (rec.second) {
if (!rec.second.empty()) {
// Note: it should be the case that this string literal has already
// been added to the IR, so this step does not actually update the
// module.
@@ -422,6 +413,8 @@ struct QIRToQIRProfileQIRPass
CalleeConv, EraseArrayAlloc, EraseArrayRelease,
EraseDeadArrayGEP, MeasureCallConv,
MeasureToRegisterCallConv, XCtrlOneTargetToCNot>(context);
if (convertTo.getValue() == "qir-adaptive")
patterns.insert<LoadMeasureResult>(context);
if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
signalPassFailure();
LLVM_DEBUG(llvm::dbgs() << "After QIR profile:\n" << *op << '\n');
@@ -472,6 +465,10 @@ struct QIRProfilePreparationPass
{cudaq::opt::getQubitType(ctx), cudaq::opt::getResultType(ctx)},
module);

cudaq::opt::factory::createLLVMFunctionSymbol(
cudaq::opt::QIRReadResultBody, IntegerType::get(ctx, 1),
{cudaq::opt::getResultType(ctx)}, module);

// Add record functions for any
// measurements.
cudaq::opt::factory::createLLVMFunctionSymbol(
@@ -585,6 +582,7 @@ cudaq::opt::verifyQIRProfilePass(llvm::StringRef convertTo) {

void cudaq::opt::addQIRProfilePipeline(OpPassManager &pm,
llvm::StringRef convertTo) {
assert(convertTo == "qir-adaptive" || convertTo == "qir-base");
pm.addPass(createQIRProfilePreparationPass());
pm.addNestedPass<LLVM::LLVMFuncOp>(createConvertToQIRFuncPass(convertTo));
pm.addPass(createQIRToQIRProfilePass(convertTo));
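The net effect of the bookkeeping change above is that results, rather than qubits, become the primary key, so two measurements of the same qubit get two distinct result indices, as the adaptive profile requires. A small self-contained sketch of that behavior, mirroring the shape of resultQubitVals with illustrative names only:

#include <cstddef>
#include <map>
#include <string>
#include <utility>

int main() {
  // map[result] --> [qubit, register name], mirroring resultQubitVals.
  std::map<std::size_t, std::pair<std::size_t, std::string>> resultQubitVals;
  std::size_t nResults = 0;
  auto recordMeasurement = [&](std::size_t qubit, const std::string &regName) {
    resultQubitVals.insert({nResults++, {qubit, regName}});
  };
  recordMeasurement(0, "a"); // first measurement of qubit 0
  recordMeasurement(0, "b"); // second measurement of qubit 0: new result index
  return resultQubitVals.size() == 2 ? 0 : 1; // both measurements retained
}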
31 changes: 31 additions & 0 deletions lib/Optimizer/Transforms/QuakeAddMetadata.cpp
@@ -98,6 +98,8 @@ struct QuakeFunctionAnalysis {
auto allocValue = storeOp.getOperand(1);
if (auto cp = allocValue.getDefiningOp<cudaq::cc::ComputePtrOp>())
allocValue = cp.getBase();
if (auto castOp = allocValue.getDefiningOp<cudaq::cc::CastOp>())
allocValue = castOp.getOperand();

if (auto allocaOp = allocValue.getDefiningOp<cudaq::cc::AllocaOp>()) {
// Get the alloca users
@@ -122,6 +124,35 @@ struct QuakeFunctionAnalysis {
return WalkResult::interrupt();
}
}

// Look for any downstream cast/compute_ptr/load; if the loaded value
// feeds a conditional, flag the kernel accordingly
if (auto cast = dyn_cast<cudaq::cc::CastOp>(allocUser)) {
for (auto castUser : cast->getUsers()) {
if (auto cp = dyn_cast<cudaq::cc::ComputePtrOp>(castUser)) {
for (auto cpUser : cp->getUsers()) {
if (auto load = dyn_cast<cudaq::cc::LoadOp>(cpUser)) {
auto loadUser = *load->getUsers().begin();

// Loaded Val could be used directly or by an Arith
// boolean operation
while (loadUser->getDialect()->getNamespace() ==
"arith") {
auto res = loadUser->getResult(0);
loadUser = *res.getUsers().begin();
}

// At this point we should be able to check if we are
// being used by a conditional
if (isa<cudaq::cc::IfOp, cf::CondBranchOp>(loadUser)) {
data.hasConditionalsOnMeasure = true;
return WalkResult::interrupt();
}
}
}
}
}
}
}
}
}
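The analysis above walks from a measurement's storage slot through casts, compute_ptr ops, and loads to decide whether the loaded bit feeds a conditional, and sets hasConditionalsOnMeasure accordingly. A hedged example of the kind of CUDA Quantum kernel this is meant to flag, written against the C++ kernel API of this PR's timeframe; treat it as a sketch rather than one of the PR's test cases:

#include <cudaq.h>

struct kernel_with_feedback {
  void operator()() __qpu__ {
    cudaq::qubit q, r;
    h(q);
    auto bit = mz(q); // measurement stored to a classical variable
    if (bit)          // conditional on the measurement result
      x(r);
    mz(r);
  }
};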
68 changes: 60 additions & 8 deletions runtime/common/RuntimeMLIR.cpp
@@ -142,7 +142,8 @@ void applyWriteOnlyAttributes(llvm::Module *llvmModule) {
// overlap.
// Reference:
// https://github.com/qir-alliance/qir-spec/blob/main/specification/under_development/profiles/Base_Profile.md?plain=1#L237
mlir::LogicalResult verifyMeasurementOrdering(llvm::Module *llvmModule) {
mlir::LogicalResult
verifyBaseProfileMeasurementOrdering(llvm::Module *llvmModule) {
bool irreversibleSeenYet = false;
for (llvm::Function &func : *llvmModule)
for (llvm::BasicBlock &block : func)
@@ -321,14 +322,32 @@ mlir::LogicalResult verifyQubitAndResultRanges(llvm::Module *llvmModule) {
}

// Verify that only the allowed LLVM instructions are present
mlir::LogicalResult verifyLLVMInstructions(llvm::Module *llvmModule) {
mlir::LogicalResult verifyLLVMInstructions(llvm::Module *llvmModule,
bool isBaseProfile) {
bool isAdaptiveProfile = !isBaseProfile;
for (llvm::Function &func : *llvmModule)
for (llvm::BasicBlock &block : func)
for (llvm::Instruction &inst : block) {
// Only call, br, and ret instructions are allowed at the top level.
if (!llvm::isa<llvm::CallBase>(inst) &&
!llvm::isa<llvm::BranchInst>(inst) &&
!llvm::isa<llvm::ReturnInst>(inst)) {
// Only specific instructions are allowed at the top level, depending on
// the specific profile
bool isValidBaseProfileInstruction =
llvm::isa<llvm::CallBase>(inst) ||
llvm::isa<llvm::BranchInst>(inst) ||
llvm::isa<llvm::ReturnInst>(inst);
// Note: there is an outstanding question about the adaptive profile
// with respect to `switch` and `select` instructions. They are
// currently described as "optional" in the spec, but there is no way to
// specify their presence via module flags. So to be cautious, for now
// we will assume they are not allowed in cuda-quantum programs.
bool isValidAdaptiveProfileInstruction = isValidBaseProfileInstruction;
// bool isValidAdaptiveProfileInstruction =
// isValidBaseProfileInstruction ||
// llvm::isa<llvm::SwitchInst>(inst) ||
// llvm::isa<llvm::SelectInst>(inst);
if (isBaseProfile && !isValidBaseProfileInstruction) {
llvm::errs() << "error - invalid instruction found: " << inst << '\n';
return failure();
} else if (isAdaptiveProfile && !isValidAdaptiveProfileInstruction) {
llvm::errs() << "error - invalid instruction found: " << inst << '\n';
return failure();
}
@@ -366,6 +385,9 @@ qirProfileTranslationFunction(const char *qirProfile, Operation *op,
const uint32_t qir_major_version = 1;
const uint32_t qir_minor_version = 0;

const bool isAdaptiveProfile = std::string{qirProfile} == "qir-adaptive";
const bool isBaseProfile = !isAdaptiveProfile;

auto context = op->getContext();
PassManager pm(context);
if (printIntermediateMLIR)
@@ -398,26 +420,56 @@ qirProfileTranslationFunction(const char *qirProfile, Operation *op,
"dynamic_qubit_management", falseValue);
llvmModule->addModuleFlag(llvm::Module::ModFlagBehavior::Error,
"dynamic_result_management", falseValue);
if (isAdaptiveProfile) {
auto trueValue =
llvm::ConstantInt::getTrue(llvm::Type::getInt1Ty(*llvmContext));
llvmModule->addModuleFlag(llvm::Module::ModFlagBehavior::Error,
"qubit_resetting", trueValue);
llvmModule->addModuleFlag(llvm::Module::ModFlagBehavior::Error,
"classical_ints", falseValue);
llvmModule->addModuleFlag(llvm::Module::ModFlagBehavior::Error,
"classical_floats", falseValue);
llvmModule->addModuleFlag(llvm::Module::ModFlagBehavior::Error,
"classical_fixed_points", falseValue);
llvmModule->addModuleFlag(llvm::Module::ModFlagBehavior::Error,
"user_functions", falseValue);
llvmModule->addModuleFlag(llvm::Module::ModFlagBehavior::Error,
"dynamic_float_args", falseValue);
llvmModule->addModuleFlag(llvm::Module::ModFlagBehavior::Error,
"extern_functions", falseValue);
llvmModule->addModuleFlag(llvm::Module::ModFlagBehavior::Error,
"backwards_branching", falseValue);
}

// Note: optimizeLLVM is the one that is setting nonnull attributes on
// the @__quantum__rt__result_record_output calls.
cudaq::optimizeLLVM(llvmModule.get());
if (!cudaq::setupTargetTriple(llvmModule.get()))
throw std::runtime_error("Failed to setup the llvm module target triple.");

// PyQIR currently requires named blocks. It's not clear if blocks can share
// names across functions, so we are being conservative by giving every block
// in the module a unique name for now.
int blockCounter = 0;
for (llvm::Function &func : *llvmModule)
for (llvm::BasicBlock &block : func)
if (!block.hasName())
block.setName(std::to_string(blockCounter++));

if (printIR)
llvm::errs() << *llvmModule;

if (failed(verifyOutputRecordingFunctions(llvmModule.get())))
return failure();

if (failed(verifyMeasurementOrdering(llvmModule.get())))
if (isBaseProfile &&
failed(verifyBaseProfileMeasurementOrdering(llvmModule.get())))
return failure();

if (failed(verifyQubitAndResultRanges(llvmModule.get())))
return failure();

if (failed(verifyLLVMInstructions(llvmModule.get())))
if (failed(verifyLLVMInstructions(llvmModule.get(), isBaseProfile)))
return failure();

// Map the LLVM Module to Bitcode that can be submitted
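The module flags emitted above follow the QIR adaptive-profile flag names ("qubit_resetting", "classical_ints", and so on), each stored as an i1 constant. As a sketch of how a downstream consumer could read one of these flags back, under the assumption that it was added exactly as done here; the helper itself is illustrative and not part of this PR:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"

// Illustrative helper: returns true if the named module flag is an i1/int
// constant equal to 1, false if it is absent or 0.
static bool moduleFlagIsTrue(const llvm::Module &module, llvm::StringRef name) {
  if (auto *flag = module.getModuleFlag(name))
    if (auto *value = llvm::mdconst::dyn_extract<llvm::ConstantInt>(flag))
      return value->isOne();
  return false;
}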
48 changes: 43 additions & 5 deletions runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
@@ -119,16 +119,20 @@ class RemoteRESTQPU : public cudaq::QPU {
/// of JIT engines for invoking the kernels.
std::vector<ExecutionEngine *> jitEngines;

/// @brief Invoke the kernel in the JIT engine and then delete the JIT engine.
void invokeJITKernelAndRelease(ExecutionEngine *jit,
const std::string &kernelName) {
/// @brief Invoke the kernel in the JIT engine
void invokeJITKernel(ExecutionEngine *jit, const std::string &kernelName) {
auto funcPtr = jit->lookup(std::string("__nvqpp__mlirgen__") + kernelName);
if (!funcPtr) {
throw std::runtime_error(
"cudaq::builder failed to get kernelReg function.");
}
reinterpret_cast<void (*)()>(*funcPtr)();
// We're done, delete the pointer.
}

/// @brief Invoke the kernel in the JIT engine and then delete the JIT engine.
void invokeJITKernelAndRelease(ExecutionEngine *jit,
const std::string &kernelName) {
invokeJITKernel(jit, kernelName);
delete jit;
}

@@ -165,7 +169,9 @@ class RemoteRESTQPU : public cudaq::QPU {
bool isSimulator() override { return emulate; }

/// @brief Return true if the current backend supports conditional feedback
bool supportsConditionalFeedback() override { return false; }
bool supportsConditionalFeedback() override {
return codegenTranslation == "qir-adaptive";
}

/// Provide the number of shots
void setShots(int _nShots) override {
@@ -491,6 +497,38 @@ class RemoteRESTQPU : public cudaq::QPU {
if (seed > 0)
cudaq::set_random_seed(seed);

bool hasConditionals =
cudaq::kernelHasConditionalFeedback(kernelName);
if (hasConditionals && codes.size() > 1)
throw std::runtime_error("error: spin_ops not yet supported with "
"kernels containing conditionals");
if (hasConditionals) {
executor->setShots(1); // run one shot at a time

// For the adaptive profile, a kernel containing conditionals must be
// executed localShots times (one shot per execution) rather than
// executed once and sampled localShots times.
if (hasConditionals) {
// Populate `counts` one shot at a time
cudaq::sample_result counts;
for (std::size_t shot = 0; shot < localShots; shot++) {
cudaq::ExecutionContext context("sample", 1);
context.hasConditionalsOnMeasureResults = true;
cudaq::getExecutionManager()->setExecutionContext(&context);
invokeJITKernel(localJIT[0], kernelName);
cudaq::getExecutionManager()->resetExecutionContext();
counts += context.result;
}
// Process `counts` and store into `results`
for (auto &regName : counts.register_names()) {
results.emplace_back(counts.to_map(regName), regName);
results.back().sequentialData =
counts.sequential_data(regName);
}
}
}

for (std::size_t i = 0; i < codes.size(); i++) {
cudaq::ExecutionContext context("sample", localShots);
cudaq::getExecutionManager()->setExecutionContext(&context);
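From the user's perspective, the change above means that sampling a kernel with measurement-conditioned feedback on a backend lowered through the qir-adaptive path executes the kernel once per shot and accumulates per-register counts. A hedged end-to-end illustration; the kernel, its name, and the shot count are examples, not the PR's tests:

#include <cudaq.h>

struct feedback_kernel {
  void operator()() __qpu__ {
    cudaq::qubit a, b;
    h(a);
    if (mz(a)) // conditional feedback triggers shot-by-shot execution
      x(b);
    mz(b);
  }
};

int main() {
  auto counts = cudaq::sample(1000, feedback_kernel{});
  counts.dump();
  return 0;
}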