feat: Avoid small batches in Exchange #12010

Closed · wants to merge 1 commit
14 changes: 14 additions & 0 deletions velox/core/QueryConfig.h
@@ -113,6 +113,15 @@ class QueryConfig {
static constexpr const char* kMaxMergeExchangeBufferSize =
"merge_exchange.max_buffer_size";

/// The minimum number of bytes to accumulate in the ExchangeQueue
/// before unblocking a consumer. This is used to avoid creating tiny
/// batches, which can hurt performance when the cost of creating vectors
/// is high (for example, when there are many columns). To avoid latency
/// degradation, the exchange client unblocks a consumer once the smaller
/// of this value and 1% of the data size observed so far has accumulated.
static constexpr const char* kMinExchangeOutputBatchBytes =
"min_exchange_output_batch_bytes";

static constexpr const char* kMaxPartialAggregationMemory =
"max_partial_aggregation_memory";

@@ -594,6 +603,11 @@ class QueryConfig {
return get<uint64_t>(kMaxMergeExchangeBufferSize, kDefault);
}

uint64_t minExchangeOutputBatchBytes() const {
static constexpr uint64_t kDefault = 2UL << 20;
return get<uint64_t>(kMinExchangeOutputBatchBytes, kDefault);
}

uint64_t preferredOutputBatchBytes() const {
static constexpr uint64_t kDefault = 10UL << 20;
return get<uint64_t>(kPreferredOutputBatchBytes, kDefault);
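For reference, the threshold described above is the smaller of the configured value and 1% of the bytes received so far. A minimal standalone sketch (illustrative only, not Velox code) of that rule:

#include <algorithm>
#include <cstdint>

// Number of bytes the queue waits for before unblocking a consumer, following
// the semantics documented for min_exchange_output_batch_bytes.
int64_t unblockThresholdBytes(
    uint64_t minExchangeOutputBatchBytes, // query config, 2MB by default
    int64_t receivedBytes,                // bytes observed by the queue so far
    bool atEnd) {                         // all producers have finished
  if (atEnd) {
    return 0; // Never hold data back once the exchange is complete.
  }
  return std::min<int64_t>(minExchangeOutputBatchBytes, receivedBytes / 100);
}

// Example: with the 2MB default and only 10MB received so far, a consumer is
// unblocked after 100KB; once 200MB or more has been received, the full 2MB
// threshold applies.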
11 changes: 9 additions & 2 deletions velox/docs/configs.rst
@@ -89,6 +89,13 @@ Generic Configuration
- Size of buffer in the exchange client that holds data fetched from other nodes before it is processed.
A larger buffer can increase network throughput for larger clusters and thus decrease query processing time
at the expense of reducing the amount of memory available for other usage.
* - min_exchange_output_batch_bytes
- integer
- 2MB
- The minimum number of bytes to accumulate in the ExchangeQueue before unblocking a consumer. This is used to avoid
creating tiny batches, which can hurt performance when the cost of creating vectors is high (for example, when there
are many columns). To avoid latency degradation, the exchange client unblocks a consumer once the smaller of this
value and 1% of the data size observed so far has accumulated.
* - merge_exchange.max_buffer_size
- integer
- 128MB
@@ -670,13 +677,13 @@ Each query can override the config by setting corresponding query session proper
- Default AWS secret key to use.
* - hive.s3.endpoint
- string
-
-
- The S3 storage endpoint server. This can be used to connect to an S3-compatible storage system instead of AWS.
* - hive.s3.endpoint.region
- string
- us-east-1
- The S3 storage endpoint server region. Default is set by the AWS SDK. If not configured, region will be attempted
to be parsed from the hive.s3.endpoint value.
to be parsed from the hive.s3.endpoint value.
* - hive.s3.path-style-access
- bool
- false
5 changes: 3 additions & 2 deletions velox/exec/Exchange.cpp
@@ -52,6 +52,7 @@ Exchange::Exchange(
operatorCtx_->driverCtx()->queryConfig(),
serdeKind_)},
processSplits_{operatorCtx_->driverCtx()->driverId == 0},
driverId_{driverCtx->driverId},
Contributor: nit: we can just fetch from operatorCtx_->driverCtx()->driverId and don't need to save a copy of it.

Contributor (Author): Just wanted to save a couple of extra dereferences :-)

exchangeClient_{std::move(exchangeClient)} {}

void Exchange::addTaskIds(std::vector<std::string>& taskIds) {
@@ -111,8 +112,8 @@ BlockingReason Exchange::isBlocked(ContinueFuture* future) {
}

ContinueFuture dataFuture;
currentPages_ =
exchangeClient_->next(preferredOutputBatchBytes_, &atEnd_, &dataFuture);
currentPages_ = exchangeClient_->next(
driverId_, preferredOutputBatchBytes_, &atEnd_, &dataFuture);
if (!currentPages_.empty() || atEnd_) {
if (atEnd_ && noMoreSplits_) {
const auto numSplits = stats_.rlock()->numSplits;
2 changes: 2 additions & 0 deletions velox/exec/Exchange.h
@@ -89,6 +89,8 @@ class Exchange : public SourceOperator {
/// and passing these to ExchangeClient.
const bool processSplits_;

const int driverId_;

bool noMoreSplits_ = false;

std::shared_ptr<ExchangeClient> exchangeClient_;
14 changes: 11 additions & 3 deletions velox/exec/ExchangeClient.cpp
@@ -118,10 +118,14 @@ folly::F14FastMap<std::string, RuntimeMetric> ExchangeClient::stats() const {
return stats;
}

std::vector<std::unique_ptr<SerializedPage>>
ExchangeClient::next(uint32_t maxBytes, bool* atEnd, ContinueFuture* future) {
std::vector<std::unique_ptr<SerializedPage>> ExchangeClient::next(
int consumerId,
uint32_t maxBytes,
bool* atEnd,
ContinueFuture* future) {
std::vector<RequestSpec> requestSpecs;
std::vector<std::unique_ptr<SerializedPage>> pages;
ContinuePromise stalePromise = ContinuePromise::makeEmpty();
{
std::lock_guard<std::mutex> l(queue_->mutex());
if (closed_) {
@@ -130,7 +134,8 @@ ExchangeClient::next(uint32_t maxBytes, bool* atEnd, ContinueFuture* future) {
}

*atEnd = false;
pages = queue_->dequeueLocked(maxBytes, atEnd, future);
pages = queue_->dequeueLocked(
consumerId, maxBytes, atEnd, future, &stalePromise);
if (*atEnd) {
return pages;
}
@@ -143,6 +148,9 @@ }
}

// Outside of lock
if (stalePromise.valid()) {
stalePromise.setValue();
}
request(std::move(requestSpecs));
return pages;
}
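The stalePromise added above is fulfilled only after the queue mutex is released, so any continuation attached to the consumer's future never runs under the lock. A generic sketch of that pattern using only the standard library (not the Velox ContinuePromise type):

#include <future>
#include <mutex>
#include <optional>

std::mutex mu;
std::optional<std::promise<void>> waiting; // a consumer's stale promise

void wakeConsumer() {
  std::optional<std::promise<void>> stale;
  {
    std::lock_guard<std::mutex> l(mu);
    stale.swap(waiting); // Take ownership of the promise while holding the lock.
  }
  if (stale) {
    stale->set_value(); // Fulfill outside the lock so callbacks run lock-free.
  }
}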
8 changes: 6 additions & 2 deletions velox/exec/ExchangeClient.h
@@ -33,14 +33,18 @@ class ExchangeClient : public std::enable_shared_from_this<ExchangeClient> {
std::string taskId,
int destination,
int64_t maxQueuedBytes,
int32_t numberOfConsumers,
uint64_t minOutputBatchBytes,
memory::MemoryPool* pool,
folly::Executor* executor)
: taskId_{std::move(taskId)},
destination_(destination),
maxQueuedBytes_{maxQueuedBytes},
pool_(pool),
executor_(executor),
queue_(std::make_shared<ExchangeQueue>()) {
queue_(std::make_shared<ExchangeQueue>(
numberOfConsumers,
minOutputBatchBytes)) {
VELOX_CHECK_NOT_NULL(pool_);
VELOX_CHECK_NOT_NULL(executor_);
// NOTE: the executor is used to run async response callback from the
@@ -91,7 +95,7 @@ class ExchangeClient : public std::enable_shared_from_this<ExchangeClient> {
/// The data may be compressed, in which case 'maxBytes' applies to compressed
/// size.
std::vector<std::unique_ptr<SerializedPage>>
next(uint32_t maxBytes, bool* atEnd, ContinueFuture* future);
next(int consumerId, uint32_t maxBytes, bool* atEnd, ContinueFuture* future);

std::string toString() const;

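A hedged usage sketch of the extended constructor, assuming an existing memory::MemoryPool* pool and folly::Executor* executor: one queue shared by four consuming drivers that unblocks a consumer only after roughly 2MB (or 1% of the data observed so far) has accumulated.

auto client = std::make_shared<facebook::velox::exec::ExchangeClient>(
    "example-task-id",               // taskId (placeholder)
    /*destination=*/0,
    /*maxQueuedBytes=*/32 << 20,     // 32MB exchange buffer
    /*numberOfConsumers=*/4,         // drivers sharing the ExchangeQueue
    /*minOutputBatchBytes=*/2 << 20, // 2MB minimum output batch
    pool,                            // assumed memory::MemoryPool*
    executor);                       // assumed folly::Executor*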
56 changes: 50 additions & 6 deletions velox/exec/ExchangeQueue.cpp
@@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "velox/exec/ExchangeQueue.h"
#include <algorithm>

namespace facebook::velox::exec {

@@ -64,6 +65,15 @@ void ExchangeQueue::close() {
clearPromises(promises);
}

int64_t ExchangeQueue::minOutputBatchBytesLocked() const {
// Always allow unblocking when at end.
if (atEnd_) {
return 0;
}
// Use at most 1% of the bytes received so far to minimize latency for small exchanges.
return std::min<int64_t>(minOutputBatchBytes_, receivedBytes_ / 100);
}

void ExchangeQueue::enqueueLocked(
std::unique_ptr<SerializedPage>&& page,
std::vector<ContinuePromise>& promises) {
@@ -86,17 +96,45 @@ void ExchangeQueue::enqueueLocked(
receivedBytes_ += page->size();

queue_.push_back(std::move(page));
if (!promises_.empty()) {
const auto minBatchSize = minOutputBatchBytesLocked();
while (!promises_.empty()) {
VELOX_CHECK_LE(promises_.size(), numberOfConsumers_);
const int32_t unblockedConsumers = numberOfConsumers_ - promises_.size();
const int64_t unassignedBytes =
totalBytes_ - unblockedConsumers * minBatchSize;
if (unassignedBytes < minBatchSize) {
break;
}
// Resume one of the waiting drivers.
promises.push_back(std::move(promises_.back()));
promises_.pop_back();
auto it = promises_.begin();
promises.push_back(std::move(it->second));
promises_.erase(it);
}
}

void ExchangeQueue::addPromiseLocked(
int consumerId,
ContinueFuture* future,
ContinuePromise* stalePromise) {
ContinuePromise promise{"ExchangeQueue::dequeue"};
*future = promise.getSemiFuture();
auto it = promises_.find(consumerId);
if (it != promises_.end()) {
// Resolve stale promises outside the lock to avoid broken promises.
*stalePromise = std::move(it->second);
it->second = std::move(promise);
} else {
promises_[consumerId] = std::move(promise);
}
VELOX_CHECK_LE(promises_.size(), numberOfConsumers_);
}

std::vector<std::unique_ptr<SerializedPage>> ExchangeQueue::dequeueLocked(
int consumerId,
uint32_t maxBytes,
bool* atEnd,
ContinueFuture* future) {
ContinueFuture* future,
ContinuePromise* stalePromise) {
VELOX_CHECK_NOT_NULL(future);
if (!error_.empty()) {
*atEnd = true;
@@ -105,15 +143,21 @@

*atEnd = false;

// If we don't have enough bytes to return, wait for more data to become
// available.
if (totalBytes_ < minOutputBatchBytesLocked()) {
addPromiseLocked(consumerId, future, stalePromise);
return {};
}

std::vector<std::unique_ptr<SerializedPage>> pages;
uint32_t pageBytes = 0;
for (;;) {
if (queue_.empty()) {
if (atEnd_) {
*atEnd = true;
} else if (pages.empty()) {
promises_.emplace_back("ExchangeQueue::dequeue");
*future = promises_.back().getSemiFuture();
addPromiseLocked(consumerId, future, stalePromise);
}
return pages;
}
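To make the unblocking arithmetic in enqueueLocked() concrete: a waiting consumer is resumed only while the bytes not yet earmarked for already-unblocked consumers still cover another minimum batch. A standalone sketch (illustrative only, not Velox code):

#include <cstdint>

int consumersToUnblock(
    int64_t totalBytes,     // bytes currently buffered in the queue
    int64_t minBatchBytes,  // minOutputBatchBytesLocked()
    int32_t numWaiting,     // promises_.size()
    int32_t numConsumers) { // numberOfConsumers_
  int unblocked = 0;
  while (unblocked < numWaiting) {
    const int32_t alreadyUnblocked = (numConsumers - numWaiting) + unblocked;
    const int64_t unassignedBytes = totalBytes - alreadyUnblocked * minBatchBytes;
    if (unassignedBytes < minBatchBytes) {
      break;
    }
    ++unblocked;
  }
  return unblocked;
}

// Example: 4 consumers all waiting, 5MB buffered, 2MB minimum batch: two
// consumers are resumed and the other two keep waiting for more data.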
49 changes: 46 additions & 3 deletions velox/exec/ExchangeQueue.h
@@ -81,6 +81,18 @@ class SerializedPage {
/// for input.
class ExchangeQueue {
public:
#ifdef VELOX_ENABLE_BACKWARD_COMPATIBILITY
explicit ExchangeQueue() : ExchangeQueue(1, 0) {}
#endif

explicit ExchangeQueue(
int32_t numberOfConsumers,
uint64_t minOutputBatchBytes)
: numberOfConsumers_{numberOfConsumers},
minOutputBatchBytes_{minOutputBatchBytes} {
VELOX_CHECK_GE(numberOfConsumers, 1);
}

~ExchangeQueue() {
clearAllPromises();
}
@@ -119,8 +131,20 @@
///
/// The data may be compressed, in which case 'maxBytes' applies to compressed
/// size.
std::vector<std::unique_ptr<SerializedPage>> dequeueLocked(
int consumerId,
uint32_t maxBytes,
bool* atEnd,
ContinueFuture* future,
ContinuePromise* stalePromise);

#ifdef VELOX_ENABLE_BACKWARD_COMPATIBILITY
std::vector<std::unique_ptr<SerializedPage>>
dequeueLocked(uint32_t maxBytes, bool* atEnd, ContinueFuture* future);
dequeueLocked(uint32_t maxBytes, bool* atEnd, ContinueFuture* future) {
ContinuePromise stalePromise = ContinuePromise::makeEmpty();
return dequeueLocked(0, maxBytes, atEnd, future, &stalePromise);
}
#endif

/// Returns the total bytes held by SerializedPages in 'this'.
int64_t totalBytes() const {
@@ -166,6 +190,11 @@ class ExchangeQueue {
return {};
}

void addPromiseLocked(
int consumerId,
ContinueFuture* future,
ContinuePromise* stalePromise);

void clearAllPromises() {
std::vector<ContinuePromise> promises;
{
@@ -176,7 +205,14 @@
}

std::vector<ContinuePromise> clearAllPromisesLocked() {
return std::move(promises_);
std::vector<ContinuePromise> promises;
promises.reserve(promises_.size());
auto it = promises_.begin();
while (it != promises_.end()) {
promises.push_back(std::move(it->second));
it = promises_.erase(it);
}
VELOX_CHECK(promises_.empty());
return promises;
}

static void clearPromises(std::vector<ContinuePromise>& promises) {
@@ -185,14 +221,21 @@
}
}

int64_t minOutputBatchBytesLocked() const;

const int32_t numberOfConsumers_;
const uint64_t minOutputBatchBytes_;

int numCompleted_{0};
int numSources_{0};
bool noMoreSources_{false};
bool atEnd_{false};

std::mutex mutex_;
std::deque<std::unique_ptr<SerializedPage>> queue_;
std::vector<ContinuePromise> promises_;
// Map from consumer id to that consumer's waiting promise.
folly::F14FastMap<int, ContinuePromise> promises_;

// When set, all promises will be realized and the next dequeue will
// throw an exception with this message.
std::string error_;
5 changes: 4 additions & 1 deletion velox/exec/MergeSource.cpp
@@ -128,6 +128,9 @@ class MergeExchangeSource : public MergeSource {
mergeExchange->taskId(),
destination,
maxQueuedBytes,
1,
// Deliver right away to avoid blocking other sources
0,
pool,
executor)) {
client_->addRemoteTaskId(taskId);
@@ -146,7 +149,7 @@
}

if (!currentPage_) {
auto pages = client_->next(1, &atEnd_, future);
auto pages = client_->next(0, 1, &atEnd_, future);
VELOX_CHECK_LE(pages.size(), 1);
currentPage_ = pages.empty() ? nullptr : std::move(pages.front());

8 changes: 6 additions & 2 deletions velox/exec/Task.cpp
@@ -970,7 +970,8 @@ void Task::initializePartitionOutput() {
// exchange client for each merge source to fetch data as we can't mix
// the data from different sources for merging.
if (auto exchangeNodeId = factory->needsExchangeClient()) {
createExchangeClientLocked(pipeline, exchangeNodeId.value());
createExchangeClientLocked(
pipeline, exchangeNodeId.value(), factory->numDrivers);
}
}
}
@@ -2982,7 +2983,8 @@ bool Task::pauseRequested(ContinueFuture* future) {

void Task::createExchangeClientLocked(
int32_t pipelineId,
const core::PlanNodeId& planNodeId) {
const core::PlanNodeId& planNodeId,
int32_t numberOfConsumers) {
VELOX_CHECK_NULL(
getExchangeClientLocked(pipelineId),
"Exchange client has been created at pipeline: {} for planNode: {}",
@@ -2998,6 +3000,8 @@ void Task::createExchangeClientLocked(
taskId_,
destination_,
queryCtx()->queryConfig().maxExchangeBufferSize(),
numberOfConsumers,
queryCtx()->queryConfig().minExchangeOutputBatchBytes(),
addExchangeClientPool(planNodeId, pipelineId),
queryCtx()->executor());
exchangeClientByPlanNode_.emplace(planNodeId, exchangeClients_[pipelineId]);
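Since the new knob is a regular query config (read via queryConfig().minExchangeOutputBatchBytes() above), it can be tuned per query through whatever string-keyed config map the embedding application already passes to its query context. The key is the one defined in QueryConfig.h; the 4MB value below is only an illustrative override.

#include <string>
#include <unordered_map>

std::unordered_map<std::string, std::string> configOverrides{
    {"min_exchange_output_batch_bytes", "4194304"}, // 4MB instead of the 2MB default
};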
3 changes: 2 additions & 1 deletion velox/exec/Task.h
@@ -1003,7 +1003,8 @@ class Task : public std::enable_shared_from_this<Task> {
// pipeline.
void createExchangeClientLocked(
int32_t pipelineId,
const core::PlanNodeId& planNodeId);
const core::PlanNodeId& planNodeId,
int32_t numberOfConsumers);

// Get a shared reference to the exchange client with the specified exchange
// plan node 'planNodeId'. The function returns null if there is no client