From 4a03718e09cfbe5fa5dad87f316ebc69b48a853f Mon Sep 17 00:00:00 2001 From: Yenda Li Date: Fri, 31 Jan 2025 15:05:07 -0800 Subject: [PATCH] refactor: use tryFindOrCompile instead of findOrCompile for regex [1/n] Summary: When a regex throws during evaluation, we can simply capture the error in a status instead of throwing an exception, which is extremely expensive. This change just uses the pre-existing tryFindOrCompile API, which returns an expected-style result, for error handling. This allows queries that previously timed out after 2 hours on an operator to finish in 47 minutes. Differential Revision: D68983392 --- velox/functions/lib/Re2Functions.cpp | 37 ++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/velox/functions/lib/Re2Functions.cpp b/velox/functions/lib/Re2Functions.cpp index ace6d0a6fa3f..6b2481dc8c62 100644 --- a/velox/functions/lib/Re2Functions.cpp +++ b/velox/functions/lib/Re2Functions.cpp @@ -264,7 +264,12 @@ class Re2Match final : public exec::VectorFunction { exec::LocalDecodedVector toSearch(context, *args[0], rows); exec::LocalDecodedVector pattern(context, *args[1], rows); context.applyToSelectedNoThrow(rows, [&](vector_size_t row) { - auto& re = *cache_.findOrCompile(pattern->valueAt(row)); + auto tryRe = cache_.tryFindOrCompile(pattern->valueAt(row)); + if (tryRe.hasError()) { + context.setStatus(row, tryRe.error()); + return; + } + const auto& re = *tryRe.value(); result.set(row, Fn(toSearch->valueAt(row), re)); }); } @@ -394,7 +399,13 @@ class Re2SearchAndExtract final : public exec::VectorFunction { if (args.size() == 2) { groups.resize(1); context.applyToSelectedNoThrow(rows, [&](vector_size_t i) { - auto& re = *cache_.findOrCompile(pattern->valueAt(i)); + auto tryRe = cache_.tryFindOrCompile(pattern->valueAt(i)); + if (tryRe.hasError()) { + context.setStatus(i, tryRe.error()); + return; + } + const auto& re = *tryRe.value(); + mustRefSourceStrings |= re2Extract(result, i, re, toSearch, groups, 0, emptyNoMatch_); }); @@ -402,7 +413,13 @@ class 
Re2SearchAndExtract final : public exec::VectorFunction { exec::LocalDecodedVector groupIds(context, *args[2], rows); context.applyToSelectedNoThrow(rows, [&](vector_size_t i) { const auto groupId = groupIds->valueAt(i); - auto& re = *cache_.findOrCompile(pattern->valueAt(i)); + auto tryRe = cache_.tryFindOrCompile(pattern->valueAt(i)); + if (tryRe.hasError()) { + context.setStatus(i, tryRe.error()); + return; + } + + const auto& re = *tryRe.value(); checkForBadGroupId(groupId, re); groups.resize(groupId + 1); mustRefSourceStrings |= @@ -1195,7 +1212,12 @@ class Re2ExtractAll final : public exec::VectorFunction { // groups.resize(1); context.applyToSelectedNoThrow(rows, [&](vector_size_t row) { - auto& re = *cache_.findOrCompile(pattern->valueAt(row)); + auto tryRe = cache_.tryFindOrCompile(pattern->valueAt(row)); + if (tryRe.hasError()) { + context.setStatus(row, std::move(tryRe.error())); + return; + } + const auto& re = *tryRe.value(); re2ExtractAll(resultWriter, re, inputStrs, row, groups, 0); }); } else { @@ -1204,7 +1226,12 @@ class Re2ExtractAll final : public exec::VectorFunction { exec::LocalDecodedVector groupIds(context, *args[2], rows); context.applyToSelectedNoThrow(rows, [&](vector_size_t row) { const T groupId = groupIds->valueAt(row); - auto& re = *cache_.findOrCompile(pattern->valueAt(row)); + auto tryRe = cache_.tryFindOrCompile(pattern->valueAt(row)); + if (tryRe.hasError()) { + context.setStatus(row, std::move(tryRe.error())); + return; + } + const auto& re = *tryRe.value(); checkForBadGroupId(groupId, re); groups.resize(groupId + 1); re2ExtractAll(resultWriter, re, inputStrs, row, groups, groupId);