From 35b79eb58eb08ca9d0e12aaba676ca157b00bfec Mon Sep 17 00:00:00 2001 From: NEUpanning Date: Tue, 24 Sep 2024 10:08:06 -0700 Subject: [PATCH] Introduce simple date time formatter (#10966) Summary: Introduce new DateTimeFormatterType called 'LENIENT_SIMPLE' and 'STRICT_SIMPLE' that are used when Spark legacy time parser policy is enabled for java.text.SimpleDateFormat in lenient and non-lenient mode. The implementation of 'LENIENT_SIMPLE' and 'STRICT_SIMPLE' is just copy from Joda in this PR and further PR will change the behavior to align with Spark. Spark functions using strict mode(lenient=false): 'from_unixtime', 'unix_timestamp', 'make_date', 'to_unix_timestamp', 'date_format'. Spark functions using lenient mode: cast timestamp to string. 'casting timestamp to string' will use LENIENT_SIMPLE only after the behavior of LENIENT_SIMPLE is aligned with Spark since it does not use Joda DateFormatter to do cast. Relates https://github.com/facebookincubator/velox/issues/10354 Pull Request resolved: https://github.com/facebookincubator/velox/pull/10966 Reviewed By: xiaoxmeng Differential Revision: D63261575 Pulled By: Yuhta fbshipit-source-id: 20ebdc1ad38a43d7064e5c232c9d52d361b7f474 --- velox/core/QueryConfig.h | 9 ++ velox/docs/configs.rst | 7 ++ velox/docs/functions/spark/datetime.rst | 9 +- velox/functions/lib/DateTimeFormatter.cpp | 125 +++++++++++++++++++ velox/functions/lib/DateTimeFormatter.h | 18 ++- velox/functions/sparksql/DateTimeFunctions.h | 55 ++++++-- velox/functions/sparksql/Split.h | 2 +- 7 files changed, 212 insertions(+), 13 deletions(-) diff --git a/velox/core/QueryConfig.h b/velox/core/QueryConfig.h index ee71b1e86bc7..6bd934d0c4f5 100644 --- a/velox/core/QueryConfig.h +++ b/velox/core/QueryConfig.h @@ -287,6 +287,11 @@ class QueryConfig { /// The current spark partition id. static constexpr const char* kSparkPartitionId = "spark.partition_id"; + /// If true, simple date formatter is used for time formatting and parsing. 
+ /// Joda date formatter is used by default. + static constexpr const char* kSparkLegacyDateFormatter = + "spark.legacy_date_formatter"; + /// The number of local parallel table writer operators per task. static constexpr const char* kTaskWriterCount = "task_writer_count"; @@ -741,6 +746,10 @@ class QueryConfig { return value; } + bool sparkLegacyDateFormatter() const { + return get(kSparkLegacyDateFormatter, false); + } + bool exprTrackCpuUsage() const { return get(kExprTrackCpuUsage, false); } diff --git a/velox/docs/configs.rst b/velox/docs/configs.rst index 799a6cfe1823..87f12b445ce9 100644 --- a/velox/docs/configs.rst +++ b/velox/docs/configs.rst @@ -713,6 +713,13 @@ Spark-specific Configuration - integer - - The current task's Spark partition ID. It's set by the query engine (Spark) prior to task execution. + * - spark.legacy_date_formatter + - bool + - false + - If true, the `Simple <https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html>`_ date formatter is used for time formatting and parsing. The Joda date formatter is used by default. + - The Joda date formatter performs strict checking of its input and uses a different pattern string. + - For example, the 2015-07-22 10:00:00 timestamp cannot be parsed if the pattern is yyyy-MM-dd because the parser does not consume the whole input. + - Another example is that the 'W' pattern, which means week in month, is not supported. For more differences, see :issue:`10354`. Tracing -------- diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst index 85ca72628933..4b3305891dae 100644 --- a/velox/docs/functions/spark/datetime.rst +++ b/velox/docs/functions/spark/datetime.rst @@ -82,7 +82,9 @@ These functions support TIMESTAMP and DATE input types. Adjusts ``unixTime`` (elapsed seconds since UNIX epoch) to configured session timezone, then converts it to a formatted time string according to ``format``. Only supports BIGINT type for - ``unixTime``. + ``unixTime``.
Uses the `Simple <https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html>`_ + date formatter in strict (non-lenient) mode, which is aligned with Spark legacy date parser behavior, or the + `Joda <https://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html>`_ date formatter, depending on the ``spark.legacy_date_formatter`` configuration. `Valid patterns for date format `_. Throws exception for invalid ``format``. This function will convert input to milliseconds, and integer overflow is @@ -285,7 +287,10 @@ These functions support TIMESTAMP and DATE input types. .. spark:function:: unix_timestamp() -> integer - Returns the current UNIX timestamp in seconds. + Returns the current UNIX timestamp in seconds. Uses the + `Simple <https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html>`_ date formatter in strict (non-lenient) mode, + which is aligned with Spark legacy date parser behavior, or the `Joda <https://www.joda.org/joda-time/apidocs/org/joda/time/format/DateTimeFormat.html>`_ date formatter, + depending on the ``spark.legacy_date_formatter`` configuration. .. spark:function:: unix_timestamp(string) -> integer :noindex: diff --git a/velox/functions/lib/DateTimeFormatter.cpp b/velox/functions/lib/DateTimeFormatter.cpp index 515ba2e44443..5300dc47d586 100644 --- a/velox/functions/lib/DateTimeFormatter.cpp +++ b/velox/functions/lib/DateTimeFormatter.cpp @@ -1697,4 +1697,129 @@ std::shared_ptr buildJodaDateTimeFormatter( return builder.setType(DateTimeFormatterType::JODA).build(); } +std::shared_ptr buildSimpleDateTimeFormatter( + const std::string_view& format, + bool lenient) { + VELOX_USER_CHECK(!format.empty(), "Format pattern should not be empty."); + + DateTimeFormatterBuilder builder(format.size()); + const char* cur = format.data(); + const char* end = cur + format.size(); + + while (cur < end) { + const char* startTokenPtr = cur; + + // For literal case, literal should be quoted using single quotes ('). If + // there is no quotes, it is interpreted as pattern letters. If there is + // only single quote, a user error will be thrown. + if (*startTokenPtr == '\'') { + // Append single literal quote for 2 consecutive single quote.
+ if (cur + 1 < end && *(cur + 1) == '\'') { + builder.appendLiteral("'"); + cur += 2; + } else { + // Append literal characters from the start until the next closing + // literal sequence single quote. + int64_t count = numLiteralChars(startTokenPtr + 1, end); + VELOX_USER_CHECK_NE(count, -1, "No closing single quote for literal"); + for (int64_t i = 1; i <= count; i++) { + builder.appendLiteral(startTokenPtr + i, 1); + if (*(startTokenPtr + i) == '\'') { + i += 1; + } + } + cur += count + 2; + } + } else { + // Append format specifier according to pattern letters. If pattern letter + // is not supported, a user error will be thrown. + int count = 1; + ++cur; + while (cur < end && *startTokenPtr == *cur) { + ++count; + ++cur; + } + switch (*startTokenPtr) { + case 'a': + builder.appendHalfDayOfDay(); + break; + case 'C': + builder.appendCenturyOfEra(count); + break; + case 'd': + builder.appendDayOfMonth(count); + break; + case 'D': + builder.appendDayOfYear(count); + break; + case 'e': + builder.appendDayOfWeek1Based(count); + break; + case 'E': + builder.appendDayOfWeekText(count); + break; + case 'G': + builder.appendEra(); + break; + case 'h': + builder.appendClockHourOfHalfDay(count); + break; + case 'H': + builder.appendHourOfDay(count); + break; + case 'K': + builder.appendHourOfHalfDay(count); + break; + case 'k': + builder.appendClockHourOfDay(count); + break; + case 'm': + builder.appendMinuteOfHour(count); + break; + case 'M': + if (count <= 2) { + builder.appendMonthOfYear(count); + } else { + builder.appendMonthOfYearText(count); + } + break; + case 's': + builder.appendSecondOfMinute(count); + break; + case 'S': + builder.appendFractionOfSecond(count); + break; + case 'w': + builder.appendWeekOfWeekYear(count); + break; + case 'x': + builder.appendWeekYear(count); + break; + case 'y': + builder.appendYear(count); + break; + case 'Y': + builder.appendYearOfEra(count); + break; + case 'z': + builder.appendTimeZone(count); + break; + case 'Z': + 
builder.appendTimeZoneOffsetId(count); + break; + default: + if (isalpha(static_cast<unsigned char>(*startTokenPtr))) { + VELOX_UNSUPPORTED("Specifier {} is not supported.", *startTokenPtr); + } else { + builder.appendLiteral(startTokenPtr, cur - startTokenPtr); + } + break; + } + } + } + DateTimeFormatterType type = lenient ? DateTimeFormatterType::LENIENT_SIMPLE + : DateTimeFormatterType::STRICT_SIMPLE; + return builder.setType(type).build(); +} + } // namespace facebook::velox::functions diff --git a/velox/functions/lib/DateTimeFormatter.h b/velox/functions/lib/DateTimeFormatter.h index ef3c98255404..9fbbcc1eba42 100644 --- a/velox/functions/lib/DateTimeFormatter.h +++ b/velox/functions/lib/DateTimeFormatter.h @@ -23,7 +23,19 @@ namespace facebook::velox::functions { -enum class DateTimeFormatterType { JODA, MYSQL, UNKNOWN }; +enum class DateTimeFormatterType { + JODA, + MYSQL, + // Corresponding to java.text.SimpleDateFormat in lenient mode. It is meant + // for Spark 'cast timestamp to string', which will adopt it once its + // behavior is aligned with Spark. + // TODO: this is currently no different from STRICT_SIMPLE. + LENIENT_SIMPLE, + // Corresponding to java.text.SimpleDateFormat in strict(lenient=false) mode. + // Used by Spark 'from_unixtime', 'unix_timestamp' and 'to_unix_timestamp'.
+ STRICT_SIMPLE, + UNKNOWN +}; enum class DateTimeFormatSpecifier : uint8_t { // Era, e.g: "AD" @@ -209,6 +221,10 @@ std::shared_ptr buildMysqlDateTimeFormatter( std::shared_ptr buildJodaDateTimeFormatter( const std::string_view& format); +std::shared_ptr buildSimpleDateTimeFormatter( + const std::string_view& format, + bool lenient); + } // namespace facebook::velox::functions template <> diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 07b39ff09896..89facdd01503 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -25,6 +25,22 @@ namespace facebook::velox::functions::sparksql { +namespace detail { +inline std::shared_ptr<DateTimeFormatter> getDateTimeFormatter( + const std::string_view& format, + DateTimeFormatterType type) { + switch (type) { + case DateTimeFormatterType::STRICT_SIMPLE: + return buildSimpleDateTimeFormatter(format, /*lenient=*/false); + case DateTimeFormatterType::LENIENT_SIMPLE: + return buildSimpleDateTimeFormatter(format, /*lenient=*/true); + default: + return buildJodaDateTimeFormatter( + std::string_view(format.data(), format.size())); + } +} +} // namespace detail + template struct YearFunction : public InitSessionTimezone { VELOX_DEFINE_FUNCTION_TYPES(T); @@ -156,7 +172,10 @@ struct UnixTimestampParseFunction { const std::vector& /*inputTypes*/, const core::QueryConfig& config, const arg_type* /*input*/) { - format_ = buildJodaDateTimeFormatter(kDefaultFormat_); + format_ = detail::getDateTimeFormatter( + kDefaultFormat_, + config.sparkLegacyDateFormatter() ?
DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); setTimezone(config); } @@ -205,10 +224,13 @@ struct UnixTimestampParseWithFormatFunction const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { + legacyFormatter_ = config.sparkLegacyDateFormatter(); if (format != nullptr) { try { - this->format_ = buildJodaDateTimeFormatter( - std::string_view(format->data(), format->size())); + this->format_ = detail::getDateTimeFormatter( + std::string_view(format->data(), format->size()), + legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); } catch (const VeloxUserError&) { invalidFormat_ = true; } @@ -228,8 +250,10 @@ struct UnixTimestampParseWithFormatFunction // Format error returns null. try { if (!isConstFormat_) { - this->format_ = buildJodaDateTimeFormatter( - std::string_view(format.data(), format.size())); + this->format_ = detail::getDateTimeFormatter( + std::string_view(format.data(), format.size()), + legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); } } catch (const VeloxUserError&) { return false; @@ -248,6 +272,7 @@ struct UnixTimestampParseWithFormatFunction private: bool isConstFormat_{false}; bool invalidFormat_{false}; + bool legacyFormatter_{false}; }; // Parses unix time in seconds to a formatted string. @@ -260,6 +285,7 @@ struct FromUnixtimeFunction { const core::QueryConfig& config, const arg_type* /*unixtime*/, const arg_type* format) { + legacyFormatter_ = config.sparkLegacyDateFormatter(); sessionTimeZone_ = getTimeZoneFromConfig(config); if (format != nullptr) { setFormatter(*format); @@ -284,8 +310,10 @@ struct FromUnixtimeFunction { private: FOLLY_ALWAYS_INLINE void setFormatter(const arg_type& format) { - formatter_ = buildJodaDateTimeFormatter( - std::string_view(format.data(), format.size())); + formatter_ = detail::getDateTimeFormatter( + std::string_view(format.data(), format.size()), + legacyFormatter_ ? 
DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); maxResultSize_ = formatter_->maxResultSize(sessionTimeZone_); } @@ -293,6 +321,7 @@ struct FromUnixtimeFunction { std::shared_ptr formatter_; uint32_t maxResultSize_; bool isConstantTimeFormat_{false}; + bool legacyFormatter_{false}; }; template @@ -366,12 +395,16 @@ struct GetTimestampFunction { const core::QueryConfig& config, const arg_type* /*input*/, const arg_type* format) { + legacyFormatter_ = config.sparkLegacyDateFormatter(); auto sessionTimezoneName = config.sessionTimezone(); if (!sessionTimezoneName.empty()) { sessionTimeZone_ = tz::locateZone(sessionTimezoneName); } if (format != nullptr) { - formatter_ = buildJodaDateTimeFormatter(std::string_view(*format)); + formatter_ = detail::getDateTimeFormatter( + std::string_view(*format), + legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); isConstantTimeFormat_ = true; } } @@ -381,7 +414,10 @@ struct GetTimestampFunction { const arg_type& input, const arg_type& format) { if (!isConstantTimeFormat_) { - formatter_ = buildJodaDateTimeFormatter(std::string_view(format)); + formatter_ = detail::getDateTimeFormatter( + std::string_view(format), + legacyFormatter_ ? DateTimeFormatterType::STRICT_SIMPLE + : DateTimeFormatterType::JODA); } auto dateTimeResult = formatter_->parse(std::string_view(input)); // Null as result for parsing error. @@ -404,6 +440,7 @@ struct GetTimestampFunction { std::shared_ptr formatter_{nullptr}; bool isConstantTimeFormat_{false}; const tz::TimeZone* sessionTimeZone_{tz::locateZone(0)}; // default to GMT. 
+ bool legacyFormatter_{false}; }; template diff --git a/velox/functions/sparksql/Split.h b/velox/functions/sparksql/Split.h index 86fdde503ced..2cee345f77b2 100644 --- a/velox/functions/sparksql/Split.h +++ b/velox/functions/sparksql/Split.h @@ -165,6 +165,6 @@ struct Split { result.add_item().setNoCopy(StringView(start + pos, end - pos)); } - mutable detail::ReCache cache_; + mutable facebook::velox::functions::detail::ReCache cache_; }; } // namespace facebook::velox::functions::sparksql