From 11c56e302f9f59f1c3533c26627d9652e6c1b069 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 24 Jul 2023 13:44:16 -0700 Subject: [PATCH 01/67] add _perftimer file to helpers folder --- singer_sdk/helpers/_perftimer.py | 116 +++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 singer_sdk/helpers/_perftimer.py diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py new file mode 100644 index 000000000..8346e55fc --- /dev/null +++ b/singer_sdk/helpers/_perftimer.py @@ -0,0 +1,116 @@ +"""performace timers which deal with dynamic timing events.""" + +from __future__ import annotations + +import time + + +class PerfTimerError(Exception): + """A custom exception used to report errors in use of BatchPerfTimer class.""" + + +class PerfTimer: + """A Basic Performance Timer Class.""" + + _start_time: float = None + _stop_time: float = None + _lap_time: float = None + + @property + def start_time(self): + return self._start_time + + @property + def stop_time(self): + return self._stop_time + + @property + def lap_time(self): + return self._lap_time + + def start(self) -> None: + """Start the timer.""" + if self._start_time is not None: + msg = "Timer is running. Use .stop() to stop it" + raise PerfTimerError(msg) + + self._start_time = time.perf_counter() + + def stop(self) -> None: + """Stop the timer, Stores the elapsed time, and reset.""" + if self._start_time is None: + msg = "Timer is not running. Use .start() to start it" + raise PerfTimerError(msg) + + self._stop_time = time.perf_counter() + self._lap_time = self._stop_time - self._start_time + self._start_time = None + + +class BatchPerfTimer(PerfTimer): + """The Performance Timer for Target bulk inserts.""" + + def __init__( + self, + max_size: int | None = None, + max_perf_counter: float = 1, + ) -> None: + self._sink_max_size: int = max_size + self._max_perf_counter = max_perf_counter + + SINK_MAX_SIZE_CELING: int = 100000 + """The max size a bulk insert can be""" + + @property + def sink_max_size(self): + """The current MAX_SIZE_DEFAULT.""" + return self._sink_max_size + + @property + def max_perf_counter(self): + """How many seconds can pass before a insert.""" + return self._max_perf_counter + + @property + def perf_diff_allowed_min(self): + """The mininum negative variance allowed, 1/3 worse than wanted.""" + return -1.0 * (self.max_perf_counter * 0.33) + + @property + def perf_diff_allowed_max(self): + """The maximum postive variace allowed, # 3/4 better than wanted.""" + return self.max_perf_counter * 0.75 + + @property + def perf_diff(self) -> float: + """Difference between wanted elapsed time and actual elpased time.""" + if self._lap_time: + return self.max_perf_counter - self.lap_time + return None + + def counter_based_max_size(self) -> int: # noqa: C901 + """Caclulate performance based batch size.""" + correction = 0 + if self.perf_diff < self.perf_diff_allowed_min: + if self.sink_max_size >= 15000: # noqa: PLR2004 + correction = -5000 + elif self.sink_max_size >= 10000: # noqa: PLR2004 + correction = -1000 + elif self.sink_max_size >= 1000: # noqa: PLR2004 + correction = -100 + elif self.sink_max_size > 10: # noqa: PLR2004 + correction = 10 + if ( + self.perf_diff >= self.perf_diff_allowed_max + and self.sink_max_size < self.SINK_MAX_SIZE_CELING + ): + if self.sink_max_size >= 10000: # noqa: PLR2004 + correction = 10000 + elif self.sink_max_size >= 1000: # noqa: PLR2004 + correction = 1000 + elif self.sink_max_size >= 100: # noqa: PLR2004 + correction = 100 + elif self.sink_max_size >= 10: # noqa: PLR2004 + correction = 10 + self._sink_max_size += correction + return self.sink_max_size From 550ecff9335f0399af9bd819fa1dd29f1ed5c463 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 25 Jul 2023 08:40:24 -0700 Subject: [PATCH 02/67] add sink_timer, batch_size_rows, and batch_wait_limit_seconds --- singer_sdk/sinks/sql.py | 70 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/singer_sdk/sinks/sql.py b/singer_sdk/sinks/sql.py index 02fd307b9..274382485 100644 --- a/singer_sdk/sinks/sql.py +++ b/singer_sdk/sinks/sql.py @@ -15,6 +15,7 @@ from singer_sdk.connectors import SQLConnector from singer_sdk.exceptions import ConformedNameClashException from singer_sdk.helpers._conformers import replace_leading_digit +from singer_sdk.helpers._perftimer import BatchPerfTimer from singer_sdk.sinks.batch import BatchSink if t.TYPE_CHECKING: @@ -49,6 +50,22 @@ def __init__( """ self._connector: SQLConnector self._connector = connector or self.connector_class(dict(target.config)) + self._batch_size_rows: int = target.config.get( + "batch_size_rows", + super().MAX_SIZE_DEFAULT, + ) + self._batch_wait_limit_seconds: int | None = target.config.get( + "batch_wait_limit_seconds", + ) + self._sink_timer: BatchSink | None = None + + if self._batch_wait_limit_seconds is not None: + self._batch_size_rows = 100 + self._sink_timer: BatchPerfTimer | None = BatchPerfTimer( + self._batch_size_rows, + self._batch_wait_limit_seconds, + ) + super().__init__(target, stream_name, schema, key_properties) @property @@ -69,6 +86,51 @@ def connection(self) -> sqlalchemy.engine.Connection: """ return self.connector.connection + @property + def batch_size_rows(self) -> int: + """Get batch_size_rows object. + + Returns: + A batch_size_rows object. + """ + return self._batch_size_rows + + @batch_size_rows.setter + def batch_size_rows(self, new_value: int) -> None: + """Set batch_size_rows object to the given value. + + Args: + new_value: The value you want to set batch_size_rows too. + """ + self._batch_size_rows = new_value + + @property + def batch_wait_limit_seconds(self) -> int: + """Get batch_wait_limit_seconds object. + + Returns: + A batch_wait_limit_seconds object. + """ + return self._batch_wait_limit_seconds + + @property + def sink_timer(self) -> BatchSink: + """Get sink_timer object. + + Returns: + A sink_timer object of type BatchSink. + """ + return self._sink_timer + + @property + def max_size(self) -> int: + """Get max batch size. + + Returns: + Max number of records to batch before `is_full=True` + """ + return self.batch_size_rows + @property def table_name(self) -> str: """Return the table name, with no schema or database part. @@ -326,6 +388,14 @@ def bulk_insert_records( self.logger.info("Inserting with SQL: %s", insert_sql) with self.connector._connect() as conn, conn.begin(): conn.execute(insert_sql, conformed_records) + # Finish Line for max_size perf counter + if self.sink_timer is not None: + if self.sink_timer.start_time is not None: + self.sink_timer.stop() + self.batch_size_rows = self.sink_timer.counter_based_max_size() + + # Starting Line for max_size perf counter + self.sink_timer.start() return len(conformed_records) if isinstance(conformed_records, list) else None def merge_upsert_from_table( From 68c3484f0d3913d0ea8bf037cb348b2efed09567 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 25 Jul 2023 08:41:30 -0700 Subject: [PATCH 03/67] calcuation fix --- singer_sdk/helpers/_perftimer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index 8346e55fc..f24f24e97 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -99,7 +99,7 @@ def counter_based_max_size(self) -> int: # noqa: C901 elif self.sink_max_size >= 1000: # noqa: PLR2004 correction = -100 elif self.sink_max_size > 10: # noqa: PLR2004 - correction = 10 + correction = -10 if ( self.perf_diff >= self.perf_diff_allowed_max and self.sink_max_size < self.SINK_MAX_SIZE_CELING From 66f750ef84fe836c8f1058f5d3951cb588b88c3e Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 25 Jul 2023 08:50:26 -0700 Subject: [PATCH 04/67] correct class hint for sink_timer --- singer_sdk/sinks/sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/singer_sdk/sinks/sql.py b/singer_sdk/sinks/sql.py index 274382485..0aefb7354 100644 --- a/singer_sdk/sinks/sql.py +++ b/singer_sdk/sinks/sql.py @@ -114,11 +114,11 @@ def batch_wait_limit_seconds(self) -> int: return self._batch_wait_limit_seconds @property - def sink_timer(self) -> BatchSink: + def sink_timer(self) -> BatchPerfTimer: """Get sink_timer object. Returns: - A sink_timer object of type BatchSink. + A sink_timer object of type BatchPerfTimer. """ return self._sink_timer From 62c2c32337aa2ec86d6ba5ee100812643ea6ee5e Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 25 Jul 2023 09:51:34 -0700 Subject: [PATCH 05/67] mypy fixes round 1 --- singer_sdk/helpers/_perftimer.py | 14 +++++++------- singer_sdk/sinks/sql.py | 11 ++++++----- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index f24f24e97..390544ce4 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -12,9 +12,9 @@ class PerfTimerError(Exception): class PerfTimer: """A Basic Performance Timer Class.""" - _start_time: float = None - _stop_time: float = None - _lap_time: float = None + _start_time: float | None = None + _stop_time: float | None = None + _lap_time: float | None = None @property def start_time(self): @@ -52,11 +52,11 @@ class BatchPerfTimer(PerfTimer): def __init__( self, - max_size: int | None = None, - max_perf_counter: float = 1, + max_size: int, + max_perf_counter: float, ) -> None: self._sink_max_size: int = max_size - self._max_perf_counter = max_perf_counter + self._max_perf_counter: float = max_perf_counter SINK_MAX_SIZE_CELING: int = 100000 """The max size a bulk insert can be""" @@ -86,7 +86,7 @@ def perf_diff(self) -> float: """Difference between wanted elapsed time and actual elpased time.""" if self._lap_time: return self.max_perf_counter - self.lap_time - return None + return float(0) def counter_based_max_size(self) -> int: # noqa: C901 """Caclulate performance based batch size.""" diff --git a/singer_sdk/sinks/sql.py b/singer_sdk/sinks/sql.py index 0aefb7354..85446d4be 100644 --- a/singer_sdk/sinks/sql.py +++ b/singer_sdk/sinks/sql.py @@ -57,11 +57,12 @@ def __init__( self._batch_wait_limit_seconds: int | None = target.config.get( "batch_wait_limit_seconds", ) - self._sink_timer: BatchSink | None = None + + self._sink_timer: BatchPerfTimer | None = None if self._batch_wait_limit_seconds is not None: self._batch_size_rows = 100 - self._sink_timer: BatchPerfTimer | None = BatchPerfTimer( + self._sink_timer = BatchPerfTimer( self._batch_size_rows, self._batch_wait_limit_seconds, ) @@ -105,7 +106,7 @@ def batch_size_rows(self, new_value: int) -> None: self._batch_size_rows = new_value @property - def batch_wait_limit_seconds(self) -> int: + def batch_wait_limit_seconds(self) -> int | None: """Get batch_wait_limit_seconds object. Returns: @@ -114,11 +115,11 @@ def batch_wait_limit_seconds(self) -> int: return self._batch_wait_limit_seconds @property - def sink_timer(self) -> BatchPerfTimer: + def sink_timer(self) -> BatchPerfTimer | None: """Get sink_timer object. Returns: - A sink_timer object of type BatchPerfTimer. + A sink_timer object. """ return self._sink_timer From 541a46d8572f0e2ff10f7bcfc8e934ae7e287dd3 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 25 Jul 2023 10:00:45 -0700 Subject: [PATCH 06/67] mypy fixes round 2 --- singer_sdk/helpers/_perftimer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index 390544ce4..5711270b8 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -62,31 +62,32 @@ def __init__( """The max size a bulk insert can be""" @property - def sink_max_size(self): + def sink_max_size(self) -> int: """The current MAX_SIZE_DEFAULT.""" return self._sink_max_size @property - def max_perf_counter(self): + def max_perf_counter(self) -> float: """How many seconds can pass before a insert.""" return self._max_perf_counter @property - def perf_diff_allowed_min(self): + def perf_diff_allowed_min(self) -> float: """The mininum negative variance allowed, 1/3 worse than wanted.""" return -1.0 * (self.max_perf_counter * 0.33) @property - def perf_diff_allowed_max(self): + def perf_diff_allowed_max(self) -> float: """The maximum postive variace allowed, # 3/4 better than wanted.""" return self.max_perf_counter * 0.75 @property def perf_diff(self) -> float: """Difference between wanted elapsed time and actual elpased time.""" + diff = 0 if self._lap_time: - return self.max_perf_counter - self.lap_time - return float(0) + diff = self.max_perf_counter - self.lap_time + return float(diff) def counter_based_max_size(self) -> int: # noqa: C901 """Caclulate performance based batch size.""" From 8a591c14096bd1100ca7b435b1b64b9eb66ef25d Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 8 Aug 2023 15:04:45 -0700 Subject: [PATCH 07/67] simple tests for PerfTimer and BatchPerfTimer --- tests/core/test_perf_timer.py | 98 +++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 tests/core/test_perf_timer.py diff --git a/tests/core/test_perf_timer.py b/tests/core/test_perf_timer.py new file mode 100644 index 000000000..d57407117 --- /dev/null +++ b/tests/core/test_perf_timer.py @@ -0,0 +1,98 @@ +"""Perf Timer tests.""" + +from __future__ import annotations + +import time + +import pytest + +from singer_sdk.helpers._perftimer import BatchPerfTimer, PerfTimer, PerfTimerError + + +def test_perftimer_properties(): + timer: PerfTimer = PerfTimer() + timer._start_time = 1.1 + timer._stop_time = 1.10 + timer._lap_time = 0.09 + assert timer.start_time is timer._start_time + assert timer.stop_time is timer.stop_time + assert timer._lap_time is timer.lap_time + assert timer.start_time == 1.1 + assert timer.stop_time == 1.10 + assert timer.lap_time == 0.09 + + +def test_perftimer_actions(): + timer: PerfTimer = PerfTimer() + timer.start() + assert timer.start_time is not None + assert timer.stop_time is None + assert timer.lap_time is None + time.sleep(1.1) + timer.stop() + assert timer.lap_time >= 1 + assert timer.lap_time < 1.2 + assert timer.start_time is None + assert timer.stop_time is None + + +def test_perftimer_errors(): + timer: PerfTimer = PerfTimer() + with pytest.raises( + PerfTimerError, + match=r"Timer is not running. Use .start\(\) to start it", + ): + timer.stop() + # starting a timer to test start() error + timer.start() + with pytest.raises( + PerfTimerError, + match=r"Timer is running. Use .stop\(\) to stop it", + ): + timer.start() + # stopping the timer at the end of the test + timer.stop() + + +def test_batchperftimer_properties(): + batchtimer: BatchPerfTimer = BatchPerfTimer(100, 1) + batchtimer._lap_time = 0.10 + assert batchtimer._sink_max_size is batchtimer.sink_max_size + assert batchtimer._max_perf_counter is batchtimer.max_perf_counter + assert batchtimer.sink_max_size == 100 + assert batchtimer.max_perf_counter == 1 + assert batchtimer.perf_diff_allowed_max == 0.75 + assert batchtimer.perf_diff_allowed_min == -0.33 + assert batchtimer.perf_diff == 0.90 + + +def test_batchperftimer_counter_based_max_size_additive(): + batchtimer: BatchPerfTimer = BatchPerfTimer(10, 1) + batchtimer._lap_time = 0.24 + assert batchtimer.perf_diff > batchtimer.perf_diff_allowed_max + assert batchtimer.counter_based_max_size() == 20 + batchtimer._sink_max_size = 100 + assert batchtimer.counter_based_max_size() == 200 + batchtimer._sink_max_size = 1000 + assert batchtimer.counter_based_max_size() == 2000 + batchtimer._sink_max_size = 10000 + assert batchtimer.counter_based_max_size() == 20000 + batchtimer._sink_max_size = 100000 + assert batchtimer.sink_max_size == batchtimer.SINK_MAX_SIZE_CELING + assert batchtimer.counter_based_max_size() == 100000 + + +def test_batchperftimer_counter_based_max_size_reductive(): + batchtimer: BatchPerfTimer = BatchPerfTimer(100000, 1) + batchtimer._lap_time = 1.5 + assert batchtimer.perf_diff < batchtimer.perf_diff_allowed_min + assert batchtimer.sink_max_size == batchtimer.SINK_MAX_SIZE_CELING + assert batchtimer.counter_based_max_size() == 95000 + batchtimer._sink_max_size = 10000 + assert batchtimer.counter_based_max_size() == 9000 + batchtimer._sink_max_size = 1000 + assert batchtimer.counter_based_max_size() == 900 + batchtimer._sink_max_size = 100 + assert batchtimer.counter_based_max_size() == 90 + batchtimer._sink_max_size = 10 + assert batchtimer.counter_based_max_size() == 10 From eb6c9e74dc339348ad63cd06530fcc81083cc27b Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 8 Aug 2023 15:05:56 -0700 Subject: [PATCH 08/67] clear _stop_time after lap is calculated --- singer_sdk/helpers/_perftimer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index 5711270b8..08b63f4ea 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -45,6 +45,7 @@ def stop(self) -> None: self._stop_time = time.perf_counter() self._lap_time = self._stop_time - self._start_time self._start_time = None + self._stop_time = None class BatchPerfTimer(PerfTimer): From f6bbf0c5ddba689ee1ab6df1f110529f35adf12f Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 8 Aug 2023 15:21:34 -0700 Subject: [PATCH 09/67] wider variation when testing perftimer lap_time --- tests/core/test_perf_timer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/test_perf_timer.py b/tests/core/test_perf_timer.py index d57407117..af73a6a32 100644 --- a/tests/core/test_perf_timer.py +++ b/tests/core/test_perf_timer.py @@ -31,7 +31,7 @@ def test_perftimer_actions(): time.sleep(1.1) timer.stop() assert timer.lap_time >= 1 - assert timer.lap_time < 1.2 + assert timer.lap_time < 1.5 assert timer.start_time is None assert timer.stop_time is None From 74b16533cbab619bcc516b124f522e0c180c8073 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Wed, 9 Aug 2023 22:40:30 -0600 Subject: [PATCH 10/67] Apply suggestions from code review Co-authored-by: Edgar R. M. --- singer_sdk/helpers/_perftimer.py | 18 ++++++++---------- tests/core/test_perf_timer.py | 4 ++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index 08b63f4ea..0cf6d1358 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -1,4 +1,4 @@ -"""performace timers which deal with dynamic timing events.""" +"""Performance timers which deal with dynamic timing events.""" from __future__ import annotations @@ -59,7 +59,7 @@ def __init__( self._sink_max_size: int = max_size self._max_perf_counter: float = max_perf_counter - SINK_MAX_SIZE_CELING: int = 100000 + SINK_MAX_SIZE_CEILING: int = 100000 """The max size a bulk insert can be""" @property @@ -74,24 +74,22 @@ def max_perf_counter(self) -> float: @property def perf_diff_allowed_min(self) -> float: - """The mininum negative variance allowed, 1/3 worse than wanted.""" + """The minimum negative variance allowed, 1/3 worse than wanted.""" return -1.0 * (self.max_perf_counter * 0.33) @property def perf_diff_allowed_max(self) -> float: - """The maximum postive variace allowed, # 3/4 better than wanted.""" + """The maximum positive variance allowed, # 3/4 better than wanted.""" return self.max_perf_counter * 0.75 @property def perf_diff(self) -> float: - """Difference between wanted elapsed time and actual elpased time.""" - diff = 0 - if self._lap_time: - diff = self.max_perf_counter - self.lap_time + """Difference between wanted elapsed time and actual elapsed time.""" + diff = self.max_perf_counter - self.lap_time if self._lap_time else 0 return float(diff) def counter_based_max_size(self) -> int: # noqa: C901 - """Caclulate performance based batch size.""" + """Calculate performance based batch size.""" correction = 0 if self.perf_diff < self.perf_diff_allowed_min: if self.sink_max_size >= 15000: # noqa: PLR2004 @@ -104,7 +102,7 @@ def counter_based_max_size(self) -> int: # noqa: C901 correction = -10 if ( self.perf_diff >= self.perf_diff_allowed_max - and self.sink_max_size < self.SINK_MAX_SIZE_CELING + and self.sink_max_size < self.SINK_MAX_SIZE_CEILING ): if self.sink_max_size >= 10000: # noqa: PLR2004 correction = 10000 diff --git a/tests/core/test_perf_timer.py b/tests/core/test_perf_timer.py index af73a6a32..94d7b722c 100644 --- a/tests/core/test_perf_timer.py +++ b/tests/core/test_perf_timer.py @@ -78,7 +78,7 @@ def test_batchperftimer_counter_based_max_size_additive(): batchtimer._sink_max_size = 10000 assert batchtimer.counter_based_max_size() == 20000 batchtimer._sink_max_size = 100000 - assert batchtimer.sink_max_size == batchtimer.SINK_MAX_SIZE_CELING + assert batchtimer.sink_max_size == batchtimer.SINK_MAX_SIZE_CEILING assert batchtimer.counter_based_max_size() == 100000 @@ -86,7 +86,7 @@ def test_batchperftimer_counter_based_max_size_reductive(): batchtimer: BatchPerfTimer = BatchPerfTimer(100000, 1) batchtimer._lap_time = 1.5 assert batchtimer.perf_diff < batchtimer.perf_diff_allowed_min - assert batchtimer.sink_max_size == batchtimer.SINK_MAX_SIZE_CELING + assert batchtimer.sink_max_size == batchtimer.SINK_MAX_SIZE_CEILING assert batchtimer.counter_based_max_size() == 95000 batchtimer._sink_max_size = 10000 assert batchtimer.counter_based_max_size() == 9000 From c151f84188807a4bef32edd4383ce55f2b31b96e Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 14 Aug 2023 11:03:20 -0700 Subject: [PATCH 11/67] comment out batch timer additions --- singer_sdk/sinks/sql.py | 100 +++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 58 deletions(-) diff --git a/singer_sdk/sinks/sql.py b/singer_sdk/sinks/sql.py index 9a415bbb1..17697a445 100644 --- a/singer_sdk/sinks/sql.py +++ b/singer_sdk/sinks/sql.py @@ -15,7 +15,6 @@ from singer_sdk.connectors import SQLConnector from singer_sdk.exceptions import ConformedNameClashException from singer_sdk.helpers._conformers import replace_leading_digit -from singer_sdk.helpers._perftimer import BatchPerfTimer from singer_sdk.sinks.batch import BatchSink if t.TYPE_CHECKING: @@ -50,22 +49,15 @@ def __init__( """ self._connector: SQLConnector self._connector = connector or self.connector_class(dict(target.config)) - self._batch_size_rows: int = target.config.get( - "batch_size_rows", - super().MAX_SIZE_DEFAULT, - ) - self._batch_wait_limit_seconds: int | None = target.config.get( - "batch_wait_limit_seconds", - ) + # self._batch_size_rows: int = target.config.get( + # "batch_size_rows", + # self._batch_wait_limit_seconds: int | None = target.config.get( + # "batch_wait_limit_seconds", - self._sink_timer: BatchPerfTimer | None = None - - if self._batch_wait_limit_seconds is not None: - self._batch_size_rows = 100 - self._sink_timer = BatchPerfTimer( - self._batch_size_rows, - self._batch_wait_limit_seconds, - ) + # if self._batch_wait_limit_seconds is not None: + # self._sink_timer = BatchPerfTimer( + # self._batch_size_rows, + # self._batch_wait_limit_seconds, super().__init__(target, stream_name, schema, key_properties) @@ -87,50 +79,45 @@ def connection(self) -> sqlalchemy.engine.Connection: """ return self.connector.connection - @property - def batch_size_rows(self) -> int: - """Get batch_size_rows object. + # @property + # def batch_size_rows(self) -> int: + # """Get batch_size_rows object. - Returns: - A batch_size_rows object. - """ - return self._batch_size_rows + # Returns: + # A batch_size_rows object. + # """ - @batch_size_rows.setter - def batch_size_rows(self, new_value: int) -> None: - """Set batch_size_rows object to the given value. + # @batch_size_rows.setter + # def batch_size_rows(self, new_value: int) -> None: + # """Set batch_size_rows object to the given value. - Args: - new_value: The value you want to set batch_size_rows too. - """ - self._batch_size_rows = new_value + # Args: + # new_value: The value you want to set batch_size_rows too. + # """ - @property - def batch_wait_limit_seconds(self) -> int | None: - """Get batch_wait_limit_seconds object. + # @property + # def batch_wait_limit_seconds(self) -> int | None: + # """Get batch_wait_limit_seconds object. - Returns: - A batch_wait_limit_seconds object. - """ - return self._batch_wait_limit_seconds + # Returns: + # A batch_wait_limit_seconds object. + # """ - @property - def sink_timer(self) -> BatchPerfTimer | None: - """Get sink_timer object. + # @property + # def sink_timer(self) -> BatchPerfTimer | None: + # """Get sink_timer object. - Returns: - A sink_timer object. - """ - return self._sink_timer + # Returns: + # A sink_timer object. + # """ - @property - def max_size(self) -> int: - """Get max batch size. + # @property + # def max_size(self) -> int: + # """Get max batch size. - Returns: - Max number of records to batch before `is_full=True` - """ - return self.batch_size_rows + # Returns: + # Max number of records to batch before `is_full=True` + # """ @property def table_name(self) -> str: @@ -399,14 +386,11 @@ def bulk_insert_records( with self.connector._connect() as conn, conn.begin(): result = conn.execute(insert_sql, new_records) - # Finish Line for max_size perf counter - if self.sink_timer is not None: - if self.sink_timer.start_time is not None: - self.sink_timer.stop() - self.batch_size_rows = self.sink_timer.counter_based_max_size() + # # Finish Line for max_size perf counter + # if self.sink_timer is not None: + # if self.sink_timer.start_time is not None: - # Starting Line for max_size perf counter - self.sink_timer.start() + # # Starting Line for max_size perf counter return result.rowcount def merge_upsert_from_table( From 49c014c312a67c45e97c04a1b30d8f680165528c Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 14 Aug 2023 11:06:51 -0700 Subject: [PATCH 12/67] add BatchPerfTimer and supporting properties --- singer_sdk/sinks/core.py | 73 ++++++++++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 19dfbc31a..77e019cb6 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -23,6 +23,7 @@ StorageTarget, ) from singer_sdk.helpers._compat import final +from singer_sdk.helpers._perftimer import BatchPerfTimer from singer_sdk.helpers._typing import ( DatetimeErrorTreatmentEnum, get_datelike_property_type, @@ -90,6 +91,23 @@ def __init__( self._batch_records_read: int = 0 self._batch_dupe_records_merged: int = 0 + # Batch Performace Timer + self._batch_size_rows: int = target.config.get( + "batch_size_rows", + self.MAX_SIZE_DEFAULT, + ) + self._batch_wait_limit_seconds: int | None = target.config.get( + "batch_wait_limit_seconds", + ) + + self._sink_timer: BatchPerfTimer | None = None + + if self._batch_wait_limit_seconds is not None: + self._batch_size_rows = 100 + self._sink_timer = BatchPerfTimer( + self._batch_size_rows, + self._batch_wait_limit_seconds, + ) self._validator = Draft7Validator(schema, format_checker=FormatChecker()) def _get_context(self, record: dict) -> dict: # noqa: ARG002 @@ -106,16 +124,6 @@ def _get_context(self, record: dict) -> dict: # noqa: ARG002 return {} # Size properties - - @property - def max_size(self) -> int: - """Get max batch size. - - Returns: - Max number of records to batch before `is_full=True` - """ - return self.MAX_SIZE_DEFAULT - @property def current_size(self) -> int: """Get current batch size. @@ -134,6 +142,51 @@ def is_full(self) -> bool: """ return self.current_size >= self.max_size + @property + def batch_size_rows(self) -> int: + """Get batch_size_rows object. + + Returns: + A batch_size_rows object. + """ + return self._batch_size_rows + + @batch_size_rows.setter + def batch_size_rows(self, new_value: int) -> None: + """Set batch_size_rows object to the given value. + + Args: + new_value: The value you want to set batch_size_rows too. + """ + self._batch_size_rows = new_value + + @property + def batch_wait_limit_seconds(self) -> int | None: + """Get batch_wait_limit_seconds object. + + Returns: + A batch_wait_limit_seconds object. + """ + return self._batch_wait_limit_seconds + + @property + def sink_timer(self) -> BatchPerfTimer | None: + """Get sink_timer object. + + Returns: + A sink_timer object. + """ + return self._sink_timer + + @property + def max_size(self) -> int: + """Get max batch size. + + Returns: + Max number of records to batch before `is_full=True` + """ + return self.batch_size_rows + # Tally methods @final From a4257747ada5d59cb1f148fa61b1ec5187343ce4 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 14 Aug 2023 11:57:39 -0700 Subject: [PATCH 13/67] add BatchTimer start and finish to drain_one --- singer_sdk/target_base.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/singer_sdk/target_base.py b/singer_sdk/target_base.py index a5386199f..2acf0a335 100644 --- a/singer_sdk/target_base.py +++ b/singer_sdk/target_base.py @@ -496,6 +496,20 @@ def drain_one(self, sink: Sink) -> None: draining_status = sink.start_drain() sink.process_batch(draining_status) + # Finish Line for max_size perf counter + if sink.sink_timer is not None: + timer_msg: str = ( + f"batch_size_rows: {sink.batch_size_rows}, " + f"min: {sink.sink_timer.perf_diff_allowed_min}, " + f"now: {sink.sink_timer.perf_diff:.4f}, " + f"max {sink.sink_timer.perf_diff_allowed_max}, " + ) + self.logger.info(timer_msg) + if sink.sink_timer.start_time is not None: + sink.sink_timer.stop() + sink.batch_size_rows = sink.sink_timer.counter_based_max_size() + # Starting Line for max_size perf counter + sink.sink_timer.start() sink.mark_drained() def _drain_all(self, sink_list: list[Sink], parallelism: int) -> None: From 379f8147457e934bba31b5f7a009da51a42d42fe Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 14 Aug 2023 12:02:44 -0700 Subject: [PATCH 14/67] remove commented out code --- singer_sdk/sinks/sql.py | 55 ----------------------------------------- 1 file changed, 55 deletions(-) diff --git a/singer_sdk/sinks/sql.py b/singer_sdk/sinks/sql.py index 17697a445..238e83dec 100644 --- a/singer_sdk/sinks/sql.py +++ b/singer_sdk/sinks/sql.py @@ -49,16 +49,6 @@ def __init__( """ self._connector: SQLConnector self._connector = connector or self.connector_class(dict(target.config)) - # self._batch_size_rows: int = target.config.get( - # "batch_size_rows", - # self._batch_wait_limit_seconds: int | None = target.config.get( - # "batch_wait_limit_seconds", - - # if self._batch_wait_limit_seconds is not None: - # self._sink_timer = BatchPerfTimer( - # self._batch_size_rows, - # self._batch_wait_limit_seconds, - super().__init__(target, stream_name, schema, key_properties) @property @@ -79,46 +69,6 @@ def connection(self) -> sqlalchemy.engine.Connection: """ return self.connector.connection - # @property - # def batch_size_rows(self) -> int: - # """Get batch_size_rows object. - - # Returns: - # A batch_size_rows object. - # """ - - # @batch_size_rows.setter - # def batch_size_rows(self, new_value: int) -> None: - # """Set batch_size_rows object to the given value. - - # Args: - # new_value: The value you want to set batch_size_rows too. - # """ - - # @property - # def batch_wait_limit_seconds(self) -> int | None: - # """Get batch_wait_limit_seconds object. - - # Returns: - # A batch_wait_limit_seconds object. - # """ - - # @property - # def sink_timer(self) -> BatchPerfTimer | None: - # """Get sink_timer object. - - # Returns: - # A sink_timer object. - # """ - - # @property - # def max_size(self) -> int: - # """Get max batch size. - - # Returns: - # Max number of records to batch before `is_full=True` - # """ - @property def table_name(self) -> str: """Return the table name, with no schema or database part. @@ -386,11 +336,6 @@ def bulk_insert_records( with self.connector._connect() as conn, conn.begin(): result = conn.execute(insert_sql, new_records) - # # Finish Line for max_size perf counter - # if self.sink_timer is not None: - # if self.sink_timer.start_time is not None: - - # # Starting Line for max_size perf counter return result.rowcount def merge_upsert_from_table( From 0303f259115b7a465eda2eafffa39e103b1962de Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 14 Aug 2023 12:04:36 -0700 Subject: [PATCH 15/67] change perf_diff_allowed_max to .25 better --- singer_sdk/helpers/_perftimer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index 0cf6d1358..65a7d6cd4 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -79,8 +79,8 @@ def perf_diff_allowed_min(self) -> float: @property def perf_diff_allowed_max(self) -> float: - """The maximum positive variance allowed, # 3/4 better than wanted.""" - return self.max_perf_counter * 0.75 + """The maximum positive variance allowed, 1/4 better than wanted.""" + return self.max_perf_counter * 0.25 @property def perf_diff(self) -> float: From 1fa97527040dbe06f743043b76ec8282036e6482 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 14 Aug 2023 12:10:18 -0700 Subject: [PATCH 16/67] update test match perf_diff_allowed_max change --- tests/core/test_perf_timer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/test_perf_timer.py b/tests/core/test_perf_timer.py index 94d7b722c..1793241d9 100644 --- a/tests/core/test_perf_timer.py +++ b/tests/core/test_perf_timer.py @@ -61,7 +61,7 @@ def test_batchperftimer_properties(): assert batchtimer._max_perf_counter is batchtimer.max_perf_counter assert batchtimer.sink_max_size == 100 assert batchtimer.max_perf_counter == 1 - assert batchtimer.perf_diff_allowed_max == 0.75 + assert batchtimer.perf_diff_allowed_max == 0.25 assert batchtimer.perf_diff_allowed_min == -0.33 assert batchtimer.perf_diff == 0.90 From d2db4fb021e9b0b45abfa14418d60f714c275401 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 14 Aug 2023 14:57:35 -0700 Subject: [PATCH 17/67] added tests for batch_size_rows and max_size --- tests/core/test_target_base.py | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index de344c7e3..923308f43 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -141,3 +141,44 @@ def test_add_sqlsink_and_get_sink(): target.get_sink( "bar", ) + + +def test_batch_size_rows_and_max_size(): + input_schema_1 = { + "properties": { + "id": { + "type": ["string", "null"], + }, + "col_ts": { + "format": "date-time", + "type": ["string", "null"], + }, + }, + } + key_properties = [] + target_default = TargetMock() + sink_default = BatchSinkMock( + target=target_default, + stream_name="foo", + schema=input_schema_1, + key_properties=key_properties, + ) + target_set = TargetMock(config={"batch_size_rows": 100000}) + sink_set = BatchSinkMock( + target=target_set, + stream_name="bar", + schema=input_schema_1, + key_properties=key_properties, + ) + assert sink_default.stream_name == "foo" + assert sink_default._batch_size_rows == 10000 + assert sink_default.batch_size_rows == 10000 + assert sink_default.max_size == 10000 + assert sink_set.stream_name == "bar" + assert sink_set._batch_size_rows == 100000 + assert sink_set.batch_size_rows == 100000 + assert sink_set.max_size == 100000 + sink_set.batch_size_rows = 99999 + assert sink_set._batch_size_rows == 99999 + assert sink_set.batch_size_rows == 99999 + assert sink_set.max_size == 99999 From 99a9fb5ed7c78982d359adf3c4ea3cf328972045 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 14 Aug 2023 18:48:01 -0700 Subject: [PATCH 18/67] add batch_wait_limit_seconds tests --- tests/core/test_target_base.py | 47 ++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index 923308f43..7bad99309 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -182,3 +182,50 @@ def test_batch_size_rows_and_max_size(): assert sink_set._batch_size_rows == 99999 assert sink_set.batch_size_rows == 99999 assert sink_set.max_size == 99999 + + +def test_batch_wait_limit_seconds(): + input_schema_1 = { + "properties": { + "id": { + "type": ["string", "null"], + }, + "col_ts": { + "format": "date-time", + "type": ["string", "null"], + }, + }, + } + key_properties = [] + target_default = TargetMock() + sink_default = BatchSinkMock( + target=target_default, + stream_name="foo", + schema=input_schema_1, + key_properties=key_properties, + ) + target_set = TargetMock(config={"batch_wait_limit_seconds": 1}) + sink_set = BatchSinkMock( + target=target_set, + stream_name="bar", + schema=input_schema_1, + key_properties=key_properties, + ) + assert sink_default.stream_name == "foo" + assert sink_default.sink_timer is None + assert sink_default.batch_size_rows == 10000 + assert sink_default.max_size == 10000 + assert sink_set.stream_name == "bar" + assert sink_set.sink_timer is not None + assert sink_set.batch_size_rows == 100 + assert sink_set.max_size == 100 + sink_set._lap_manager() + assert sink_set.sink_timer.start_time > 0.0 + assert sink_set.sink_timer.stop_time is None + assert sink_set.sink_timer.lap_time is None + assert sink_set.sink_timer.perf_diff == 0.0 + sink_set._lap_manager() + assert sink_set.sink_timer.start_time > 0.0 + assert sink_set.sink_timer.stop_time is None + assert sink_set.sink_timer.lap_time > 0.0 + assert sink_set.sink_timer.perf_diff > 0.0 From 059c3926420a20064eaacc2133b5d49abcd29091 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 14 Aug 2023 18:49:18 -0700 Subject: [PATCH 19/67] added _lap_manager --- singer_sdk/sinks/core.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 77e019cb6..f87028a56 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -277,6 +277,25 @@ def key_properties(self) -> list[str]: """ return self._key_properties + # Timer Management + + def _lap_manager(self) -> None: + """Start and Stop the Perf Time during drain. + + This Method is called during a target drane_one action. + """ + timer_msg: str = ( + f"batch_size_rows: {self.batch_size_rows}, " + f"min: {self.sink_timer.perf_diff_allowed_min}, " + f"now: {self.sink_timer.perf_diff:.4f}, " + f"max {self.sink_timer.perf_diff_allowed_max}, " + ) + self.logger.info(timer_msg) + if self.sink_timer.start_time is not None: + self.sink_timer.stop() + self.batch_size_rows = self.sink_timer.counter_based_max_size() + self.sink_timer.start() + # Record processing def _add_sdc_metadata_to_record( From 7ac5a6c12d8e631d0110a9ef841491bad4befe17 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 14 Aug 2023 18:50:38 -0700 Subject: [PATCH 20/67] calling _lap_manager in drain_one --- singer_sdk/target_base.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/singer_sdk/target_base.py b/singer_sdk/target_base.py index 2acf0a335..3665bead2 100644 --- a/singer_sdk/target_base.py +++ b/singer_sdk/target_base.py @@ -496,20 +496,8 @@ def drain_one(self, sink: Sink) -> None: draining_status = sink.start_drain() sink.process_batch(draining_status) - # Finish Line for max_size perf counter if sink.sink_timer is not None: - timer_msg: str = ( - f"batch_size_rows: {sink.batch_size_rows}, " - f"min: {sink.sink_timer.perf_diff_allowed_min}, " - f"now: {sink.sink_timer.perf_diff:.4f}, " - f"max {sink.sink_timer.perf_diff_allowed_max}, " - ) - self.logger.info(timer_msg) - if sink.sink_timer.start_time is not None: - sink.sink_timer.stop() - sink.batch_size_rows = sink.sink_timer.counter_based_max_size() - # Starting Line for max_size perf counter - sink.sink_timer.start() + sink._lap_manager() sink.mark_drained() def _drain_all(self, sink_list: list[Sink], parallelism: int) -> None: From 0420db811bc7ef6a5a051ebb73eeb7c4f2ab3f2b Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 15 Aug 2023 07:58:22 -0700 Subject: [PATCH 21/67] mypy fix: moved sink_timer check into _lap_manager --- singer_sdk/sinks/core.py | 23 ++++++++++++----------- singer_sdk/target_base.py | 3 +-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index f87028a56..be9b5d146 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -284,17 +284,18 @@ def _lap_manager(self) -> None: This Method is called during a target drane_one action. """ - timer_msg: str = ( - f"batch_size_rows: {self.batch_size_rows}, " - f"min: {self.sink_timer.perf_diff_allowed_min}, " - f"now: {self.sink_timer.perf_diff:.4f}, " - f"max {self.sink_timer.perf_diff_allowed_max}, " - ) - self.logger.info(timer_msg) - if self.sink_timer.start_time is not None: - self.sink_timer.stop() - self.batch_size_rows = self.sink_timer.counter_based_max_size() - self.sink_timer.start() + if self.sink_timer is not None: + timer_msg: str = ( + f"batch_size_rows: {self.batch_size_rows}, " + f"min: {self.sink_timer.perf_diff_allowed_min}, " + f"now: {self.sink_timer.perf_diff:.4f}, " + f"max {self.sink_timer.perf_diff_allowed_max}, " + ) + self.logger.info(timer_msg) + if self.sink_timer.start_time is not None: + self.sink_timer.stop() + self.batch_size_rows = self.sink_timer.counter_based_max_size() + self.sink_timer.start() # Record processing diff --git a/singer_sdk/target_base.py b/singer_sdk/target_base.py index 3665bead2..5fb87fd54 100644 --- a/singer_sdk/target_base.py +++ b/singer_sdk/target_base.py @@ -496,8 +496,7 @@ def drain_one(self, sink: Sink) -> None: draining_status = sink.start_drain() sink.process_batch(draining_status) - if sink.sink_timer is not None: - sink._lap_manager() + sink._lap_manager() sink.mark_drained() def _drain_all(self, sink_list: list[Sink], parallelism: int) -> None: From 1aec608163d371dc59dc839129884bc348200d8d Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 15 Aug 2023 13:45:50 -0600 Subject: [PATCH 22/67] Apply suggestions from code review Co-authored-by: Edgar R. M. --- singer_sdk/sinks/core.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index be9b5d146..27b31b5cb 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -282,16 +282,17 @@ def key_properties(self) -> list[str]: def _lap_manager(self) -> None: """Start and Stop the Perf Time during drain. - This Method is called during a target drane_one action. + This method is called when the target triggers a drain on this sink. """ if self.sink_timer is not None: - timer_msg: str = ( - f"batch_size_rows: {self.batch_size_rows}, " - f"min: {self.sink_timer.perf_diff_allowed_min}, " - f"now: {self.sink_timer.perf_diff:.4f}, " - f"max {self.sink_timer.perf_diff_allowed_max}, " + timer_msg: str = "batch_size_rows: %f, min: %f, now: %.4f, max: %f" + self.logger.info( + timer_msg, + self.batch_size_rows, + self.sink_timer.perf_diff_allowed_min, + self.sink_timer.perf_diff, + self.sink_timer.perf_diff_allowed_max, ) - self.logger.info(timer_msg) if self.sink_timer.start_time is not None: self.sink_timer.stop() self.batch_size_rows = self.sink_timer.counter_based_max_size() From b5e93a6595450e8533a2f05e8f7b7f63177729dc Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 15 Aug 2023 13:14:17 -0700 Subject: [PATCH 23/67] added batch_wait_limit_seconds value test --- tests/core/test_target_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index 7bad99309..7ff360ca4 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -216,6 +216,7 @@ def test_batch_wait_limit_seconds(): assert sink_default.batch_size_rows == 10000 assert sink_default.max_size == 10000 assert sink_set.stream_name == "bar" + assert sink_set.batch_wait_limit_seconds == 1 assert sink_set.sink_timer is not None assert sink_set.batch_size_rows == 100 assert sink_set.max_size == 100 From 57b7f374e183b548d4ccd14da24d96b287652721 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 15 Aug 2023 13:54:36 -0700 Subject: [PATCH 24/67] add tests for new settings --- tests/core/test_target_base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index 7ff360ca4..1586b76fe 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -76,6 +76,8 @@ def test_target_about_info(): assert "flattening_max_depth" in about.settings["properties"] assert "batch_config" in about.settings["properties"] assert "add_record_metadata" in about.settings["properties"] + assert "batch_size_rows" in about.settings["properties"] + assert "batch_wait_limit_seconds" in about.settings["properties"] def test_sql_get_sink(): From f5874c98487bf81a247490dfe81b9b3b00710b1c Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 15 Aug 2023 13:56:27 -0700 Subject: [PATCH 25/67] BATCH_SIZE_ROWS_CONFIG added --- singer_sdk/helpers/capabilities.py | 7 +++++++ singer_sdk/target_base.py | 2 ++ 2 files changed, 9 insertions(+) diff --git a/singer_sdk/helpers/capabilities.py b/singer_sdk/helpers/capabilities.py index f5b5fa305..37dbebc03 100644 --- a/singer_sdk/helpers/capabilities.py +++ b/singer_sdk/helpers/capabilities.py @@ -106,6 +106,13 @@ description="Add metadata to records.", ), ).to_dict() +BATCH_SIZE_ROWS_CONFIG = PropertiesList( + Property( + "batch_size_rows", + IntegerType, + description="Maximum number of rows in each batch.", + ), +).to_dict() class DeprecatedEnum(Enum): diff --git a/singer_sdk/target_base.py b/singer_sdk/target_base.py index 5fb87fd54..1d194478f 100644 --- a/singer_sdk/target_base.py +++ b/singer_sdk/target_base.py @@ -19,6 +19,7 @@ from singer_sdk.helpers.capabilities import ( ADD_RECORD_METADATA_CONFIG, BATCH_CONFIG, + BATCH_SIZE_ROWS_CONFIG, TARGET_SCHEMA_CONFIG, CapabilitiesEnum, PluginCapabilities, @@ -598,6 +599,7 @@ def _merge_missing(source_jsonschema: dict, target_jsonschema: dict) -> None: target_jsonschema["properties"][k] = v _merge_missing(ADD_RECORD_METADATA_CONFIG, config_jsonschema) + _merge_missing(BATCH_SIZE_ROWS_CONFIG, config_jsonschema) capabilities = cls.capabilities From 3b73e649db596c521ab312c6977c5a698d46da01 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 15 Aug 2023 14:00:17 -0700 Subject: [PATCH 26/67] BATCH_WAIT_LIMIT_SECONDS_CONFIG added --- singer_sdk/helpers/capabilities.py | 7 +++++++ singer_sdk/target_base.py | 2 ++ 2 files changed, 9 insertions(+) diff --git a/singer_sdk/helpers/capabilities.py b/singer_sdk/helpers/capabilities.py index 37dbebc03..ef218ebdf 100644 --- a/singer_sdk/helpers/capabilities.py +++ b/singer_sdk/helpers/capabilities.py @@ -113,6 +113,13 @@ description="Maximum number of rows in each batch.", ), ).to_dict() +BATCH_WAIT_LIMIT_SECONDS_CONFIG = PropertiesList( + Property( + "batch_wait_limit_seconds", + IntegerType, + description="Maximum time to elapse for a batch to fill, drain, and load.", + ), +).to_dict() class DeprecatedEnum(Enum): diff --git a/singer_sdk/target_base.py b/singer_sdk/target_base.py index 1d194478f..92976558b 100644 --- a/singer_sdk/target_base.py +++ b/singer_sdk/target_base.py @@ -20,6 +20,7 @@ ADD_RECORD_METADATA_CONFIG, BATCH_CONFIG, BATCH_SIZE_ROWS_CONFIG, + BATCH_WAIT_LIMIT_SECONDS_CONFIG, TARGET_SCHEMA_CONFIG, CapabilitiesEnum, PluginCapabilities, @@ -600,6 +601,7 @@ def _merge_missing(source_jsonschema: dict, target_jsonschema: dict) -> None: _merge_missing(ADD_RECORD_METADATA_CONFIG, config_jsonschema) _merge_missing(BATCH_SIZE_ROWS_CONFIG, config_jsonschema) + _merge_missing(BATCH_WAIT_LIMIT_SECONDS_CONFIG, config_jsonschema) capabilities = cls.capabilities From b48f3d01b7baaa0f346cad0ae458062e61868e68 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Wed, 16 Aug 2023 10:47:07 -0700 Subject: [PATCH 27/67] move logging and update formatting. --- singer_sdk/sinks/core.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 27b31b5cb..44b58f2eb 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -285,16 +285,18 @@ def _lap_manager(self) -> None: This method is called when the target triggers a drain on this sink. """ if self.sink_timer is not None: - timer_msg: str = "batch_size_rows: %f, min: %f, now: %.4f, max: %f" - self.logger.info( - timer_msg, - self.batch_size_rows, - self.sink_timer.perf_diff_allowed_min, - self.sink_timer.perf_diff, - self.sink_timer.perf_diff_allowed_max, + timer_msg: str = ( + "batch_size_rows: %.0f, min_diff: %.2f, lap_diff: %.4f, max_diff: %.2f" ) if self.sink_timer.start_time is not None: self.sink_timer.stop() + self.logger.info( + timer_msg, + self.batch_size_rows, + self.sink_timer.perf_diff_allowed_min, + self.sink_timer.perf_diff, + self.sink_timer.perf_diff_allowed_max, + ) self.batch_size_rows = self.sink_timer.counter_based_max_size() self.sink_timer.start() From 3cee381c424e7889f6f866de795e90c44c53ef88 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 17 Aug 2023 09:51:45 -0700 Subject: [PATCH 28/67] add pertimer on_the_clock tests --- tests/core/test_perf_timer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/core/test_perf_timer.py b/tests/core/test_perf_timer.py index 1793241d9..78a532fcd 100644 --- a/tests/core/test_perf_timer.py +++ b/tests/core/test_perf_timer.py @@ -29,6 +29,7 @@ def test_perftimer_actions(): assert timer.stop_time is None assert timer.lap_time is None time.sleep(1.1) + assert timer.on_the_clock() >= 1 timer.stop() assert timer.lap_time >= 1 assert timer.lap_time < 1.5 @@ -43,6 +44,11 @@ def test_perftimer_errors(): match=r"Timer is not running. Use .start\(\) to start it", ): timer.stop() + with pytest.raises( + PerfTimerError, + match=r"Timer is not running. Use .start\(\) to start it", + ): + timer.on_the_clock() # starting a timer to test start() error timer.start() with pytest.raises( From 7025280c50c4a2c29c4acae7cad7752ea612418b Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 17 Aug 2023 10:05:05 -0700 Subject: [PATCH 29/67] add batchperftimer is_too_old tests --- tests/core/test_perf_timer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/core/test_perf_timer.py b/tests/core/test_perf_timer.py index 78a532fcd..3391be67f 100644 --- a/tests/core/test_perf_timer.py +++ b/tests/core/test_perf_timer.py @@ -72,6 +72,15 @@ def test_batchperftimer_properties(): assert batchtimer.perf_diff == 0.90 +def test_batchperftimer_is_too_old(): + batchtimer: BatchPerfTimer = BatchPerfTimer(100, 1) + assert batchtimer.is_too_old is False + batchtimer.start() + time.sleep(1.1) + assert batchtimer.is_too_old is True + batchtimer.stop() + + def test_batchperftimer_counter_based_max_size_additive(): batchtimer: BatchPerfTimer = BatchPerfTimer(10, 1) batchtimer._lap_time = 0.24 From 8bbb346600d3df9d5cbe9667ecfdbdb3d94d6e7e Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 17 Aug 2023 10:08:08 -0700 Subject: [PATCH 30/67] add PerfTimer.on_the_clock() --- singer_sdk/helpers/_perftimer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index 65a7d6cd4..b5d0cd54f 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -36,6 +36,14 @@ def start(self) -> None: self._start_time = time.perf_counter() + def on_the_clock(self) -> float: + """Give the time on the clock.""" + if self._start_time is None: + msg = "Timer is not running. Use .start() to start it" + raise PerfTimerError(msg) + + return time.perf_counter() - self._start_time + def stop(self) -> None: """Stop the timer, Stores the elapsed time, and reset.""" if self._start_time is None: From fd8f17142ea715361d65846626941454f54069e3 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 17 Aug 2023 10:10:10 -0700 Subject: [PATCH 31/67] add BatchPerfTimer.is_too_old --- singer_sdk/helpers/_perftimer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index b5d0cd54f..583c9d150 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -96,6 +96,14 @@ def perf_diff(self) -> float: diff = self.max_perf_counter - self.lap_time if self._lap_time else 0 return float(diff) + @property + def is_too_old(self) -> bool: + return ( + self.on_the_clock() > self.max_perf_counter + if self.start_time is not None + else False + ) + def counter_based_max_size(self) -> int: # noqa: C901 """Calculate performance based batch size.""" correction = 0 From 2cfa5167bf2e1e922045a2a7d110869f1a4fc5f6 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 17 Aug 2023 10:54:37 -0700 Subject: [PATCH 32/67] add Sink.is_too_old tests --- tests/core/test_target_base.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index 1586b76fe..5e9113579 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -1,6 +1,7 @@ from __future__ import annotations import copy +import time import pytest @@ -220,15 +221,18 @@ def test_batch_wait_limit_seconds(): assert sink_set.stream_name == "bar" assert sink_set.batch_wait_limit_seconds == 1 assert sink_set.sink_timer is not None - assert sink_set.batch_size_rows == 100 - assert sink_set.max_size == 100 - sink_set._lap_manager() - assert sink_set.sink_timer.start_time > 0.0 + assert sink_set.sink_timer.start_time is not None + assert sink_set.batch_size_rows == 10000 + assert sink_set.max_size == 10000 + assert sink_set.is_too_old is False + time.sleep(1.1) + assert sink_set.is_too_old is True + assert sink_set.sink_timer.start_time > 1.1 assert sink_set.sink_timer.stop_time is None assert sink_set.sink_timer.lap_time is None assert sink_set.sink_timer.perf_diff == 0.0 sink_set._lap_manager() assert sink_set.sink_timer.start_time > 0.0 assert sink_set.sink_timer.stop_time is None - assert sink_set.sink_timer.lap_time > 0.0 - assert sink_set.sink_timer.perf_diff > 0.0 + assert sink_set.sink_timer.lap_time > 1.0 + assert sink_set.sink_timer.perf_diff < 0.0 From da2808a77d1fc00fef8d74147970417b801fe7f1 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 17 Aug 2023 11:01:23 -0700 Subject: [PATCH 33/67] added property Sink.is_too_old --- singer_sdk/sinks/core.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 44b58f2eb..e878f155e 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -103,11 +103,12 @@ def __init__( self._sink_timer: BatchPerfTimer | None = None if self._batch_wait_limit_seconds is not None: - self._batch_size_rows = 100 self._sink_timer = BatchPerfTimer( self._batch_size_rows, self._batch_wait_limit_seconds, ) + self._sink_timer.start() + self._validator = Draft7Validator(schema, format_checker=FormatChecker()) def _get_context(self, record: dict) -> dict: # noqa: ARG002 @@ -142,6 +143,15 @@ def is_full(self) -> bool: """ return self.current_size >= self.max_size + @property + def is_too_old(self) -> bool: + """Check against limit in seconds. + + Returns: + True if the sink needs to be drained. + """ + return self.sink_timer.is_too_old if self.sink_timer is not None else False + @property def batch_size_rows(self) -> int: """Get batch_size_rows object. From 01962e03ee6c815c6e12fa55e02f0eb52c088a97 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 12:59:15 -0700 Subject: [PATCH 34/67] remove is_too_old tests --- tests/core/test_perf_timer.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/core/test_perf_timer.py b/tests/core/test_perf_timer.py index 3391be67f..78a532fcd 100644 --- a/tests/core/test_perf_timer.py +++ b/tests/core/test_perf_timer.py @@ -72,15 +72,6 @@ def test_batchperftimer_properties(): assert batchtimer.perf_diff == 0.90 -def test_batchperftimer_is_too_old(): - batchtimer: BatchPerfTimer = BatchPerfTimer(100, 1) - assert batchtimer.is_too_old is False - batchtimer.start() - time.sleep(1.1) - assert batchtimer.is_too_old is True - batchtimer.stop() - - def test_batchperftimer_counter_based_max_size_additive(): batchtimer: BatchPerfTimer = BatchPerfTimer(10, 1) batchtimer._lap_time = 0.24 From c89fd744350a47108c42fbd3d6c853636d724fcc Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 13:08:45 -0700 Subject: [PATCH 35/67] move is_too_old to Sink --- singer_sdk/helpers/_perftimer.py | 8 -------- singer_sdk/sinks/core.py | 6 +++++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index 583c9d150..b5d0cd54f 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -96,14 +96,6 @@ def perf_diff(self) -> float: diff = self.max_perf_counter - self.lap_time if self._lap_time else 0 return float(diff) - @property - def is_too_old(self) -> bool: - return ( - self.on_the_clock() > self.max_perf_counter - if self.start_time is not None - else False - ) - def counter_based_max_size(self) -> int: # noqa: C901 """Calculate performance based batch size.""" correction = 0 diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index e878f155e..18503f709 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -150,7 +150,11 @@ def is_too_old(self) -> bool: Returns: True if the sink needs to be drained. """ - return self.sink_timer.is_too_old if self.sink_timer is not None else False + return ( + self.sink_timer.on_the_clock() > self.batch_wait_limit_seconds + if self.sink_timer.start_time is not None + else False + ) @property def batch_size_rows(self) -> int: From 2f51fd4fe2567f7857be1990a5bbc94bf416fcf1 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 13:27:24 -0700 Subject: [PATCH 36/67] Add is_too_old drain to _process_record_message --- singer_sdk/target_base.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/singer_sdk/target_base.py b/singer_sdk/target_base.py index 92976558b..f1693c3ea 100644 --- a/singer_sdk/target_base.py +++ b/singer_sdk/target_base.py @@ -358,6 +358,14 @@ def _process_record_message(self, message_dict: dict) -> None: ) self.drain_one(sink) + if sink.is_too_old: + self.logger.info( + "Target sink for '%s' is to old. Current size is '%s'. Draining...", + sink.stream_name, + sink.current_size, + ) + self.drain_one(sink) + self._handle_max_record_age() def _process_schema_message(self, message_dict: dict) -> None: From bf582e00f0dd1a45c4f41469f98eadea4b62b02a Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 13:32:57 -0700 Subject: [PATCH 37/67] fix AttributeError NoneType has no start_time --- singer_sdk/sinks/core.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 18503f709..5105ac526 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -151,8 +151,12 @@ def is_too_old(self) -> bool: True if the sink needs to be drained. """ return ( - self.sink_timer.on_the_clock() > self.batch_wait_limit_seconds - if self.sink_timer.start_time is not None + ( + self.sink_timer.on_the_clock() > self.batch_wait_limit_seconds + if self.sink_timer.start_time is not None + else False + ) + if self.sink_timer is not None else False ) From 01705c6323ab85326b28d7a1049f06bd4b56b078 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 13:45:03 -0700 Subject: [PATCH 38/67] remove counter_based_max_size and logging --- singer_sdk/sinks/core.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 5105ac526..5fd522f0b 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -303,19 +303,8 @@ def _lap_manager(self) -> None: This method is called when the target triggers a drain on this sink. """ if self.sink_timer is not None: - timer_msg: str = ( - "batch_size_rows: %.0f, min_diff: %.2f, lap_diff: %.4f, max_diff: %.2f" - ) if self.sink_timer.start_time is not None: self.sink_timer.stop() - self.logger.info( - timer_msg, - self.batch_size_rows, - self.sink_timer.perf_diff_allowed_min, - self.sink_timer.perf_diff, - self.sink_timer.perf_diff_allowed_max, - ) - self.batch_size_rows = self.sink_timer.counter_based_max_size() self.sink_timer.start() # Record processing From 29ae3de33f8728c3b89fe8418c67bbc2530343ef Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 14:10:40 -0700 Subject: [PATCH 39/67] clean up test_batch_wait_limit_seconds --- tests/core/test_target_base.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index 5e9113579..c4f6bd7ec 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -200,25 +200,14 @@ def test_batch_wait_limit_seconds(): }, } key_properties = [] - target_default = TargetMock() - sink_default = BatchSinkMock( - target=target_default, - stream_name="foo", - schema=input_schema_1, - key_properties=key_properties, - ) target_set = TargetMock(config={"batch_wait_limit_seconds": 1}) sink_set = BatchSinkMock( target=target_set, - stream_name="bar", + stream_name="foo", schema=input_schema_1, key_properties=key_properties, ) - assert sink_default.stream_name == "foo" - assert sink_default.sink_timer is None - assert sink_default.batch_size_rows == 10000 - assert sink_default.max_size == 10000 - assert sink_set.stream_name == "bar" + assert sink_set.stream_name == "foo" assert sink_set.batch_wait_limit_seconds == 1 assert sink_set.sink_timer is not None assert sink_set.sink_timer.start_time is not None From 1b5cc6a097a290eeb3edcdb125951ae9fc552974 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 15:06:19 -0700 Subject: [PATCH 40/67] update test for batch_dynamic_managment --- tests/core/test_perf_timer.py | 47 +++++++++++++++++++------------ tests/core/test_target_base.py | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 17 deletions(-) diff --git a/tests/core/test_perf_timer.py b/tests/core/test_perf_timer.py index 78a532fcd..c3abce8df 100644 --- a/tests/core/test_perf_timer.py +++ b/tests/core/test_perf_timer.py @@ -61,44 +61,57 @@ def test_perftimer_errors(): def test_batchperftimer_properties(): - batchtimer: BatchPerfTimer = BatchPerfTimer(100, 1) - batchtimer._lap_time = 0.10 + batchtimer: BatchPerfTimer = BatchPerfTimer(100000, 30) + batchtimer._lap_time = 20 assert batchtimer._sink_max_size is batchtimer.sink_max_size assert batchtimer._max_perf_counter is batchtimer.max_perf_counter + assert batchtimer.SINK_MAX_SIZE_CEILING == 100000 assert batchtimer.sink_max_size == 100 - assert batchtimer.max_perf_counter == 1 - assert batchtimer.perf_diff_allowed_max == 0.25 - assert batchtimer.perf_diff_allowed_min == -0.33 - assert batchtimer.perf_diff == 0.90 + assert batchtimer.max_perf_counter == 30 + assert batchtimer.perf_diff_allowed_max == 7.5 + assert batchtimer.perf_diff_allowed_min == -9.9 + assert batchtimer.perf_diff == 10 def test_batchperftimer_counter_based_max_size_additive(): - batchtimer: BatchPerfTimer = BatchPerfTimer(10, 1) + batchtimer: BatchPerfTimer = BatchPerfTimer(100000, 1) batchtimer._lap_time = 0.24 assert batchtimer.perf_diff > batchtimer.perf_diff_allowed_max - assert batchtimer.counter_based_max_size() == 20 + batchtimer._sink_max_size = 10 + batchtimer.counter_based_max_size() + assert batchtimer.sink_max_size == 20 batchtimer._sink_max_size = 100 - assert batchtimer.counter_based_max_size() == 200 + batchtimer.counter_based_max_size() + assert batchtimer.sink_max_size == 200 batchtimer._sink_max_size = 1000 - assert batchtimer.counter_based_max_size() == 2000 + batchtimer.counter_based_max_size() + assert batchtimer.sink_max_size == 2000 batchtimer._sink_max_size = 10000 - assert batchtimer.counter_based_max_size() == 20000 + batchtimer.counter_based_max_size() + assert batchtimer.sink_max_size == 20000 batchtimer._sink_max_size = 100000 assert batchtimer.sink_max_size == batchtimer.SINK_MAX_SIZE_CEILING - assert batchtimer.counter_based_max_size() == 100000 + batchtimer.counter_based_max_size() + assert batchtimer.sink_max_size == 100000 def test_batchperftimer_counter_based_max_size_reductive(): batchtimer: BatchPerfTimer = BatchPerfTimer(100000, 1) batchtimer._lap_time = 1.5 + batchtimer._sink_max_size = 100000 assert batchtimer.perf_diff < batchtimer.perf_diff_allowed_min assert batchtimer.sink_max_size == batchtimer.SINK_MAX_SIZE_CEILING - assert batchtimer.counter_based_max_size() == 95000 + batchtimer.counter_based_max_size() + assert batchtimer.sink_max_size == 95000 batchtimer._sink_max_size = 10000 - assert batchtimer.counter_based_max_size() == 9000 + batchtimer.counter_based_max_size() + assert batchtimer.sink_max_size == 9000 batchtimer._sink_max_size = 1000 - assert batchtimer.counter_based_max_size() == 900 + batchtimer.counter_based_max_size() + assert batchtimer.sink_max_size == 900 batchtimer._sink_max_size = 100 - assert batchtimer.counter_based_max_size() == 90 + batchtimer.counter_based_max_size() + assert batchtimer.sink_max_size == 90 batchtimer._sink_max_size = 10 - assert batchtimer.counter_based_max_size() == 10 + batchtimer.counter_based_max_size() + assert batchtimer.sink_max_size == 10 diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index c4f6bd7ec..426ec1511 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -225,3 +225,54 @@ def test_batch_wait_limit_seconds(): assert sink_set.sink_timer.stop_time is None assert sink_set.sink_timer.lap_time > 1.0 assert sink_set.sink_timer.perf_diff < 0.0 + + +def test_batch_dynamic_management(): + input_schema_1 = { + "properties": { + "id": { + "type": ["string", "null"], + }, + "col_ts": { + "format": "date-time", + "type": ["string", "null"], + }, + }, + } + key_properties = [] + target_set = TargetMock( + config={ + "batch_size_rows": 100000, + "batch_wait_limit_seconds": 2, + "batch_dynamic_management": True, + }, + ) + sink_set = BatchSinkMock( + target=target_set, + stream_name="foo", + schema=input_schema_1, + key_properties=key_properties, + ) + assert sink_set.stream_name == "foo" + assert sink_set.batch_dynamic_management is True + assert sink_set.batch_wait_limit_seconds == 2 + assert sink_set.sink_timer is not None + assert sink_set.sink_timer.start_time is not None + assert sink_set.batch_size_rows == 100000 + assert sink_set.max_size == 100000 + assert sink_set.sink_timer.sink_max_size == 100 + time.sleep(1.1) + sink_set._lap_manager() + assert sink_set.sink_timer.start_time > 0.0 + assert sink_set.sink_timer.stop_time is None + assert sink_set.sink_timer.lap_time > 1.0 + assert sink_set.sink_timer.lap_time < 2.0 + assert sink_set.sink_timer.perf_diff > 0.0 + assert sink_set.sink_timer.sink_max_size == 200 + time.sleep(2.1) + sink_set._lap_manager() + assert sink_set.sink_timer.start_time > 0.0 + assert sink_set.sink_timer.stop_time is None + assert sink_set.sink_timer.lap_time > 1.0 + assert sink_set.sink_timer.perf_diff < 0.0 + assert sink_set.sink_timer.sink_max_size == 190 From 3d19a0391588860dfe4c226d770c6d01baf9e805 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 15:08:07 -0700 Subject: [PATCH 41/67] BatchPerfTimer take in a max and internally sink_max_size internally --- singer_sdk/helpers/_perftimer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index b5d0cd54f..5c35c9f2d 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -64,12 +64,15 @@ def __init__( max_size: int, max_perf_counter: float, ) -> None: - self._sink_max_size: int = max_size + self.SINK_MAX_SIZE_CEILING: int = max_size self._max_perf_counter: float = max_perf_counter - SINK_MAX_SIZE_CEILING: int = 100000 + SINK_MAX_SIZE_CEILING: int """The max size a bulk insert can be""" + _sink_max_size: int = 100 + """Hold the calculated batch size""" + @property def sink_max_size(self) -> int: """The current MAX_SIZE_DEFAULT.""" @@ -96,7 +99,7 @@ def perf_diff(self) -> float: diff = self.max_perf_counter - self.lap_time if self._lap_time else 0 return float(diff) - def counter_based_max_size(self) -> int: # noqa: C901 + def counter_based_max_size(self) -> None: # noqa: C901 """Calculate performance based batch size.""" correction = 0 if self.perf_diff < self.perf_diff_allowed_min: @@ -121,4 +124,3 @@ def counter_based_max_size(self) -> int: # noqa: C901 elif self.sink_max_size >= 10: # noqa: PLR2004 correction = 10 self._sink_max_size += correction - return self.sink_max_size From 50d00c2429daa7fae28c17f23816c8a2eaa39a94 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 15:11:26 -0700 Subject: [PATCH 42/67] add batch_dynamic_management to Sink --- singer_sdk/sinks/core.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 5fd522f0b..3deb3bdd5 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -99,6 +99,9 @@ def __init__( self._batch_wait_limit_seconds: int | None = target.config.get( "batch_wait_limit_seconds", ) + self._batch_dynamic_management: bool | None = target.config.get( + "batch_dynamic_management", + ) self._sink_timer: BatchPerfTimer | None = None @@ -187,6 +190,15 @@ def batch_wait_limit_seconds(self) -> int | None: """ return self._batch_wait_limit_seconds + @property + def batch_dynamic_management(self) -> bool: + """Get batch_dynamic_management object. + + Returns: + A batch_dynamic_management object. + """ + return self._batch_dynamic_management + @property def sink_timer(self) -> BatchPerfTimer | None: """Get sink_timer object. From fb5d6f2fad735d3eb03c7e3b56b33b2e3d383d34 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 15:21:03 -0700 Subject: [PATCH 43/67] updated perf_diff tests to utilize allowed min and max --- tests/core/test_target_base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index 426ec1511..e1f27b69e 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -267,12 +267,12 @@ def test_batch_dynamic_management(): assert sink_set.sink_timer.stop_time is None assert sink_set.sink_timer.lap_time > 1.0 assert sink_set.sink_timer.lap_time < 2.0 - assert sink_set.sink_timer.perf_diff > 0.0 + assert sink_set.sink_timer.perf_diff > sink_set.sink_timer.perf_diff_allowed_max assert sink_set.sink_timer.sink_max_size == 200 - time.sleep(2.1) + time.sleep(3.1) sink_set._lap_manager() assert sink_set.sink_timer.start_time > 0.0 assert sink_set.sink_timer.stop_time is None assert sink_set.sink_timer.lap_time > 1.0 - assert sink_set.sink_timer.perf_diff < 0.0 + assert sink_set.sink_timer.perf_diff < sink_set.sink_timer.perf_diff_allowed_min assert sink_set.sink_timer.sink_max_size == 190 From e574003429f1a1dfa1e267dc3f84d50c13825d8e Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 15:24:37 -0700 Subject: [PATCH 44/67] add call to counter_based_max_size and logging to _lap_manager --- singer_sdk/sinks/core.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 3deb3bdd5..ac8a396b4 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -317,6 +317,18 @@ def _lap_manager(self) -> None: if self.sink_timer is not None: if self.sink_timer.start_time is not None: self.sink_timer.stop() + timer_msg: str = ( + "max_size: %.0f, min_diff: %.2f, lap_diff: %.4f, max_diff: %.2f" + ) + if self.batch_dynamic_management: + self.logger.info( + timer_msg, + self.sink_timer.sink_max_size, + self.sink_timer.perf_diff_allowed_min, + self.sink_timer.perf_diff, + self.sink_timer.perf_diff_allowed_max, + ) + self.sink_timer.counter_based_max_size() self.sink_timer.start() # Record processing From 7f48881940710c5e4d64f6309121c21f38811699 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 18 Aug 2023 15:40:44 -0700 Subject: [PATCH 45/67] expand is_full to utilize sink_timer.sink_max_size --- singer_sdk/sinks/core.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index ac8a396b4..8a6492775 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -144,7 +144,11 @@ def is_full(self) -> bool: Returns: True if the sink needs to be drained. """ - return self.current_size >= self.max_size + return ( + self.current_size >= self.sink_timer.sink_max_size + if self.batch_dynamic_management + else self.current_size >= self.max_size + ) @property def is_too_old(self) -> bool: From 1261c04ae8f733c1f0dc52993b3a93409136aab4 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 22 Aug 2023 10:12:36 -0700 Subject: [PATCH 46/67] mypy fixes --- singer_sdk/helpers/_perftimer.py | 6 +++++- singer_sdk/sinks/core.py | 6 ++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index 5c35c9f2d..b9294af70 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -42,7 +42,11 @@ def on_the_clock(self) -> float: msg = "Timer is not running. Use .start() to start it" raise PerfTimerError(msg) - return time.perf_counter() - self._start_time + return ( + time.perf_counter() - self._start_time + if self._start_time is not None + else 0.0 + ) def stop(self) -> None: """Stop the timer, Stores the elapsed time, and reset.""" diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 8a6492775..4677281e3 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -99,8 +99,9 @@ def __init__( self._batch_wait_limit_seconds: int | None = target.config.get( "batch_wait_limit_seconds", ) - self._batch_dynamic_management: bool | None = target.config.get( + self._batch_dynamic_management: bool = target.config.get( "batch_dynamic_management", + False, ) self._sink_timer: BatchPerfTimer | None = None @@ -146,7 +147,7 @@ def is_full(self) -> bool: """ return ( self.current_size >= self.sink_timer.sink_max_size - if self.batch_dynamic_management + if self.batch_dynamic_management and self.sink_timer is not None else self.current_size >= self.max_size ) @@ -161,6 +162,7 @@ def is_too_old(self) -> bool: ( self.sink_timer.on_the_clock() > self.batch_wait_limit_seconds if self.sink_timer.start_time is not None + and self.batch_wait_limit_seconds is not None else False ) if self.sink_timer is not None From 0436a748f9fe2f0951c9f5c84cc3221d16c091ac Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 22 Aug 2023 11:52:25 -0700 Subject: [PATCH 47/67] added defualt for when only batch management is set --- singer_sdk/sinks/core.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 4677281e3..6ac307112 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -46,6 +46,7 @@ class Sink(metaclass=abc.ABCMeta): logger: Logger MAX_SIZE_DEFAULT = 10000 + WAIT_LIMIT_SECONDS_DEFAULT = 30 def __init__( self, @@ -106,10 +107,12 @@ def __init__( self._sink_timer: BatchPerfTimer | None = None - if self._batch_wait_limit_seconds is not None: + if self._batch_wait_limit_seconds is not None or self._batch_dynamic_management: self._sink_timer = BatchPerfTimer( self._batch_size_rows, - self._batch_wait_limit_seconds, + self.WAIT_LIMIT_SECONDS_DEFAULT + if self._batch_wait_limit_seconds is None + else self._batch_wait_limit_seconds, ) self._sink_timer.start() From b92d8c1f4081a5cc4be559cce531d4a24ca8f891 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Wed, 23 Aug 2023 10:52:58 -0700 Subject: [PATCH 48/67] add about_info test for batch_dynamic_management --- tests/core/test_target_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index e1f27b69e..1f0ea9e43 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -79,6 +79,7 @@ def test_target_about_info(): assert "add_record_metadata" in about.settings["properties"] assert "batch_size_rows" in about.settings["properties"] assert "batch_wait_limit_seconds" in about.settings["properties"] + assert "batch_dynamic_management" in about.settings["properties"] def test_sql_get_sink(): From f78cc0a1ed47b2d4d67c08898cebb57b48922fc0 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Wed, 23 Aug 2023 10:58:35 -0700 Subject: [PATCH 49/67] added batch_dynamic_management to capabilities --- singer_sdk/helpers/capabilities.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/singer_sdk/helpers/capabilities.py b/singer_sdk/helpers/capabilities.py index ef218ebdf..3f0b68009 100644 --- a/singer_sdk/helpers/capabilities.py +++ b/singer_sdk/helpers/capabilities.py @@ -120,6 +120,16 @@ description="Maximum time to elapse for a batch to fill, drain, and load.", ), ).to_dict() +BATCH_DYNAMIC_MANAGEMENT_CONFIG = PropertiesList( + Property( + "batch_dynamic_management", + BooleanType, + description=( + "Manages sink_max_size per stream to be" + "the largest size for the batch wait limit." + ), + ), +).to_dict() class DeprecatedEnum(Enum): From ec57105b04502fc02298a7b6fc894737022d0528 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Wed, 23 Aug 2023 10:59:50 -0700 Subject: [PATCH 50/67] add batch_dynamic_management to config if missing --- singer_sdk/target_base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/singer_sdk/target_base.py b/singer_sdk/target_base.py index f1693c3ea..4f75a8540 100644 --- a/singer_sdk/target_base.py +++ b/singer_sdk/target_base.py @@ -19,6 +19,7 @@ from singer_sdk.helpers.capabilities import ( ADD_RECORD_METADATA_CONFIG, BATCH_CONFIG, + BATCH_DYNAMIC_MANAGEMENT_CONFIG, BATCH_SIZE_ROWS_CONFIG, BATCH_WAIT_LIMIT_SECONDS_CONFIG, TARGET_SCHEMA_CONFIG, @@ -610,6 +611,7 @@ def _merge_missing(source_jsonschema: dict, target_jsonschema: dict) -> None: _merge_missing(ADD_RECORD_METADATA_CONFIG, config_jsonschema) _merge_missing(BATCH_SIZE_ROWS_CONFIG, config_jsonschema) _merge_missing(BATCH_WAIT_LIMIT_SECONDS_CONFIG, config_jsonschema) + _merge_missing(BATCH_DYNAMIC_MANAGEMENT_CONFIG, config_jsonschema) capabilities = cls.capabilities From 68a520ca9be5833d88c7fbd157d521b632774781 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 24 Aug 2023 14:32:47 -0700 Subject: [PATCH 51/67] add and or update tests for updated is_full utilizing _drain_function --- tests/core/test_target_base.py | 114 ++++++++++++++++++++++++++++++--- 1 file changed, 105 insertions(+), 9 deletions(-) diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index 1f0ea9e43..a3fba486b 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -175,17 +175,13 @@ def test_batch_size_rows_and_max_size(): key_properties=key_properties, ) assert sink_default.stream_name == "foo" - assert sink_default._batch_size_rows == 10000 - assert sink_default.batch_size_rows == 10000 + assert sink_default._batch_size_rows is None + assert sink_default.batch_size_rows is None assert sink_default.max_size == 10000 assert sink_set.stream_name == "bar" assert sink_set._batch_size_rows == 100000 assert sink_set.batch_size_rows == 100000 assert sink_set.max_size == 100000 - sink_set.batch_size_rows = 99999 - assert sink_set._batch_size_rows == 99999 - assert sink_set.batch_size_rows == 99999 - assert sink_set.max_size == 99999 def test_batch_wait_limit_seconds(): @@ -212,11 +208,11 @@ def test_batch_wait_limit_seconds(): assert sink_set.batch_wait_limit_seconds == 1 assert sink_set.sink_timer is not None assert sink_set.sink_timer.start_time is not None - assert sink_set.batch_size_rows == 10000 + assert sink_set.batch_size_rows is None assert sink_set.max_size == 10000 - assert sink_set.is_too_old is False + assert sink_set.is_too_old() is False time.sleep(1.1) - assert sink_set.is_too_old is True + assert sink_set.is_too_old() is True assert sink_set.sink_timer.start_time > 1.1 assert sink_set.sink_timer.stop_time is None assert sink_set.sink_timer.lap_time is None @@ -277,3 +273,103 @@ def test_batch_dynamic_management(): assert sink_set.sink_timer.lap_time > 1.0 assert sink_set.sink_timer.perf_diff < sink_set.sink_timer.perf_diff_allowed_min assert sink_set.sink_timer.sink_max_size == 190 + + +@pytest.mark.parametrize( + "config,rows,wait_time,expected", + [ + pytest.param( + {}, + 10001, + None, + True, + id="default_config_true", + ), + pytest.param( + {}, + 100, + None, + False, + id="default_config_false", + ), + pytest.param( + {"batch_size_rows": 100000}, + 100001, + None, + True, + id="batch_size_rows_only_true", + ), + pytest.param( + {"batch_size_rows": 100000}, + 1, + None, + False, + id="batch_size_rows_only_false", + ), + pytest.param( + {"batch_wait_limit_seconds": 1}, + None, + 2, + True, + id="wait_limit_seconds_only_true", + ), + pytest.param( + {"batch_wait_limit_seconds": 2}, + None, + 1, + False, + id="wait_limit_seconds_only_false", + ), + pytest.param( + {"batch_size_rows": 100000, "batch_wait_limit_seconds": 2}, + 100001, + 1, + True, + id="batch_rows_and_wait_limit_seconds_true_rows", + ), + pytest.param( + {"batch_size_rows": 100000, "batch_wait_limit_seconds": 1}, + 100, + 2, + True, + id="batch_rows_and_wait_limit_seconds_true_wait", + ), + pytest.param( + {"batch_size_rows": 100000, "batch_wait_limit_seconds": 2}, + 100, + 1, + False, + id="batch_rows_and_wait_limit_seconds_false", + ), + ], +) +def test_is_full( + config: str, + rows: int | None, + wait_time: int | None, + expected: bool, +): + input_schema_1 = { + "properties": { + "id": { + "type": ["string", "null"], + }, + "col_ts": { + "format": "date-time", + "type": ["string", "null"], + }, + }, + } + key_properties = [] + target = TargetMock(config=config) + sink = BatchSinkMock( + target=target, + stream_name="foo", + schema=input_schema_1, + key_properties=key_properties, + ) + if rows is not None: + sink._batch_records_read = rows + if wait_time is not None: + time.sleep(wait_time) + assert sink.is_full == expected From b36acf84c9787d8fdc5066df1eb18d64665a4b5c Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 24 Aug 2023 14:46:51 -0700 Subject: [PATCH 52/67] update is_full to utilize _drain_function and set_drain_function and expanded is_full_x functions --- singer_sdk/sinks/core.py | 73 ++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 6ac307112..cac8cb98f 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -8,6 +8,7 @@ import json import time import typing as t +from functools import cached_property from gzip import GzipFile from gzip import open as gzip_open from types import MappingProxyType @@ -48,6 +49,8 @@ class Sink(metaclass=abc.ABCMeta): MAX_SIZE_DEFAULT = 10000 WAIT_LIMIT_SECONDS_DEFAULT = 30 + _drain_function: t.Callable | None = None + def __init__( self, target: Target, @@ -93,9 +96,8 @@ def __init__( self._batch_dupe_records_merged: int = 0 # Batch Performace Timer - self._batch_size_rows: int = target.config.get( + self._batch_size_rows: int | None = target.config.get( "batch_size_rows", - self.MAX_SIZE_DEFAULT, ) self._batch_wait_limit_seconds: int | None = target.config.get( "batch_wait_limit_seconds", @@ -143,18 +145,24 @@ def current_size(self) -> int: @property def is_full(self) -> bool: - """Check against size limit. + """Calls the size limit check funtion. Returns: True if the sink needs to be drained. """ - return ( - self.current_size >= self.sink_timer.sink_max_size - if self.batch_dynamic_management and self.sink_timer is not None - else self.current_size >= self.max_size - ) + if self._drain_function is None: + self.set_drain_function() + + return self._drain_function() + + def is_full_rows(self) -> bool: + """Check against limit in rows. + + Returns: + True if the sink needs to be drained. + """ + return self.current_size >= self.max_size - @property def is_too_old(self) -> bool: """Check against limit in seconds. @@ -172,8 +180,36 @@ def is_too_old(self) -> bool: else False ) + def is_full_rows_and_too_old(self) -> bool: + """Check against limit in rows and seconds. + + Returns: + True if the sink needs to be drained. + """ + return True in (self.is_full_rows(), self.is_too_old()) + + def is_full_dynamic(self) -> bool: + """Check against limit in caclulated limit in rows. + + Returns: + True if the sink needs to be drained. + """ + return self.current_size >= self.sink_timer.sink_max_size + + def set_drain_function(self) -> None: + """Return the function to use in is_full.""" + if self.batch_dynamic_management: + self._drain_function = self.is_full_dynamic + elif self.batch_wait_limit_seconds is not None: + if self.batch_size_rows is not None: + self._drain_function = self.is_full_rows_and_too_old + else: + self._drain_function = self.is_too_old + else: + self._drain_function = self.is_full_rows + @property - def batch_size_rows(self) -> int: + def batch_size_rows(self) -> int | None: """Get batch_size_rows object. Returns: @@ -181,15 +217,6 @@ def batch_size_rows(self) -> int: """ return self._batch_size_rows - @batch_size_rows.setter - def batch_size_rows(self, new_value: int) -> None: - """Set batch_size_rows object to the given value. - - Args: - new_value: The value you want to set batch_size_rows too. - """ - self._batch_size_rows = new_value - @property def batch_wait_limit_seconds(self) -> int | None: """Get batch_wait_limit_seconds object. @@ -217,14 +244,18 @@ def sink_timer(self) -> BatchPerfTimer | None: """ return self._sink_timer - @property + @cached_property def max_size(self) -> int: """Get max batch size. Returns: Max number of records to batch before `is_full=True` """ - return self.batch_size_rows + return ( + self.batch_size_rows + if self.batch_size_rows is not None + else self.MAX_SIZE_DEFAULT + ) # Tally methods From 933fa21d96d565947df2a1565ec54b5593d37b65 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 24 Aug 2023 14:48:26 -0700 Subject: [PATCH 53/67] remove is_too_old check from _process_record_message and update is_full logging message --- singer_sdk/target_base.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/singer_sdk/target_base.py b/singer_sdk/target_base.py index 4f75a8540..2594cf639 100644 --- a/singer_sdk/target_base.py +++ b/singer_sdk/target_base.py @@ -354,14 +354,7 @@ def _process_record_message(self, message_dict: dict) -> None: if sink.is_full: self.logger.info( - "Target sink for '%s' is full. Draining...", - sink.stream_name, - ) - self.drain_one(sink) - - if sink.is_too_old: - self.logger.info( - "Target sink for '%s' is to old. Current size is '%s'. Draining...", + "Target sink for '%s' is full. Current size is '%s'. Draining...", sink.stream_name, sink.current_size, ) From c342e44ed8288f3a5930149a4b34d7fd72d0f1f6 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 24 Aug 2023 15:13:58 -0700 Subject: [PATCH 54/67] mypy fixes --- singer_sdk/sinks/core.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index cac8cb98f..fdf7a1a9c 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -49,7 +49,7 @@ class Sink(metaclass=abc.ABCMeta): MAX_SIZE_DEFAULT = 10000 WAIT_LIMIT_SECONDS_DEFAULT = 30 - _drain_function: t.Callable | None = None + _drain_function: t.Callable[[], bool] | None = None def __init__( self, @@ -111,7 +111,9 @@ def __init__( if self._batch_wait_limit_seconds is not None or self._batch_dynamic_management: self._sink_timer = BatchPerfTimer( - self._batch_size_rows, + self._batch_size_rows + if self._batch_size_rows is not None + else self.MAX_SIZE_DEFAULT, self.WAIT_LIMIT_SECONDS_DEFAULT if self._batch_wait_limit_seconds is None else self._batch_wait_limit_seconds, @@ -153,7 +155,7 @@ def is_full(self) -> bool: if self._drain_function is None: self.set_drain_function() - return self._drain_function() + return self._drain_function() if self._drain_function is not None else True def is_full_rows(self) -> bool: """Check against limit in rows. @@ -194,7 +196,11 @@ def is_full_dynamic(self) -> bool: Returns: True if the sink needs to be drained. """ - return self.current_size >= self.sink_timer.sink_max_size + return ( + self.current_size >= self.sink_timer.sink_max_size + if self.sink_timer is not None + else False + ) def set_drain_function(self) -> None: """Return the function to use in is_full.""" From 9ce87da26e50a33629155f2f09c6e5ffd4c7a2ce Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 24 Aug 2023 15:48:56 -0700 Subject: [PATCH 55/67] update docs implementation index to include target_batch_full --- docs/implementation/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/implementation/index.md b/docs/implementation/index.md index 06a2cec9a..361c61934 100644 --- a/docs/implementation/index.md +++ b/docs/implementation/index.md @@ -13,6 +13,7 @@ metrics logging state at_least_once +target_batch_full ``` ## How to use the implementation reference material From b2697ee8d31ca1cda57694f331a846fa8eb53371 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Mon, 28 Aug 2023 13:50:32 -0700 Subject: [PATCH 56/67] added documentation for updated is_full and associated new functions --- docs/implementation/target_batch_full.md | 54 ++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 docs/implementation/target_batch_full.md diff --git a/docs/implementation/target_batch_full.md b/docs/implementation/target_batch_full.md new file mode 100644 index 000000000..ef878c288 --- /dev/null +++ b/docs/implementation/target_batch_full.md @@ -0,0 +1,54 @@ +# Target: Batch Drain + +The SDK automatically handles when to send a batch of records to the target. + +## The Basics + +A Tap sends records messages to the Target. Those messages are grabbed, decoded, and sent to the Target method `_process_record_message`. After each record is processed one question is aksed. + +1. Is the batch full, `Sink.is_full` + +If the answer is `True` then the records currently held in the `Sink._pending_batch` dict are drained. The drain process is managed by the `Target.drain_one` method. Records get written, counters get reset, and on to filling the next batch with records. + +## How a Batch is Measured and How Is Full Defined + +You need to know three things to determine if somthing is full + +1. The unit of measure +2. The level at which an object is determined full +3. The current mesurment of the object + +The units of measure used with `Sink._pending_batch` are `rows` and `time` in `seconds`. We gather the full level marks from the Meltano Target configuation options of `batch_size_rows`, `batch_wait_limit_seconds`. If the configuration options are not present we fall back to the defined constants `MAX_SIZE_DEFAULT` and `WAIT_LIMIT_SECONDS_DEFAULT`. How do we meaure `Sink._pending_batch`? To meausre the `rows` the count of records read from the Tap is used. This is held in the internal variable `Sink._batch_records_read` that is aviable via the property `Sink.current_size`. To measure `seconds` a batch timer is utlized. + +There are four full senarios and each one has a function that looks at the batch and returns `True` if it is full or `False` if the batch can take more records. + +1. `Rows` limit has been reached. `Sink.is_full_rows` +2. `Wait` limit is `seconds` has been reached. `Sink.is_too_old` +3. `Rows` limit or `Wait` limit is seconds has been reached. `Sink.is_full_rows_and_too_old` +4. `Rows` limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` + +Based on the Meltano Target configuration options given the function `set_drain_function` places the appropriate function into the internal variable`_batch_drain_fucntion`. This variable is what is run when the attribute `Sink.is_full` is called at the end of each Target method `_process_record_message` cycle. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` Then the message "Target sink for 'stream_name' is full. Draining..." is logged to the console and `Target.drain_one()` is called. + +## `Rows` limit has been reached. `Sink.is_full_rows` + +To know if something is full you need to know how much it currently holds and at what point to consider it full. Sinks have the property `current_size` which gives you the number of records read and placed into the pending batch. The `current_size` gives us how much the current batch is holding. Now that we know the `current_size` we need to determing if the current size is considered full. The Sink property of `max_size` gives us the integer that defines the full mark. The property `max_size` is returns the Sink internal variable `_batch_size_rows` if not `None` or the `DEFAULT_MAX_SIZE` constant which is `10000`. + +Both the `Sink.current_size` and `Sink.max_size` are used to calculate the `bool` value returned by the property `Sink.is_full`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` Then the message "Target sink for 'stream_name' is full. Current size is 'current_size'Draining..." is logged to the console and `Target.drain_one()` is called. + +## `Wait` limit is `seconds` has been reached. `Sink.is_too_old` + +To know if something is too old you need to know how much time has passed and how much time needs to pass to be considered old. When the Meltano Target configuration option of `batch_wait_limit_seconds` is present and set the internal variable `_sink_timer` is initialized with an instance of a `BatchPerfTimer`. The `BatchPerfTimer` Class is a batch specific stop watch. The timer can be accessable via the property `sink_timer`. Right after the timer is initalized the stop watch is started `sink_timer.start()`. We can see how much time has passed by running `sink_timer.on_the_clock()`. The property `Sink.batch_wait_limit_seconds` hold how much time needs to pass to be considered old. + +Both the `sink_timer.on_the_clock()` and `Sink.batch_wait_limit_seconds` are used to calcualte the bool value returned by the function`Sink.is_too_old`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message ""Target sink for 'stream_name' is to full. Current size is 'current_size'. Draining..."" is logged to the console and `Target.drain_one(Sink)` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calcuates the lap time, and starts the timer again. + +The timer is an instance of `BatchPerfTimer` which is a batch specific stop watch. When `batch_wait_limits_seconds` is reached on `Sink.sink_timer.on_the_clock()` + +## `Rows` limit or `Wait` limit is seconds has been reached. `Sink.is_full_rows_and_too_old` + +The previously described `is_full_rows` and `is_too_old` functions are run and their results are held in a tuple. If `True` is present in the tuple the function returns `True` so `is_full` will return `Ture`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` Then the message "Target sink for 'stream_name' is full. Current size is 'current_size' Draining..." is logged to the console and `Target.drain_one()` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calcuates the lap time, and starts the timer again. + +## `Rows` limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` + +When the Meltano Target configuration option `batch_dynamic_management` is set to `True` you are asking the `Sink.sink_timer` to find the maxiumum `rows` limit that keeps the time to fill a batch with records and write those records to the Target's target within the `time` in `seconds` given. + +The `Sink.sink_timer` is passed the given `batch_size_rows` or the `DEFAULT_MAX_SIZE` constant which is `10000` if it is `None` and is passed the given `batch_wait_limit_seconds` if present or the `WAIT_LIMIT_SECONDS_DEFAULT` constant which is `30` if it is `None`. Interally the `rows` passed turns into `Sink.sink_timer.SINK_MAX_SIZE_CEILING` which is the max size a batch can reach. The `time` in `seconds` passed turns into `Sink.sink_timer.max_perf_counter` which is the time in seconds a full cycle should take. The attribute `Sink.sink_timer.sink_max_size` starts at a predefined size of `100`. During the `Target.drain_one(Sink)` process `Sink._lap_manager` is called and the timer method `counter_based_max_size` runs and checks if `Sink.sink_timer.perf_diff`, which is the `max_perf_counter` - `lap_time`, is greater than `Sink.sink_timer.perf_diff_allowed_max` or less than `Sink.sink_timer.perf_diff_allowed_min`. If `Sink.sink_timer.perf_diff` is greater than `Sink.sink_timer.perf_diff_allowed_max` the `Sink.sink_timer.sink_max_size` is increased as long as the `Sink.sink_timer.sink_max_size` is less than `Sink.sink_timer.SINK_MAX_SIZE_CEILING`. If `Sink.sink_timer.perf_diff` is less than `Sink.sink_timer.perf_diff_allowed_min` the `Sink.sink_timer.sink_max_size` is reduced. If the `Sink.sink_timer.perf_diff` is between `Sink.sink_timer.perf_diff_allowed_max` and `Sink.sink_timer.perf_diff_allowed_min` no correction to `Sink.sink_timer.sink_max_size` is made since the optimal maximum rows has been reached. This process is repeated when each `Sink` is initalized and starts processing records. \ No newline at end of file From f36a79f4e41e8af089a297468052c60d650214a8 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 29 Aug 2023 09:07:15 -0700 Subject: [PATCH 57/67] Merge branch '1626-configurable-batch-size-and-max-wait-limit-for-targets' of https://github.com/BuzzCutNorman/sdk into 1626-configurable-batch-size-and-max-wait-limit-for-targets --- singer_sdk/exceptions.py | 4 ++++ singer_sdk/streams/core.py | 8 ++++++++ tests/core/test_streams.py | 21 +++++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/singer_sdk/exceptions.py b/singer_sdk/exceptions.py index 23325aa2a..351776291 100644 --- a/singer_sdk/exceptions.py +++ b/singer_sdk/exceptions.py @@ -17,6 +17,10 @@ class FatalAPIError(Exception): """Exception raised when a failed request should not be considered retriable.""" +class InvalidReplicationKeyException(Exception): + """Exception to raise if the replication key is not in the stream properties.""" + + class InvalidStreamSortException(Exception): """Exception to raise if sorting errors are found while syncing the records.""" diff --git a/singer_sdk/streams/core.py b/singer_sdk/streams/core.py index 53e9533ff..4c3adb225 100644 --- a/singer_sdk/streams/core.py +++ b/singer_sdk/streams/core.py @@ -19,6 +19,7 @@ from singer_sdk.exceptions import ( AbortedSyncFailedException, AbortedSyncPausedException, + InvalidReplicationKeyException, InvalidStreamSortException, MaxRecordsLimitException, ) @@ -211,10 +212,17 @@ def is_timestamp_replication_key(self) -> bool: Returns: True if the stream uses a timestamp-based replication key. + + Raises: + InvalidReplicationKeyException: If the schema does not contain the + replication key. """ if not self.replication_key: return False type_dict = self.schema.get("properties", {}).get(self.replication_key) + if type_dict is None: + msg = f"Field '{self.replication_key}' is not in schema for stream '{self.name}'" # noqa: E501 + raise InvalidReplicationKeyException(msg) return is_datetime_type(type_dict) def get_starting_replication_key_value( diff --git a/tests/core/test_streams.py b/tests/core/test_streams.py index 34bbc7514..a3a451086 100644 --- a/tests/core/test_streams.py +++ b/tests/core/test_streams.py @@ -10,6 +10,9 @@ import requests from singer_sdk._singerlib import Catalog, MetadataMapping +from singer_sdk.exceptions import ( + InvalidReplicationKeyException, +) from singer_sdk.helpers._classproperty import classproperty from singer_sdk.helpers.jsonpath import _compile_jsonpath, extract_jsonpath from singer_sdk.pagination import first @@ -275,6 +278,24 @@ def test_stream_starting_timestamp( assert get_starting_value(None) == expected_starting_value +def test_stream_invalid_replication_key(tap: SimpleTestTap): + """Validate an exception is raised if replication_key not in schema.""" + + class InvalidReplicationKeyStream(SimpleTestStream): + replication_key = "INVALID" + + stream = InvalidReplicationKeyStream(tap) + + with pytest.raises( + InvalidReplicationKeyException, + match=( + f"Field '{stream.replication_key}' is not in schema for stream " + f"'{stream.name}'" + ), + ): + _check = stream.is_timestamp_replication_key + + @pytest.mark.parametrize( "path,content,result", [ From 2952513c01606b18c68d0e8c7dbf6f852fc2d89f Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 29 Aug 2023 09:10:57 -0700 Subject: [PATCH 58/67] edits to target batch full documentation --- docs/implementation/target_batch_full.md | 35 ++++++++++++------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/docs/implementation/target_batch_full.md b/docs/implementation/target_batch_full.md index ef878c288..deccfe5c7 100644 --- a/docs/implementation/target_batch_full.md +++ b/docs/implementation/target_batch_full.md @@ -1,6 +1,6 @@ # Target: Batch Drain -The SDK automatically handles when to send a batch of records to the target. +The SDK automatically handles creating and releasing properly sized batches. ## The Basics @@ -10,7 +10,7 @@ A Tap sends records messages to the Target. Those messages are grabbed, decoded If the answer is `True` then the records currently held in the `Sink._pending_batch` dict are drained. The drain process is managed by the `Target.drain_one` method. Records get written, counters get reset, and on to filling the next batch with records. -## How a Batch is Measured and How Is Full Defined +## How a Batch is Measured and How is Full Defined You need to know three things to determine if somthing is full @@ -18,37 +18,36 @@ You need to know three things to determine if somthing is full 2. The level at which an object is determined full 3. The current mesurment of the object -The units of measure used with `Sink._pending_batch` are `rows` and `time` in `seconds`. We gather the full level marks from the Meltano Target configuation options of `batch_size_rows`, `batch_wait_limit_seconds`. If the configuration options are not present we fall back to the defined constants `MAX_SIZE_DEFAULT` and `WAIT_LIMIT_SECONDS_DEFAULT`. How do we meaure `Sink._pending_batch`? To meausre the `rows` the count of records read from the Tap is used. This is held in the internal variable `Sink._batch_records_read` that is aviable via the property `Sink.current_size`. To measure `seconds` a batch timer is utlized. +The units of measure used with `Sink._pending_batch` are `rows` and `time` in `seconds`. The full level marks come from the Meltano Target configuration options of `batch_size_rows`, `batch_wait_limit_seconds`. If the configuration options are not present, the defined constants of `MAX_SIZE_DEFAULT` and `WAIT_LIMIT_SECONDS_DEFAULT` are used. How do we measure `Sink._pending_batch`? To measure the `rows` the count of records read from the Tap is used. This is held in the internal variable `Sink._batch_records_read` that is available via the property `Sink.current_size`. To measure `seconds` a batch timer is utilized. -There are four full senarios and each one has a function that looks at the batch and returns `True` if it is full or `False` if the batch can take more records. +There are four “is full” scenarios and each one has a function that looks at the batch and returns `True` if it is full or `False` if the batch can take more records. 1. `Rows` limit has been reached. `Sink.is_full_rows` 2. `Wait` limit is `seconds` has been reached. `Sink.is_too_old` 3. `Rows` limit or `Wait` limit is seconds has been reached. `Sink.is_full_rows_and_too_old` 4. `Rows` limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` -Based on the Meltano Target configuration options given the function `set_drain_function` places the appropriate function into the internal variable`_batch_drain_fucntion`. This variable is what is run when the attribute `Sink.is_full` is called at the end of each Target method `_process_record_message` cycle. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` Then the message "Target sink for 'stream_name' is full. Draining..." is logged to the console and `Target.drain_one()` is called. +Based on the Meltano Target configuration options given the function `set_drain_function` places the appropriate function into the internal variable`_batch_drain_fucntion`. This variable is what is run when the attribute `Sink.is_full` is called at the end of each Target method `_process_record_message` cycle. When `Sink.is_full` is checked and returns `True` the message "Target sink for 'stream_name' is full. Draining..." is logged to the console and `Target.drain_one(Sink)` is called. -## `Rows` limit has been reached. `Sink.is_full_rows` +## Explanation of the Four "is full" Scenarios +### `Rows` limit has been reached. `Sink.is_full_rows` -To know if something is full you need to know how much it currently holds and at what point to consider it full. Sinks have the property `current_size` which gives you the number of records read and placed into the pending batch. The `current_size` gives us how much the current batch is holding. Now that we know the `current_size` we need to determing if the current size is considered full. The Sink property of `max_size` gives us the integer that defines the full mark. The property `max_size` is returns the Sink internal variable `_batch_size_rows` if not `None` or the `DEFAULT_MAX_SIZE` constant which is `10000`. +To know if something is full you need to know how much it currently holds and at what point to consider it full. Sinks have the property `current_size` which gives you the number of records read and placed into the pending batch. The `current_size` gives us how much the current batch is holding. Now that we know the `current_size` we need to determine if the current size is considered full. The Sink property of `max_size` gives us the integer that defines the full mark. The property `max_size` returns the Sink’s internal variable `_batch_size_rows` if not `None` or the `DEFAULT_MAX_SIZE` constant which is `10000`. -Both the `Sink.current_size` and `Sink.max_size` are used to calculate the `bool` value returned by the property `Sink.is_full`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` Then the message "Target sink for 'stream_name' is full. Current size is 'current_size'Draining..." is logged to the console and `Target.drain_one()` is called. +Both the `Sink.current_size` and `Sink.max_size` are used to calculate the `bool` value returned by the property `Sink.is_full`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message "Target sink for 'stream_name' is full. Current size is 'current_size'Draining..." is logged to the console and `Target.drain_one(Sink)` is called. -## `Wait` limit is `seconds` has been reached. `Sink.is_too_old` +### `Wait` limit is `seconds` has been reached. `Sink.is_too_old` -To know if something is too old you need to know how much time has passed and how much time needs to pass to be considered old. When the Meltano Target configuration option of `batch_wait_limit_seconds` is present and set the internal variable `_sink_timer` is initialized with an instance of a `BatchPerfTimer`. The `BatchPerfTimer` Class is a batch specific stop watch. The timer can be accessable via the property `sink_timer`. Right after the timer is initalized the stop watch is started `sink_timer.start()`. We can see how much time has passed by running `sink_timer.on_the_clock()`. The property `Sink.batch_wait_limit_seconds` hold how much time needs to pass to be considered old. +To know if something is too old you need to know how much time has passed and how much time needs to pass to be considered old. When the Meltano Target configuration option of `batch_wait_limit_seconds` is present and set the internal variable `_sink_timer` is initialized with an instance of a `BatchPerfTimer`. The `BatchPerfTimer` Class is a batch specific stop watch. The timer is accessible via the property `Sink.sink_timer`. Right after the timer is initialized the stop watch is started `Sink.sink_timer.start()`. We can see how much time has passed by running `Sink.sink_timer.on_the_clock()`. The property `Sink.batch_wait_limit_seconds` hold how much time needs to pass to be considered old. -Both the `sink_timer.on_the_clock()` and `Sink.batch_wait_limit_seconds` are used to calcualte the bool value returned by the function`Sink.is_too_old`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message ""Target sink for 'stream_name' is to full. Current size is 'current_size'. Draining..."" is logged to the console and `Target.drain_one(Sink)` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calcuates the lap time, and starts the timer again. +Both the `Sink.sink_timer.on_the_clock()` and `Sink.batch_wait_limit_seconds` are used to calculate the `bool` value returned by the function`Sink.is_too_old`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message ""Target sink for 'stream_name' is to full. Current size is 'current_size'. Draining..."" is logged to the console and `Target.drain_one(Sink)` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calculates the lap time, and starts the timer again. -The timer is an instance of `BatchPerfTimer` which is a batch specific stop watch. When `batch_wait_limits_seconds` is reached on `Sink.sink_timer.on_the_clock()` +### `Rows` limit or `Wait` limit is seconds has been reached. `Sink.is_full_rows_and_too_old` -## `Rows` limit or `Wait` limit is seconds has been reached. `Sink.is_full_rows_and_too_old` +The previously described `is_full_rows` and `is_too_old` functions are run and their results are held in a tuple. If `True` is present in the tuple the function returns `True` so `is_full` will return `True`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` Then the message "Target sink for 'stream_name' is full. Current size is 'current_size' Draining..." is logged to the console and `Target.drain_one()` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calculates the lap time, and starts the timer again. -The previously described `is_full_rows` and `is_too_old` functions are run and their results are held in a tuple. If `True` is present in the tuple the function returns `True` so `is_full` will return `Ture`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` Then the message "Target sink for 'stream_name' is full. Current size is 'current_size' Draining..." is logged to the console and `Target.drain_one()` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calcuates the lap time, and starts the timer again. +### `Rows` limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` -## `Rows` limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` +When the Meltano Target configuration option `batch_dynamic_management` is set to `True` you are asking the `Sink.sink_timer` to find the maximum `rows` is full mark that keeps the time to fill a batch with records and write those records to the Target's target within the `time` in `seconds` given. -When the Meltano Target configuration option `batch_dynamic_management` is set to `True` you are asking the `Sink.sink_timer` to find the maxiumum `rows` limit that keeps the time to fill a batch with records and write those records to the Target's target within the `time` in `seconds` given. - -The `Sink.sink_timer` is passed the given `batch_size_rows` or the `DEFAULT_MAX_SIZE` constant which is `10000` if it is `None` and is passed the given `batch_wait_limit_seconds` if present or the `WAIT_LIMIT_SECONDS_DEFAULT` constant which is `30` if it is `None`. Interally the `rows` passed turns into `Sink.sink_timer.SINK_MAX_SIZE_CEILING` which is the max size a batch can reach. The `time` in `seconds` passed turns into `Sink.sink_timer.max_perf_counter` which is the time in seconds a full cycle should take. The attribute `Sink.sink_timer.sink_max_size` starts at a predefined size of `100`. During the `Target.drain_one(Sink)` process `Sink._lap_manager` is called and the timer method `counter_based_max_size` runs and checks if `Sink.sink_timer.perf_diff`, which is the `max_perf_counter` - `lap_time`, is greater than `Sink.sink_timer.perf_diff_allowed_max` or less than `Sink.sink_timer.perf_diff_allowed_min`. If `Sink.sink_timer.perf_diff` is greater than `Sink.sink_timer.perf_diff_allowed_max` the `Sink.sink_timer.sink_max_size` is increased as long as the `Sink.sink_timer.sink_max_size` is less than `Sink.sink_timer.SINK_MAX_SIZE_CEILING`. If `Sink.sink_timer.perf_diff` is less than `Sink.sink_timer.perf_diff_allowed_min` the `Sink.sink_timer.sink_max_size` is reduced. If the `Sink.sink_timer.perf_diff` is between `Sink.sink_timer.perf_diff_allowed_max` and `Sink.sink_timer.perf_diff_allowed_min` no correction to `Sink.sink_timer.sink_max_size` is made since the optimal maximum rows has been reached. This process is repeated when each `Sink` is initalized and starts processing records. \ No newline at end of file +The `Sink.sink_timer` is passed the given `batch_size_rows` or the `DEFAULT_MAX_SIZE` constant which is `10000` if it is `None` and is also passed the given `batch_wait_limit_seconds` if present or the `WAIT_LIMIT_SECONDS_DEFAULT` constant which is `30` if it is `None`. Internally the `rows` passed turns into `Sink.sink_timer.SINK_MAX_SIZE_CEILING` which is the max size a batch can reach. The `time` in `seconds` passed turns into `Sink.sink_timer.max_perf_counter` which is the time in seconds a full cycle should take. The attribute `Sink.sink_timer.sink_max_size` starts at a predefined size of `100`. During the `Target.drain_one(Sink)` process `Sink._lap_manager` is called and the timer method `counter_based_max_size` runs and checks if `Sink.sink_timer.perf_diff`, which is `max_perf_counter` - `lap_time`, is greater than `Sink.sink_timer.perf_diff_allowed_max` or less than `Sink.sink_timer.perf_diff_allowed_min`. If `Sink.sink_timer.perf_diff` is greater than `Sink.sink_timer.perf_diff_allowed_max` the `Sink.sink_timer.sink_max_size` is increased as long as the `Sink.sink_timer.sink_max_size` is less than `Sink.sink_timer.SINK_MAX_SIZE_CEILING`. If `Sink.sink_timer.perf_diff` is less than `Sink.sink_timer.perf_diff_allowed_min` the `Sink.sink_timer.sink_max_size` is reduced. If the `Sink.sink_timer.perf_diff` is between `Sink.sink_timer.perf_diff_allowed_max` and `Sink.sink_timer.perf_diff_allowed_min` no correction to `Sink.sink_timer.sink_max_size` is made since the optimal rows size has been reached. This process is repeated when each `Sink` is initialized and starts processing records. From 879995ab8a84b91f02a5684ccf375243de8ec30b Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 29 Aug 2023 12:03:09 -0700 Subject: [PATCH 59/67] removed cached_property decorator from max_size --- singer_sdk/sinks/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index fdf7a1a9c..102a1e9c7 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -8,7 +8,6 @@ import json import time import typing as t -from functools import cached_property from gzip import GzipFile from gzip import open as gzip_open from types import MappingProxyType @@ -250,7 +249,7 @@ def sink_timer(self) -> BatchPerfTimer | None: """ return self._sink_timer - @cached_property + @property def max_size(self) -> int: """Get max batch size. From 1c81669e527ab96ef48c69f57f36ad0afe5f99f4 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 29 Aug 2023 14:38:03 -0700 Subject: [PATCH 60/67] Edits and formatting changes. --- docs/implementation/target_batch_full.md | 25 ++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/implementation/target_batch_full.md b/docs/implementation/target_batch_full.md index deccfe5c7..48d2da720 100644 --- a/docs/implementation/target_batch_full.md +++ b/docs/implementation/target_batch_full.md @@ -1,4 +1,4 @@ -# Target: Batch Drain +# Target: Batch Full The SDK automatically handles creating and releasing properly sized batches. @@ -10,7 +10,7 @@ A Tap sends records messages to the Target. Those messages are grabbed, decoded If the answer is `True` then the records currently held in the `Sink._pending_batch` dict are drained. The drain process is managed by the `Target.drain_one` method. Records get written, counters get reset, and on to filling the next batch with records. -## How a Batch is Measured and How is Full Defined +## How a Batch is Measured and Full is Defined You need to know three things to determine if somthing is full @@ -18,36 +18,37 @@ You need to know three things to determine if somthing is full 2. The level at which an object is determined full 3. The current mesurment of the object -The units of measure used with `Sink._pending_batch` are `rows` and `time` in `seconds`. The full level marks come from the Meltano Target configuration options of `batch_size_rows`, `batch_wait_limit_seconds`. If the configuration options are not present, the defined constants of `MAX_SIZE_DEFAULT` and `WAIT_LIMIT_SECONDS_DEFAULT` are used. How do we measure `Sink._pending_batch`? To measure the `rows` the count of records read from the Tap is used. This is held in the internal variable `Sink._batch_records_read` that is available via the property `Sink.current_size`. To measure `seconds` a batch timer is utilized. +The units of measure used with `Sink._pending_batch` are **rows** and **time** in **seconds**. The full level marks come from the Meltano Target configuration options of `batch_size_rows`, `batch_wait_limit_seconds`. If the configuration options are not present, the defined constants of `MAX_SIZE_DEFAULT` and `WAIT_LIMIT_SECONDS_DEFAULT` are used. How do we measure `Sink._pending_batch`? To measure the **rows** the count of records read from the Tap is used. This is held in the internal variable `Sink._batch_records_read` that is available via the property `Sink.current_size`. To measure **seconds** a batch timer is utilized. There are four “is full” scenarios and each one has a function that looks at the batch and returns `True` if it is full or `False` if the batch can take more records. -1. `Rows` limit has been reached. `Sink.is_full_rows` -2. `Wait` limit is `seconds` has been reached. `Sink.is_too_old` -3. `Rows` limit or `Wait` limit is seconds has been reached. `Sink.is_full_rows_and_too_old` -4. `Rows` limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` +1. Rows limit has been reached. `Sink.is_full_rows` +2. Wait limit is seconds has been reached. `Sink.is_too_old` +3. Rows limit or Wait limit is seconds has been reached. `Sink.is_full_rows_and_too_old` +4. Rows limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` Based on the Meltano Target configuration options given the function `set_drain_function` places the appropriate function into the internal variable`_batch_drain_fucntion`. This variable is what is run when the attribute `Sink.is_full` is called at the end of each Target method `_process_record_message` cycle. When `Sink.is_full` is checked and returns `True` the message "Target sink for 'stream_name' is full. Draining..." is logged to the console and `Target.drain_one(Sink)` is called. ## Explanation of the Four "is full" Scenarios -### `Rows` limit has been reached. `Sink.is_full_rows` + +### Rows limit has been reached. `Sink.is_full_rows` To know if something is full you need to know how much it currently holds and at what point to consider it full. Sinks have the property `current_size` which gives you the number of records read and placed into the pending batch. The `current_size` gives us how much the current batch is holding. Now that we know the `current_size` we need to determine if the current size is considered full. The Sink property of `max_size` gives us the integer that defines the full mark. The property `max_size` returns the Sink’s internal variable `_batch_size_rows` if not `None` or the `DEFAULT_MAX_SIZE` constant which is `10000`. Both the `Sink.current_size` and `Sink.max_size` are used to calculate the `bool` value returned by the property `Sink.is_full`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message "Target sink for 'stream_name' is full. Current size is 'current_size'Draining..." is logged to the console and `Target.drain_one(Sink)` is called. -### `Wait` limit is `seconds` has been reached. `Sink.is_too_old` +### Wait limit in seconds has been reached. `Sink.is_too_old` To know if something is too old you need to know how much time has passed and how much time needs to pass to be considered old. When the Meltano Target configuration option of `batch_wait_limit_seconds` is present and set the internal variable `_sink_timer` is initialized with an instance of a `BatchPerfTimer`. The `BatchPerfTimer` Class is a batch specific stop watch. The timer is accessible via the property `Sink.sink_timer`. Right after the timer is initialized the stop watch is started `Sink.sink_timer.start()`. We can see how much time has passed by running `Sink.sink_timer.on_the_clock()`. The property `Sink.batch_wait_limit_seconds` hold how much time needs to pass to be considered old. Both the `Sink.sink_timer.on_the_clock()` and `Sink.batch_wait_limit_seconds` are used to calculate the `bool` value returned by the function`Sink.is_too_old`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message ""Target sink for 'stream_name' is to full. Current size is 'current_size'. Draining..."" is logged to the console and `Target.drain_one(Sink)` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calculates the lap time, and starts the timer again. -### `Rows` limit or `Wait` limit is seconds has been reached. `Sink.is_full_rows_and_too_old` +### Rows or Wait limit has been reached. `Sink.is_full_rows_and_too_old` The previously described `is_full_rows` and `is_too_old` functions are run and their results are held in a tuple. If `True` is present in the tuple the function returns `True` so `is_full` will return `True`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` Then the message "Target sink for 'stream_name' is full. Current size is 'current_size' Draining..." is logged to the console and `Target.drain_one()` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calculates the lap time, and starts the timer again. -### `Rows` limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` +### Rows limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` -When the Meltano Target configuration option `batch_dynamic_management` is set to `True` you are asking the `Sink.sink_timer` to find the maximum `rows` is full mark that keeps the time to fill a batch with records and write those records to the Target's target within the `time` in `seconds` given. +When the Meltano Target configuration option `batch_dynamic_management` is set to `True` you are asking the `Sink.sink_timer` to find the maximum rows is full mark that keeps the time to fill a batch with records and write those records to the Target's target within the time in seconds given. The `Sink.sink_timer` is passed the given `batch_size_rows` or the `DEFAULT_MAX_SIZE` constant which is `10000` if it is `None` and is also passed the given `batch_wait_limit_seconds` if present or the `WAIT_LIMIT_SECONDS_DEFAULT` constant which is `30` if it is `None`. Internally the `rows` passed turns into `Sink.sink_timer.SINK_MAX_SIZE_CEILING` which is the max size a batch can reach. The `time` in `seconds` passed turns into `Sink.sink_timer.max_perf_counter` which is the time in seconds a full cycle should take. The attribute `Sink.sink_timer.sink_max_size` starts at a predefined size of `100`. During the `Target.drain_one(Sink)` process `Sink._lap_manager` is called and the timer method `counter_based_max_size` runs and checks if `Sink.sink_timer.perf_diff`, which is `max_perf_counter` - `lap_time`, is greater than `Sink.sink_timer.perf_diff_allowed_max` or less than `Sink.sink_timer.perf_diff_allowed_min`. If `Sink.sink_timer.perf_diff` is greater than `Sink.sink_timer.perf_diff_allowed_max` the `Sink.sink_timer.sink_max_size` is increased as long as the `Sink.sink_timer.sink_max_size` is less than `Sink.sink_timer.SINK_MAX_SIZE_CEILING`. If `Sink.sink_timer.perf_diff` is less than `Sink.sink_timer.perf_diff_allowed_min` the `Sink.sink_timer.sink_max_size` is reduced. If the `Sink.sink_timer.perf_diff` is between `Sink.sink_timer.perf_diff_allowed_max` and `Sink.sink_timer.perf_diff_allowed_min` no correction to `Sink.sink_timer.sink_max_size` is made since the optimal rows size has been reached. This process is repeated when each `Sink` is initialized and starts processing records. From c804b768a21ff371687a22c23e2a540616a0ffad Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 5 Sep 2023 08:48:50 -0700 Subject: [PATCH 61/67] further edits and formatting --- docs/implementation/target_batch_full.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/implementation/target_batch_full.md b/docs/implementation/target_batch_full.md index 48d2da720..0dfcaa0e3 100644 --- a/docs/implementation/target_batch_full.md +++ b/docs/implementation/target_batch_full.md @@ -18,16 +18,20 @@ You need to know three things to determine if somthing is full 2. The level at which an object is determined full 3. The current mesurment of the object -The units of measure used with `Sink._pending_batch` are **rows** and **time** in **seconds**. The full level marks come from the Meltano Target configuration options of `batch_size_rows`, `batch_wait_limit_seconds`. If the configuration options are not present, the defined constants of `MAX_SIZE_DEFAULT` and `WAIT_LIMIT_SECONDS_DEFAULT` are used. How do we measure `Sink._pending_batch`? To measure the **rows** the count of records read from the Tap is used. This is held in the internal variable `Sink._batch_records_read` that is available via the property `Sink.current_size`. To measure **seconds** a batch timer is utilized. +The units of measure used with `Sink._pending_batch` are **rows** and **time** in **seconds**. The full level marks come from the Meltano Target configuration options of `batch_size_rows`, `batch_wait_limit_seconds`. If the configuration options are not present, the defined constants of `MAX_SIZE_DEFAULT` and `WAIT_LIMIT_SECONDS_DEFAULT` are used. How is `Sink._pending_batch` measured? To measure **rows** the count of records read from the Tap is used. The current row count is available via the property `Sink.current_size`. To measure **seconds** a batch timer is utilized. There are four “is full” scenarios and each one has a function that looks at the batch and returns `True` if it is full or `False` if the batch can take more records. -1. Rows limit has been reached. `Sink.is_full_rows` +1. Row limit has been reached. `Sink.is_full_rows` 2. Wait limit is seconds has been reached. `Sink.is_too_old` -3. Rows limit or Wait limit is seconds has been reached. `Sink.is_full_rows_and_too_old` -4. Rows limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` +3. Row limit or Wait limit is seconds has been reached. `Sink.is_full_rows_and_too_old` +4. Row limit managed by the batch timer has been reached. `Sink.is_full_dynamic` -Based on the Meltano Target configuration options given the function `set_drain_function` places the appropriate function into the internal variable`_batch_drain_fucntion`. This variable is what is run when the attribute `Sink.is_full` is called at the end of each Target method `_process_record_message` cycle. When `Sink.is_full` is checked and returns `True` the message "Target sink for 'stream_name' is full. Draining..." is logged to the console and `Target.drain_one(Sink)` is called. +Based on the Meltano Target configuration option(s) given the function `set_drain_function` places the appropriate function into the internal variable`_batch_drain_fucntion`. This variable is what is run when the attribute `Sink.is_full` is called at the end of each Target method `_process_record_message` cycle. When `Sink.is_full` is checked and returns `True` the message: +``` +"Target sink for [Stream Name] is full. Current size is [Current Size]. Draining...", +``` +is logged to the console and `Target.drain_one(Sink)` is called. ## Explanation of the Four "is full" Scenarios @@ -35,13 +39,13 @@ Based on the Meltano Target configuration options given the function `set_drain_ To know if something is full you need to know how much it currently holds and at what point to consider it full. Sinks have the property `current_size` which gives you the number of records read and placed into the pending batch. The `current_size` gives us how much the current batch is holding. Now that we know the `current_size` we need to determine if the current size is considered full. The Sink property of `max_size` gives us the integer that defines the full mark. The property `max_size` returns the Sink’s internal variable `_batch_size_rows` if not `None` or the `DEFAULT_MAX_SIZE` constant which is `10000`. -Both the `Sink.current_size` and `Sink.max_size` are used to calculate the `bool` value returned by the property `Sink.is_full`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message "Target sink for 'stream_name' is full. Current size is 'current_size'Draining..." is logged to the console and `Target.drain_one(Sink)` is called. +Both the `Sink.current_size` and `Sink.max_size` are used to calculate the `bool` value returned by the property `Sink.is_full`. If `Sink.current_size` is greater than or equal to `Sink.max_size` the batch is full and `Sink.is_full` would return `True`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message "Target sink for 'stream_name' is full. Current size is 'current_size'Draining..." is logged to the console and `Target.drain_one(Sink)` is called. ### Wait limit in seconds has been reached. `Sink.is_too_old` To know if something is too old you need to know how much time has passed and how much time needs to pass to be considered old. When the Meltano Target configuration option of `batch_wait_limit_seconds` is present and set the internal variable `_sink_timer` is initialized with an instance of a `BatchPerfTimer`. The `BatchPerfTimer` Class is a batch specific stop watch. The timer is accessible via the property `Sink.sink_timer`. Right after the timer is initialized the stop watch is started `Sink.sink_timer.start()`. We can see how much time has passed by running `Sink.sink_timer.on_the_clock()`. The property `Sink.batch_wait_limit_seconds` hold how much time needs to pass to be considered old. -Both the `Sink.sink_timer.on_the_clock()` and `Sink.batch_wait_limit_seconds` are used to calculate the `bool` value returned by the function`Sink.is_too_old`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message ""Target sink for 'stream_name' is to full. Current size is 'current_size'. Draining..."" is logged to the console and `Target.drain_one(Sink)` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calculates the lap time, and starts the timer again. +Both the `Sink.sink_timer.on_the_clock()` and `Sink.batch_wait_limit_seconds` are used to calculate the `bool` value returned by the function`Sink.is_too_old`. If `Sink.sink_timer.on_the_clock()` is greater than or equal to `Sink.batch_wait_limit_seconds` the batch is full and `Sink.is_full` would return `True`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message ""Target sink for 'stream_name' is to full. Current size is 'current_size'. Draining..."" is logged to the console and `Target.drain_one(Sink)` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calculates the lap time, and starts the timer again. ### Rows or Wait limit has been reached. `Sink.is_full_rows_and_too_old` From ffabffab71f4c2d6ad56d07e795daf062359c2f0 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 5 Sep 2023 17:33:47 -0600 Subject: [PATCH 62/67] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Edgar Ramírez Mondragón --- docs/implementation/target_batch_full.md | 31 ++++++++++++------------ 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/docs/implementation/target_batch_full.md b/docs/implementation/target_batch_full.md index 0dfcaa0e3..8b096c3f7 100644 --- a/docs/implementation/target_batch_full.md +++ b/docs/implementation/target_batch_full.md @@ -4,50 +4,51 @@ The SDK automatically handles creating and releasing properly sized batches. ## The Basics -A Tap sends records messages to the Target. Those messages are grabbed, decoded, and sent to the Target method `_process_record_message`. After each record is processed one question is aksed. +A Tap sends records messages to the Target. Those messages are deserialized and sent to the Target method `_process_record_message`. After each record is processed, one question is asked: -1. Is the batch full, `Sink.is_full` +> Is the batch full? (see [`Sink.is_full`](singer_sdk.Sink.is_full)) -If the answer is `True` then the records currently held in the `Sink._pending_batch` dict are drained. The drain process is managed by the `Target.drain_one` method. Records get written, counters get reset, and on to filling the next batch with records. +If the answer is `True`, then the records currently held in the `Sink._pending_batch` dict are drained. The drain process is managed by the `Target.drain_one` method. Records get written, counters get reset, and on to filling the next batch with records. -## How a Batch is Measured and Full is Defined +## How a Batch is Measured and "Full" is Defined -You need to know three things to determine if somthing is full +You need to know three things to determine if something is full 1. The unit of measure 2. The level at which an object is determined full -3. The current mesurment of the object +3. The current measurement of the object -The units of measure used with `Sink._pending_batch` are **rows** and **time** in **seconds**. The full level marks come from the Meltano Target configuration options of `batch_size_rows`, `batch_wait_limit_seconds`. If the configuration options are not present, the defined constants of `MAX_SIZE_DEFAULT` and `WAIT_LIMIT_SECONDS_DEFAULT` are used. How is `Sink._pending_batch` measured? To measure **rows** the count of records read from the Tap is used. The current row count is available via the property `Sink.current_size`. To measure **seconds** a batch timer is utilized. +The units of measure used with `Sink._pending_batch` are **rows** and **time** in **seconds**. The full level marks come from the Meltano Target configuration options of `batch_size_rows`, `batch_wait_limit_seconds`. If the configuration options are not present, the defined constants of `MAX_SIZE_DEFAULT` and `WAIT_LIMIT_SECONDS_DEFAULT` are used. How is `Sink._pending_batch` measured? To measure **rows**, the count of records read from the Tap is used. The current row count is available via the property `Sink.current_size`. To measure **seconds**, a batch timer is utilized. -There are four “is full” scenarios and each one has a function that looks at the batch and returns `True` if it is full or `False` if the batch can take more records. +There are four “is full” scenarios and each one has a function that looks at the current batch and returns `True` if it is full or `False` if the batch can take more records. 1. Row limit has been reached. `Sink.is_full_rows` 2. Wait limit is seconds has been reached. `Sink.is_too_old` 3. Row limit or Wait limit is seconds has been reached. `Sink.is_full_rows_and_too_old` 4. Row limit managed by the batch timer has been reached. `Sink.is_full_dynamic` -Based on the Meltano Target configuration option(s) given the function `set_drain_function` places the appropriate function into the internal variable`_batch_drain_fucntion`. This variable is what is run when the attribute `Sink.is_full` is called at the end of each Target method `_process_record_message` cycle. When `Sink.is_full` is checked and returns `True` the message: +Based on the Meltano Target configuration option(s), `Sink.set_drain_function` sets the `_drain_function` attribute to the appropriate method. This method is then called in `Sink.is_full` at the end of each Target method `_process_record_message` cycle. When `Sink.is_full` is checked and returns `True` the message: ``` "Target sink for [Stream Name] is full. Current size is [Current Size]. Draining...", ``` + is logged to the console and `Target.drain_one(Sink)` is called. ## Explanation of the Four "is full" Scenarios -### Rows limit has been reached. `Sink.is_full_rows` +### Rows limit has been reached (`Sink.is_full_rows`) To know if something is full you need to know how much it currently holds and at what point to consider it full. Sinks have the property `current_size` which gives you the number of records read and placed into the pending batch. The `current_size` gives us how much the current batch is holding. Now that we know the `current_size` we need to determine if the current size is considered full. The Sink property of `max_size` gives us the integer that defines the full mark. The property `max_size` returns the Sink’s internal variable `_batch_size_rows` if not `None` or the `DEFAULT_MAX_SIZE` constant which is `10000`. -Both the `Sink.current_size` and `Sink.max_size` are used to calculate the `bool` value returned by the property `Sink.is_full`. If `Sink.current_size` is greater than or equal to `Sink.max_size` the batch is full and `Sink.is_full` would return `True`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message "Target sink for 'stream_name' is full. Current size is 'current_size'Draining..." is logged to the console and `Target.drain_one(Sink)` is called. +Both the `Sink.current_size` and `Sink.max_size` are used to calculate the `bool` value returned by the property `Sink.is_full`. If `Sink.current_size` is greater than or equal to `Sink.max_size` the batch is full and `Sink.is_full` would return `True`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message "Target sink for 'stream_name' is full. A log message is emitted to stderr and `Target.drain_one(Sink)` is called. -### Wait limit in seconds has been reached. `Sink.is_too_old` +### Wait limit in seconds has been reached (`Sink.is_too_old`) -To know if something is too old you need to know how much time has passed and how much time needs to pass to be considered old. When the Meltano Target configuration option of `batch_wait_limit_seconds` is present and set the internal variable `_sink_timer` is initialized with an instance of a `BatchPerfTimer`. The `BatchPerfTimer` Class is a batch specific stop watch. The timer is accessible via the property `Sink.sink_timer`. Right after the timer is initialized the stop watch is started `Sink.sink_timer.start()`. We can see how much time has passed by running `Sink.sink_timer.on_the_clock()`. The property `Sink.batch_wait_limit_seconds` hold how much time needs to pass to be considered old. +To know if something is too old you need to know how much time has passed and how much time needs to pass to be considered old. When the Meltano Target configuration option of `batch_wait_limit_seconds` is present and set, the internal variable `_sink_timer` is initialized with an instance of the `BatchPerfTimer` class. The `BatchPerfTimer` class is a batch-specific stopwatch. The timer is accessible via the property [`Sink.sink_timer`](singer_sdk.Sink.sink_timer). Right after the timer is initialized, the stopwatch is started with `Sink.sink_timer.start()`. We can see how much time has passed by running `Sink.sink_timer.on_the_clock()`. The property `Sink.batch_wait_limit_seconds` indicates how much time needs to pass for a batch to be considered old. -Both the `Sink.sink_timer.on_the_clock()` and `Sink.batch_wait_limit_seconds` are used to calculate the `bool` value returned by the function`Sink.is_too_old`. If `Sink.sink_timer.on_the_clock()` is greater than or equal to `Sink.batch_wait_limit_seconds` the batch is full and `Sink.is_full` would return `True`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` the message ""Target sink for 'stream_name' is to full. Current size is 'current_size'. Draining..."" is logged to the console and `Target.drain_one(Sink)` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calculates the lap time, and starts the timer again. +Both the `Sink.sink_timer.on_the_clock()` method and the `Sink.batch_wait_limit_seconds` attribute are used to calculate the boolean value returned by [`Sink.is_too_old`](singer_sdk.Sink.is_too_old). If `Sink.sink_timer.on_the_clock()` is greater than or equal to `Sink.batch_wait_limit_seconds`, the batch is considered full and `Sink.is_full` would return `True`. Again, a log message is emitted to stderr and `Target.drain_one(Sink)` is called. The [`Target.drain_one(Sink)`](singer_sdk.Target.drain_one) method calls `Sink._lap_manager`, which stops the timer, calculates the lap time, and restarts the timer. -### Rows or Wait limit has been reached. `Sink.is_full_rows_and_too_old` +### Rows or Wait limit has been reached (`Sink.is_full_rows_and_too_old`) The previously described `is_full_rows` and `is_too_old` functions are run and their results are held in a tuple. If `True` is present in the tuple the function returns `True` so `is_full` will return `True`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` Then the message "Target sink for 'stream_name' is full. Current size is 'current_size' Draining..." is logged to the console and `Target.drain_one()` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calculates the lap time, and starts the timer again. From 50e82ae0c9420aa3c1275fcf1e07b05579171882 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Wed, 6 Sep 2023 08:01:23 -0600 Subject: [PATCH 63/67] Apply suggestion from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Edgar Ramírez Mondragón --- docs/implementation/target_batch_full.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/implementation/target_batch_full.md b/docs/implementation/target_batch_full.md index 8b096c3f7..09501c0d6 100644 --- a/docs/implementation/target_batch_full.md +++ b/docs/implementation/target_batch_full.md @@ -50,7 +50,7 @@ Both the `Sink.sink_timer.on_the_clock()` method and the `Sink.batch_wait_limit_ ### Rows or Wait limit has been reached (`Sink.is_full_rows_and_too_old`) -The previously described `is_full_rows` and `is_too_old` functions are run and their results are held in a tuple. If `True` is present in the tuple the function returns `True` so `is_full` will return `True`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` Then the message "Target sink for 'stream_name' is full. Current size is 'current_size' Draining..." is logged to the console and `Target.drain_one()` is called. The `Target.drain_one(Sink)` method calls `Sink._lap_manager` which stops the timer, calculates the lap time, and starts the timer again. +The previously described `is_full_rows` and `is_too_old` methods are run and their results are held in a tuple. If `True` is present in the tuple the function returns `True` so `is_full` will return `True`. When `Sink.is_full` is checked at the end of `Target._process_record_message` and returns `True` a log messages is emitted to stderr and [`Target.drain_one(Sink)`](singer_sdk.Target.drain_one) is called. The [`Target.drain_one(Sink)`](singer_sdk.Target.drain_one) method calls `Sink._lap_manager` which stops the timer, calculates the lap time, and restarts the timer. ### Rows limit managed by `Sink.sink_timer.counter_based_max_size` has been reached. `Sink.is_full_dynamic` From 7c4756c7fc9f62f7753a145aecd788c1a3bf3967 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 8 Dec 2023 10:15:57 -0800 Subject: [PATCH 64/67] added return type annotations to start_time, stop_time, and lap_time --- singer_sdk/helpers/_perftimer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index b9294af70..b8b120341 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -17,15 +17,15 @@ class PerfTimer: _lap_time: float | None = None @property - def start_time(self): + def start_time(self) -> float | None: return self._start_time @property - def stop_time(self): + def stop_time(self) -> float | None: return self._stop_time @property - def lap_time(self): + def lap_time(self) -> float | None: return self._lap_time def start(self) -> None: From e50f1a2dfd8b5c597439fb368209cb91bd55ae04 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Fri, 8 Dec 2023 11:02:39 -0800 Subject: [PATCH 65/67] resolve mypy unsupported operand types error --- singer_sdk/helpers/_perftimer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singer_sdk/helpers/_perftimer.py b/singer_sdk/helpers/_perftimer.py index b8b120341..4ff795d68 100644 --- a/singer_sdk/helpers/_perftimer.py +++ b/singer_sdk/helpers/_perftimer.py @@ -100,7 +100,7 @@ def perf_diff_allowed_max(self) -> float: @property def perf_diff(self) -> float: """Difference between wanted elapsed time and actual elapsed time.""" - diff = self.max_perf_counter - self.lap_time if self._lap_time else 0 + diff = self.max_perf_counter - self.lap_time if self.lap_time else 0 return float(diff) def counter_based_max_size(self) -> None: # noqa: C901 From 5c1b5eabab0f68f5214ef5c4f8c0ead113d10907 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Tue, 9 Jan 2024 13:56:08 -0800 Subject: [PATCH 66/67] change private Sink._lap_manager() to public Sink.lap_manager() --- singer_sdk/sinks/core.py | 2 +- singer_sdk/target_base.py | 2 +- tests/core/test_target_base.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/singer_sdk/sinks/core.py b/singer_sdk/sinks/core.py index 940b75f56..cb85c7466 100644 --- a/singer_sdk/sinks/core.py +++ b/singer_sdk/sinks/core.py @@ -362,7 +362,7 @@ def key_properties(self) -> t.Sequence[str]: # Timer Management - def _lap_manager(self) -> None: + def lap_manager(self) -> None: """Start and Stop the Perf Time during drain. This method is called when the target triggers a drain on this sink. diff --git a/singer_sdk/target_base.py b/singer_sdk/target_base.py index fd4b69eda..62f3a4865 100644 --- a/singer_sdk/target_base.py +++ b/singer_sdk/target_base.py @@ -501,7 +501,7 @@ def drain_one(self, sink: Sink) -> None: draining_status = sink.start_drain() sink.process_batch(draining_status) - sink._lap_manager() + sink.lap_manager() sink.mark_drained() def _drain_all(self, sink_list: list[Sink], parallelism: int) -> None: diff --git a/tests/core/test_target_base.py b/tests/core/test_target_base.py index a3fba486b..5292a4b1b 100644 --- a/tests/core/test_target_base.py +++ b/tests/core/test_target_base.py @@ -217,7 +217,7 @@ def test_batch_wait_limit_seconds(): assert sink_set.sink_timer.stop_time is None assert sink_set.sink_timer.lap_time is None assert sink_set.sink_timer.perf_diff == 0.0 - sink_set._lap_manager() + sink_set.lap_manager() assert sink_set.sink_timer.start_time > 0.0 assert sink_set.sink_timer.stop_time is None assert sink_set.sink_timer.lap_time > 1.0 @@ -259,7 +259,7 @@ def test_batch_dynamic_management(): assert sink_set.max_size == 100000 assert sink_set.sink_timer.sink_max_size == 100 time.sleep(1.1) - sink_set._lap_manager() + sink_set.lap_manager() assert sink_set.sink_timer.start_time > 0.0 assert sink_set.sink_timer.stop_time is None assert sink_set.sink_timer.lap_time > 1.0 @@ -267,7 +267,7 @@ def test_batch_dynamic_management(): assert sink_set.sink_timer.perf_diff > sink_set.sink_timer.perf_diff_allowed_max assert sink_set.sink_timer.sink_max_size == 200 time.sleep(3.1) - sink_set._lap_manager() + sink_set.lap_manager() assert sink_set.sink_timer.start_time > 0.0 assert sink_set.sink_timer.stop_time is None assert sink_set.sink_timer.lap_time > 1.0 From ab2782de4671b64a84d9f3b6a1de648f18979524 Mon Sep 17 00:00:00 2001 From: Dan Norman Date: Thu, 11 Jan 2024 16:15:31 -0800 Subject: [PATCH 67/67] added settings to default_settings in test_target_about_info --- tests/core/targets/test_target_sql.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/core/targets/test_target_sql.py b/tests/core/targets/test_target_sql.py index fd71c0aeb..7a5911b2f 100644 --- a/tests/core/targets/test_target_sql.py +++ b/tests/core/targets/test_target_sql.py @@ -46,5 +46,11 @@ class MyTarget(SQLTargetMock, capabilities=capabilities): pass about = MyTarget._get_about_info() - default_settings = {"add_record_metadata", "load_method"} + default_settings = { + "add_record_metadata", + "load_method", + "batch_dynamic_management", + "batch_size_rows", + "batch_wait_limit_seconds", + } assert set(about.settings["properties"]) == expected_settings | default_settings