diff --git a/src/config/settings.py b/src/config/settings.py index a01a015..3ea0efb 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -142,19 +142,26 @@ def config_pipeline(dataset, test, update, mode, backfill, engine): def generate_date_series( start_date, end_date, frequency="D", missing_dates=None, chunk_size=100 ): - """ - Generate a sorted list of dates between start and end dates, incorporating missing dates, - partitioned into chunks of specified size. - - Parameters: - start_date (str or datetime): Start date in 'YYYY-MM-DD' format if string - end_date (str or datetime): End date in 'YYYY-MM-DD' format if string - frequency (str): 'D' for daily or 'M' for monthly - missing_dates (list): Optional list of dates to include, in 'YYYY-MM-DD' format if strings - chunk_size (int): Maximum number of dates per partition - - Returns: - list of lists: List of date chunks, where each chunk is a list of datetime.date objects + """Generate a sorted list of dates partitioned into chunks. + + Parameters + ---------- + start_date : str or datetime + Start date in 'YYYY-MM-DD' format if string + end_date : str or datetime + End date in 'YYYY-MM-DD' format if string, or None for single date + frequency : str, default='D' + Date frequency, either 'D' for daily or 'M' for monthly + missing_dates : list, optional + Additional dates to include in the series + chunk_size : int, default=100 + Maximum number of dates per chunk + + Returns + ------- + list of list of datetime.date + List of date chunks, where each chunk contains up to chunk_size dates, + sorted in ascending order with duplicates removed """ if not end_date: dates = [start_date] diff --git a/src/utils/general_utils.py b/src/utils/general_utils.py index 0f5527b..03c2395 100644 --- a/src/utils/general_utils.py +++ b/src/utils/general_utils.py @@ -1,5 +1,5 @@ import re -from datetime import datetime, timedelta +from datetime import datetime from typing import List import pandas as pd @@ -9,37 +9,6 @@ from src.utils.cloud_utils import get_container_client -def split_date_range(start_date, end_date): - """ - Split the date range into yearly chunks if the range is greater than a year. - - Parameters - ---------- - start_date (str): Start date in 'YYYY-MM-DD' format - end_date (str): End date in 'YYYY-MM-DD' format - - Returns - ------- - list of tuples: Each tuple contains the start and end date for a chunk - """ - start = pd.to_datetime(start_date) - end = pd.to_datetime(end_date) - - # If the date range is less than or equal to a year, return it as a single chunk - if (end - start).days <= 365: - return [(start_date, end_date)] - - date_ranges = [] - while start < end: - year_end = min(datetime(start.year, 12, 31), end) - date_ranges.append( - (start.strftime("%Y-%m-%d"), year_end.strftime("%Y-%m-%d")) - ) - start = year_end + timedelta(days=1) - - return date_ranges - - def add_months_to_date(date_string, months): """ Add or subtract a number of months to/from a given date string.