Skip to content

Commit

Permalink
ignore NaNs when building hash
Browse files Browse the repository at this point in the history
  • Loading branch information
hsyyid committed Jan 31, 2025
1 parent c48ff7b commit 2e698f3
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 2 deletions.
11 changes: 10 additions & 1 deletion gluestick/etl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os

import pandas as pd
import numpy as np
from datetime import datetime
from pytz import utc
from gluestick.singer import to_singer
Expand Down Expand Up @@ -256,7 +257,15 @@ def get_row_hash(row, columns):
"""
# ensure stable order
row_str = "".join(row[columns].astype(str).tolist())
values = []

for col in columns:
v = row[col]

if (not isinstance(v, list) and not pd.isna(v)) and v==v and (v not in [None, np.nan]):
values.append(str(v))

row_str = "".join(values)
return hashlib.md5(row_str.encode()).hexdigest()


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="gluestick",
version="2.2.6",
version="2.2.7",
description="ETL utility functions built on Pandas",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down

0 comments on commit 2e698f3

Please sign in to comment.