From 98cef125bfe87b6a1d843c357a7d24bb5173b132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edgar=20Ram=C3=ADrez=20Mondrag=C3=B3n?= Date: Mon, 27 Jan 2025 17:25:48 -0600 Subject: [PATCH] feat: Support integer types other than `BIGINT` (#485) TODO: - [x] Docs, specifically updating the `Data Types` section of the readme, and how to override the schema to force the target into a certain integer type - [x] Tests --- README.md | 107 ++++++++++++++++------------ target_postgres/connector.py | 24 ++++++- target_postgres/tests/test_types.py | 70 +++++++++++++++++- 3 files changed, 154 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index ec156491..5cbb6e20 100644 --- a/README.md +++ b/README.md @@ -238,51 +238,51 @@ develop your own Singer taps and targets. The below table shows how this tap will map between jsonschema datatypes and Postgres datatypes. -| jsonschema | Postgres | -|--------------------------------|-----------------------------------------| -| integer | bigint | -| UNSUPPORTED | bigserial | -| UNSUPPORTED | bit [ (n) ] | -| UNSUPPORTED | bit varying [ (n) ] | -| boolean | boolean | -| UNSUPPORTED | box | +| jsonschema | Postgres | +| ---------------------------------------------------------------------------------- | --------------------------------------- | +| integer | bigint | +| integer with minimum >= -32768 and maximum < 32768 | smallint | +| integer with minimum >= -2147483648 and maximum < 2147483648 | integer | +| UNSUPPORTED | bigserial | +| UNSUPPORTED | bit [ (n) ] | +| UNSUPPORTED | bit varying [ (n) ] | +| boolean | boolean | +| UNSUPPORTED | box | | string with contentEncoding="base16" ([opt-in feature](#content-encoding-support)) | bytea | -| UNSUPPORTED | character [ (n) ] | -| UNSUPPORTED | character varying [ (n) ] | -| UNSUPPORTED | cidr | -| UNSUPPORTED | circle | -| string with format="date" | date | -| UNSUPPORTED | double precision | -| UNSUPPORTED | inet | -| UNSUPPORTED | integer | -| UNSUPPORTED | interval [ fields ] [ 
(p) ] | -| UNSUPPORTED | json | -| array; object | jsonb | -| UNSUPPORTED | line | -| UNSUPPORTED | lseg | -| UNSUPPORTED | macaddr | -| UNSUPPORTED | macaddr8 | -| UNSUPPORTED | money | -| number | numeric [ (p, s) ] | -| UNSUPPORTED | path | -| UNSUPPORTED | pg_lsn | -| UNSUPPORTED | pg_snapshot | -| UNSUPPORTED | point | -| UNSUPPORTED | polygon | -| UNSUPPORTED | real | -| UNSUPPORTED | smallint | -| UNSUPPORTED | smallserial | -| UNSUPPORTED | serial | -| string without format; untyped | text | -| string with format="time" | time [ (p) ] [ without time zone ] | -| UNSUPPORTED | time [ (p) ] with time zone | -| string with format="date-time" | timestamp [ (p) ] [ without time zone ] | -| UNSUPPORTED | timestamp [ (p) ] with time zone | -| UNSUPPORTED | tsquery | -| UNSUPPORTED | tsvector | -| UNSUPPORTED | txid_snapshot | -| string with format="uuid" | uuid | -| UNSUPPORTED | xml | +| UNSUPPORTED | character [ (n) ] | +| UNSUPPORTED | character varying [ (n) ] | +| UNSUPPORTED | cidr | +| UNSUPPORTED | circle | +| string with format="date" | date | +| UNSUPPORTED | double precision | +| UNSUPPORTED | inet | +| UNSUPPORTED | interval [ fields ] [ (p) ] | +| UNSUPPORTED | json | +| array; object | jsonb | +| UNSUPPORTED | line | +| UNSUPPORTED | lseg | +| UNSUPPORTED | macaddr | +| UNSUPPORTED | macaddr8 | +| UNSUPPORTED | money | +| number | numeric [ (p, s) ] | +| UNSUPPORTED | path | +| UNSUPPORTED | pg_lsn | +| UNSUPPORTED | pg_snapshot | +| UNSUPPORTED | point | +| UNSUPPORTED | polygon | +| UNSUPPORTED | real | +| UNSUPPORTED | smallserial | +| UNSUPPORTED | serial | +| string without format; untyped | text | +| string with format="time" | time [ (p) ] [ without time zone ] | +| UNSUPPORTED | time [ (p) ] with time zone | +| string with format="date-time" | timestamp [ (p) ] [ without time zone ] | +| UNSUPPORTED | timestamp [ (p) ] with time zone | +| UNSUPPORTED | tsquery | +| UNSUPPORTED | tsvector | +| UNSUPPORTED | txid_snapshot | +| string with 
format="uuid" | uuid | +| UNSUPPORTED | xml | Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array. @@ -298,9 +298,28 @@ If a column has multiple jsonschema types, the following order is using to order - DECIMAL - BIGINT - INTEGER +- SMALLINT - BOOLEAN - NOTYPE +### Using the Singer catalog to narrow down the Postgres data types + +You can use [Singer catalog's schema](https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#schemas) to override the data types coming from the tap. The easiest way to do this is to use Meltano and its [`schema` setting](https://docs.meltano.com/concepts/plugins/#schema-extra) for the tap: + +```yaml +# meltano.yml +plugins: + extractors: + - name: tap-my-tap + schema: + some_stream_id: + my_column: + type: integer + # This will be mapped to 'smallint' + minimum: 0 + maximum: 1000 +``` + ## Content Encoding Support Json Schema supports the [`contentEncoding` keyword](https://datatracker.ietf.org/doc/html/rfc4648#section-8), which can be used to specify the encoding of input string types. 
diff --git a/target_postgres/connector.py b/target_postgres/connector.py index ffd64466..228a3f0a 100644 --- a/target_postgres/connector.py +++ b/target_postgres/connector.py @@ -5,6 +5,7 @@ import atexit import io import itertools +import math import signal import sys import typing as t @@ -18,7 +19,14 @@ import sqlalchemy as sa from singer_sdk import SQLConnector from singer_sdk.connectors.sql import JSONSchemaToSQL -from sqlalchemy.dialects.postgresql import ARRAY, BIGINT, BYTEA, JSONB, UUID +from sqlalchemy.dialects.postgresql import ( + ARRAY, + BIGINT, + BYTEA, + JSONB, + SMALLINT, + UUID, +) from sqlalchemy.engine import URL from sqlalchemy.engine.url import make_url from sqlalchemy.types import ( @@ -273,6 +281,17 @@ def _handle_array_type(self, jsonschema: dict) -> ARRAY | JSONB: # Case 3: tuples return ARRAY(JSONB()) if isinstance(items, list) else JSONB() + def _handle_integer_type(self, jsonschema: dict) -> SMALLINT | INTEGER | BIGINT: + """Handle integer type.""" + minimum = jsonschema.get("minimum", -math.inf) + maximum = jsonschema.get("maximum", math.inf) + if minimum >= -(2**15) and maximum < 2**15: + return SMALLINT() + if minimum >= -(2**31) and maximum < 2**31: + return INTEGER() + + return BIGINT() + @cached_property def jsonschema_to_sql(self) -> JSONSchemaToSQL: """Return a JSONSchemaToSQL instance with custom type handling.""" @@ -281,7 +300,7 @@ def jsonschema_to_sql(self) -> JSONSchemaToSQL: max_varchar_length=self.max_varchar_length, ) to_sql.fallback_type = TEXT - to_sql.register_type_handler("integer", BIGINT) + to_sql.register_type_handler("integer", self._handle_integer_type) to_sql.register_type_handler("object", JSONB) to_sql.register_type_handler("array", self._handle_array_type) to_sql.register_format_handler("date-time", TIMESTAMP) @@ -386,6 +405,7 @@ def pick_best_sql_type(sql_type_array: list): DECIMAL, BIGINT, INTEGER, + SMALLINT, BOOLEAN, NOTYPE, ] diff --git a/target_postgres/tests/test_types.py 
b/target_postgres/tests/test_types.py index 8741d5ff..065ca327 100644 --- a/target_postgres/tests/test_types.py +++ b/target_postgres/tests/test_types.py @@ -2,8 +2,9 @@ import pytest import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import BIGINT, SMALLINT -from target_postgres.connector import NOTYPE, PostgresConnector +from target_postgres.connector import NOTYPE, JSONSchemaToPostgres, PostgresConnector @pytest.fixture @@ -36,3 +37,70 @@ def connector(): def test_type_hierarchy(connector, types, expected): """Test that types are merged correctly.""" assert type(connector.merge_sql_types(types)) is expected + + +class TestJSONSchemaToPostgres: + """Test JSONSchemaToPostgres class.""" + + @pytest.fixture + def to_postgres(self, connector: PostgresConnector): + """Create a JSONSchemaToPostgres instance.""" + return connector.jsonschema_to_sql + + def test_datetime_string(self, to_postgres: JSONSchemaToPostgres): + """Test conversion of JSON schema string to Postgres datetime.""" + result = to_postgres.to_sql_type({"type": "string", "format": "date-time"}) + assert type(result) is sa.TIMESTAMP + + @pytest.mark.parametrize( + ("jsonschema", "expected"), + [ + pytest.param({"type": "integer"}, BIGINT, id="default"), + pytest.param({"type": ["integer", "null"]}, BIGINT, id="default-nullable"), + pytest.param( + { + "type": "integer", + "minimum": 0, + "maximum": 2**15 - 1, + }, + SMALLINT, + id="smallint", + ), + pytest.param( + { + "type": "integer", + "minimum": -5, + "maximum": 5, + }, + SMALLINT, + id="negative-smallint", + ), + pytest.param( + { + "type": "integer", + "minimum": 0, + "maximum": 2**31 - 1, + }, + sa.INTEGER, + id="integer", + ), + pytest.param( + { + "type": "integer", + "minimum": 0, + "maximum": 2**31 + 1, + }, + BIGINT, + id="bigint", + ), + ], + ) + def test_integers( + self, + to_postgres: JSONSchemaToPostgres, + jsonschema: dict, + expected: type[sa.types.TypeEngine], + ): + """Test conversion of JSON schema types to Postgres 
types.""" + result = to_postgres.to_sql_type(jsonschema) + assert type(result) is expected