Skip to content

Commit

Permalink
feat: Support integer types other than BIGINT (#485)
Browse files Browse the repository at this point in the history
TODO:

- [x] Docs, specifically updating the `Data Types` section of the
readme, and how to override the schema to force the target into a
certain integer type
- [x] Tests
  • Loading branch information
edgarrmondragon authored Jan 27, 2025
1 parent f77971c commit 98cef12
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 47 deletions.
107 changes: 63 additions & 44 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -238,51 +238,51 @@ develop your own Singer taps and targets.

The below table shows how this tap will map between jsonschema datatypes and Postgres datatypes.

| jsonschema | Postgres |
|--------------------------------|-----------------------------------------|
| integer | bigint |
| UNSUPPORTED | bigserial |
| UNSUPPORTED | bit [ (n) ] |
| UNSUPPORTED | bit varying [ (n) ] |
| boolean | boolean |
| UNSUPPORTED | box |
| jsonschema | Postgres |
| ---------------------------------------------------------------------------------- | --------------------------------------- |
| integer | bigint |
| integer with minimum >= 32768 or maximum < 32768 | smallint |
| integer with minimum >= 2147483648 or maximum < 2147483648 | integer |
| UNSUPPORTED | bigserial |
| UNSUPPORTED | bit [ (n) ] |
| UNSUPPORTED | bit varying [ (n) ] |
| boolean | boolean |
| UNSUPPORTED | box |
| string with contentEncoding="base16" ([opt-in feature](#content-encoding-support)) | bytea |
| UNSUPPORTED | character [ (n) ] |
| UNSUPPORTED | character varying [ (n) ] |
| UNSUPPORTED | cidr |
| UNSUPPORTED | circle |
| string with format="date" | date |
| UNSUPPORTED | double precision |
| UNSUPPORTED | inet |
| UNSUPPORTED | integer |
| UNSUPPORTED | interval [ fields ] [ (p) ] |
| UNSUPPORTED | json |
| array; object | jsonb |
| UNSUPPORTED | line |
| UNSUPPORTED | lseg |
| UNSUPPORTED | macaddr |
| UNSUPPORTED | macaddr8 |
| UNSUPPORTED | money |
| number | numeric [ (p, s) ] |
| UNSUPPORTED | path |
| UNSUPPORTED | pg_lsn |
| UNSUPPORTED | pg_snapshot |
| UNSUPPORTED | point |
| UNSUPPORTED | polygon |
| UNSUPPORTED | real |
| UNSUPPORTED | smallint |
| UNSUPPORTED | smallserial |
| UNSUPPORTED | serial |
| string without format; untyped | text |
| string with format="time" | time [ (p) ] [ without time zone ] |
| UNSUPPORTED | time [ (p) ] with time zone |
| string with format="date-time" | timestamp [ (p) ] [ without time zone ] |
| UNSUPPORTED | timestamp [ (p) ] with time zone |
| UNSUPPORTED | tsquery |
| UNSUPPORTED | tsvector |
| UNSUPPORTED | txid_snapshot |
| string with format="uuid" | uuid |
| UNSUPPORTED | xml |
| UNSUPPORTED | character [ (n) ] |
| UNSUPPORTED | character varying [ (n) ] |
| UNSUPPORTED | cidr |
| UNSUPPORTED | circle |
| string with format="date" | date |
| UNSUPPORTED | double precision |
| UNSUPPORTED | inet |
| UNSUPPORTED | interval [ fields ] [ (p) ] |
| UNSUPPORTED | json |
| array; object | jsonb |
| UNSUPPORTED | line |
| UNSUPPORTED | lseg |
| UNSUPPORTED | macaddr |
| UNSUPPORTED | macaddr8 |
| UNSUPPORTED | money |
| number | numeric [ (p, s) ] |
| UNSUPPORTED | path |
| UNSUPPORTED | pg_lsn |
| UNSUPPORTED | pg_snapshot |
| UNSUPPORTED | point |
| UNSUPPORTED | polygon |
| UNSUPPORTED | real |
| UNSUPPORTED | smallserial |
| UNSUPPORTED | serial |
| string without format; untyped | text |
| string with format="time" | time [ (p) ] [ without time zone ] |
| UNSUPPORTED | time [ (p) ] with time zone |
| string with format="date-time" | timestamp [ (p) ] [ without time zone ] |
| UNSUPPORTED | timestamp [ (p) ] with time zone |
| UNSUPPORTED | tsquery |
| UNSUPPORTED | tsvector |
| UNSUPPORTED | txid_snapshot |
| string with format="uuid" | uuid |
| UNSUPPORTED | xml |

Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array.

Expand All @@ -298,9 +298,28 @@ If a column has multiple jsonschema types, the following order is using to order
- DECIMAL
- BIGINT
- INTEGER
- SMALLINT
- BOOLEAN
- NOTYPE

### Using the Singer catalog to narrow down the Postgres data types

You can use [Singer catalog's schema](https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#schemas) to override the data types coming from the tap. The easiest way to do this is to use Meltano and its [`schema` setting](https://docs.meltano.com/concepts/plugins/#schema-extra) for the tap:

```yaml
# meltano.yml
plugins:
extractors:
- name: tap-my-tap
schema:
some_stream_id:
my_column:
type: integer
# This will be mapped to 'smallint'
minimum: 0
maximum: 1000
```

## Content Encoding Support

Json Schema supports the [`contentEncoding` keyword](https://datatracker.ietf.org/doc/html/rfc4648#section-8), which can be used to specify the encoding of input string types.
Expand Down
24 changes: 22 additions & 2 deletions target_postgres/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import atexit
import io
import itertools
import math
import signal
import sys
import typing as t
Expand All @@ -18,7 +19,14 @@
import sqlalchemy as sa
from singer_sdk import SQLConnector
from singer_sdk.connectors.sql import JSONSchemaToSQL
from sqlalchemy.dialects.postgresql import ARRAY, BIGINT, BYTEA, JSONB, UUID
from sqlalchemy.dialects.postgresql import (
ARRAY,
BIGINT,
BYTEA,
JSONB,
SMALLINT,
UUID,
)
from sqlalchemy.engine import URL
from sqlalchemy.engine.url import make_url
from sqlalchemy.types import (
Expand Down Expand Up @@ -273,6 +281,17 @@ def _handle_array_type(self, jsonschema: dict) -> ARRAY | JSONB:
# Case 3: tuples
return ARRAY(JSONB()) if isinstance(items, list) else JSONB()

def _handle_integer_type(self, jsonschema: dict) -> SMALLINT | INTEGER | BIGINT:
"""Handle integer type."""
minimum = jsonschema.get("minimum", -math.inf)
maximum = jsonschema.get("maximum", math.inf)
if minimum >= -(2**15) and maximum < 2**15:
return SMALLINT()
if minimum >= -(2**31) and maximum < 2**31:
return INTEGER()

return BIGINT()

@cached_property
def jsonschema_to_sql(self) -> JSONSchemaToSQL:
"""Return a JSONSchemaToSQL instance with custom type handling."""
Expand All @@ -281,7 +300,7 @@ def jsonschema_to_sql(self) -> JSONSchemaToSQL:
max_varchar_length=self.max_varchar_length,
)
to_sql.fallback_type = TEXT
to_sql.register_type_handler("integer", BIGINT)
to_sql.register_type_handler("integer", self._handle_integer_type)
to_sql.register_type_handler("object", JSONB)
to_sql.register_type_handler("array", self._handle_array_type)
to_sql.register_format_handler("date-time", TIMESTAMP)
Expand Down Expand Up @@ -386,6 +405,7 @@ def pick_best_sql_type(sql_type_array: list):
DECIMAL,
BIGINT,
INTEGER,
SMALLINT,
BOOLEAN,
NOTYPE,
]
Expand Down
70 changes: 69 additions & 1 deletion target_postgres/tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

import pytest
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import BIGINT, SMALLINT

from target_postgres.connector import NOTYPE, PostgresConnector
from target_postgres.connector import NOTYPE, JSONSchemaToPostgres, PostgresConnector


@pytest.fixture
Expand Down Expand Up @@ -36,3 +37,70 @@ def connector():
def test_type_hierarchy(connector, types, expected):
"""Test that types are merged correctly."""
assert type(connector.merge_sql_types(types)) is expected


class TestJSONSchemaToPostgres:
"""Test JSONSchemaToPostgres class."""

@pytest.fixture
def to_postgres(self, connector: PostgresConnector):
"""Create a JSONSchemaToPostgres instance."""
return connector.jsonschema_to_sql

def test_datetime_string(self, to_postgres: JSONSchemaToPostgres):
"""Test conversion of JSON schema string to Postgres datetime."""
result = to_postgres.to_sql_type({"type": "string", "format": "date-time"})
assert type(result) is sa.TIMESTAMP

@pytest.mark.parametrize(
("jsonschema", "expected"),
[
pytest.param({"type": "integer"}, BIGINT, id="default"),
pytest.param({"type": ["integer", "null"]}, BIGINT, id="default-nullable"),
pytest.param(
{
"type": "integer",
"minimum": 0,
"maximum": 2**15 - 1,
},
SMALLINT,
id="smallint",
),
pytest.param(
{
"type": "integer",
"minimum": -5,
"maximum": 5,
},
SMALLINT,
id="negative-smallint",
),
pytest.param(
{
"type": "integer",
"minimum": 0,
"maximum": 2**31 - 1,
},
sa.INTEGER,
id="integer",
),
pytest.param(
{
"type": "integer",
"minimum": 0,
"maximum": 2**31 + 1,
},
BIGINT,
id="bigint",
),
],
)
def test_integers(
self,
to_postgres: JSONSchemaToPostgres,
jsonschema: dict,
expected: type[sa.types.TypeEngine],
):
"""Test conversion of JSON schema types to Postgres types."""
result = to_postgres.to_sql_type(jsonschema)
assert type(result) is expected

0 comments on commit 98cef12

Please sign in to comment.