Skip to content

Commit

Permalink
Merge pull request #70 from Bartman0/feature/csv-params
Browse files Browse the repository at this point in the history
added set of sensible options to configure CSV reading in more detail…
  • Loading branch information
pnadolny13 authored Oct 12, 2022
2 parents eaf47ef + 1340aaa commit 92872ee
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,4 @@ dmypy.json

# Pyre type checker
.pyre/

14 changes: 12 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,29 @@ The `config.json` contains an array called `files` that consists of dictionary o
* `keys`: The names of the columns that constitute the unique keys for that entity
* `encoding`: [Optional] The file encoding to use when reading the file (i.e. "latin1", "UTF-8"). Use this setting when you get a `UnicodeDecodeError` error.

The following entries are passed through to an internal CSV dialect, which is then used to configure the CSV reader:
* `delimiter`: A one-character string used to separate fields. It defaults to ','.
* `doublequote`: Controls how instances of quotechar appearing inside a field should themselves be quoted. When True, the character is doubled. When False, the escapechar is used as a prefix to the quotechar. It defaults to True.
* `escapechar`: A one-character string used by the reader, where the escapechar removes any special meaning from the following character. It defaults to None, which disables escaping.
* `quotechar`: A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '"'.
* `skipinitialspace`: When True, spaces immediately following the delimiter are ignored. The default is False.
* `strict`: When True, a `csv.Error` exception is raised on bad CSV input. The default is False.

Example:

```json
{
"files": [
{ "entity" : "leads",
"path" : "/path/to/leads.csv",
"keys" : ["Id"]
"keys" : ["Id"],
"delimiter": ";"
},
{ "entity" : "opportunities",
"path" : "/path/to/opportunities.csv",
"keys" : ["Id"],
"encoding" : "latin1"
"encoding" : "latin1",
"skipinitialspace": true
}
]
}
Expand Down
2 changes: 1 addition & 1 deletion meltano.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ plugins:
keys:
- col1
settings:
- description: Array of objects containing keys: `entity`, `file`, `keys`, and `encoding` (Optional)
- description: Array of objects containing keys - `entity`, `path`, `keys`, `encoding` (Optional), `delimiter` (Optional), `doublequote` (Optional), `escapechar` (Optional), `quotechar` (Optional), `skipinitialspace` (Optional), `strict` (Optional)
kind: array
name: files
- description: Project-relative path to JSON file holding array of objects with
Expand Down
10 changes: 9 additions & 1 deletion tap_csv/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,16 @@ def is_valid_filename(self, file_path: str) -> bool:
def get_rows(self, file_path: str) -> Iterable[list]:
    """Return a generator of the rows in a particular CSV file.

    The CSV format parameters are read from ``self.file_config`` and handed
    directly to ``csv.reader``; any parameter missing from the config falls
    back to the default listed below.

    Config keys read (with defaults):
        encoding (None), delimiter (","), doublequote (True),
        escapechar (None), quotechar ('"'), skipinitialspace (False),
        strict (False).

    Args:
        file_path: Path of the CSV file to read.

    Yields:
        One list of string fields per CSV record.

    Raises:
        csv.Error: On malformed input when ``strict`` is enabled.
    """
    config = self.file_config
    encoding = config.get("encoding", None)

    # Pass the format parameters straight to the reader rather than
    # registering a named dialect: csv.register_dialect mutates global state
    # in the csv module, so every stream would overwrite the same name.
    format_params = {
        "delimiter": config.get("delimiter", ","),
        "doublequote": config.get("doublequote", True),
        "escapechar": config.get("escapechar", None),
        "quotechar": config.get("quotechar", '"'),
        "skipinitialspace": config.get("skipinitialspace", False),
        "strict": config.get("strict", False),
    }

    # newline="" lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields are returned unmodified.
    with open(file_path, "r", encoding=encoding, newline="") as f:
        yield from csv.reader(f, **format_params)

Expand Down
6 changes: 6 additions & 0 deletions tap_csv/tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ class TapCSV(Tap):
th.Property("path", th.StringType, required=True),
th.Property("keys", th.ArrayType(th.StringType), required=True),
th.Property("encoding", th.StringType, required=False),
th.Property("delimiter", th.StringType, required=False),
th.Property("doublequote", th.BooleanType, required=False),
th.Property("escapechar", th.StringType, required=False),
th.Property("quotechar", th.StringType, required=False),
th.Property("skipinitialspace", th.BooleanType, required=False),
th.Property("strict", th.BooleanType, required=False),
)
),
description="An array of csv file stream settings.",
Expand Down
24 changes: 24 additions & 0 deletions tap_csv/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,27 @@ def test_standard_tap_tests_encoding():
tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG)
for test in tests:
test()


# Run standard built-in tap tests from the SDK, with different CSV dialect settings:
def test_standard_tap_tests_csv_dialect():
    """Exercise the SDK's standard tap tests with explicit CSV dialect options."""
    data_root = os.path.dirname(os.path.abspath(__file__))

    # Explicit values for each CSV dialect setting under test.
    dialect_options = {
        "delimiter": ",",
        "doublequote": True,
        "escapechar": "^",
        "quotechar": "\"",
        "skipinitialspace": True,
        "strict": True,
    }
    file_entry = {
        "entity": "test",
        "path": f"{data_root}/data/alphabet_encoding.csv",
        "keys": [],
        **dialect_options,
    }

    standard_checks = get_standard_tap_tests(TapCSV, config={"files": [file_entry]})
    for run_check in standard_checks:
        run_check()

0 comments on commit 92872ee

Please sign in to comment.