Merge pull request #70 from Bartman0/feature/csv-params

added set of sensible options to configure CSV reading in more detail…
MeltanoLabs · Oct 12, 2022 · 92872ee · 92872ee
2 parents eaf47ef + 1340aaa
commit 92872ee
Show file tree

Hide file tree

Showing 6 changed files with 53 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -130,3 +130,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
diff --git a/README.md b/README.md
@@ -28,19 +28,29 @@ The `config.json` contains an array called `files` that consists of dictionary o
 * `keys`: The names of the columns that constitute the unique keys for that entity
 * `encoding`: [Optional] The file encoding to use when reading the file (i.e. "latin1", "UTF-8"). Use this setting when you get a `UnicodeDecodeError` error.
 
+The following optional entries are passed through to an internal CSV dialect, which is then used to configure the CSV reader:
+* `delimiter`: A one-character string used to separate fields. It defaults to ','.
+* `doublequote`: Controls how instances of quotechar appearing inside a field should themselves be quoted. When True, the character is doubled. When False, the escapechar is used as a prefix to the quotechar. It defaults to True.
+* `escapechar`: A one-character string used by the reader, where the escapechar removes any special meaning from the following character. It defaults to None, which disables escaping.
+* `quotechar`: A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '"'.
+* `skipinitialspace`: When True, spaces immediately following the delimiter are ignored. The default is False.
+* `strict`: When True, raise a `csv.Error` exception on bad CSV input. The default is False.
+
 Example:
 
 ```json
 {
 	"files":	[ 	
 					{	"entity" : "leads",
 						"path" : "/path/to/leads.csv",
-						"keys" : ["Id"]
+						"keys" : ["Id"],
+						"delimiter": ";"
 					},
 					{	"entity" : "opportunities",
 						"path" : "/path/to/opportunities.csv",
 						"keys" : ["Id"],
-						"encoding" : "latin1"
+						"encoding" : "latin1",
+						"skipinitialspace": true
 					}
 				]
 }

diff --git a/meltano.yml b/meltano.yml
@@ -19,7 +19,7 @@ plugins:
         keys:
         - col1
     settings:
-      - description: Array of objects containing keys: `entity`, `file`, `keys`,  and `encoding` (Optional)
+      - description: Array of objects containing keys - `entity`, `path`, `keys`, `encoding` (Optional), `delimiter` (Optional), `doublequote` (Optional), `escapechar` (Optional), `quotechar` (Optional), `skipinitialspace` (Optional), `strict` (Optional)
         kind: array
         name: files
       - description: Project-relative path to JSON file holding array of objects with

diff --git a/tap_csv/client.py b/tap_csv/client.py
@@ -81,8 +81,16 @@ def is_valid_filename(self, file_path: str) -> bool:
     def get_rows(self, file_path: str) -> Iterable[list]:
         """Return a generator of the rows in a particular CSV file."""
         encoding = self.file_config.get("encoding", None)
+        csv.register_dialect("tap_dialect", 
+            delimiter=self.file_config.get("delimiter", ","),
+            doublequote=self.file_config.get("doublequote", True),
+            escapechar=self.file_config.get("escapechar", None),
+            quotechar=self.file_config.get("quotechar", '"'),
+            skipinitialspace=self.file_config.get("skipinitialspace", False),
+            strict=self.file_config.get("strict", False)
+        )
         with open(file_path, "r", encoding=encoding) as f:
-            reader = csv.reader(f)
+            reader = csv.reader(f, dialect="tap_dialect")
             for row in reader:
                 yield row
 

diff --git a/tap_csv/tap.py b/tap_csv/tap.py
@@ -25,6 +25,12 @@ class TapCSV(Tap):
                     th.Property("path", th.StringType, required=True),
                     th.Property("keys", th.ArrayType(th.StringType), required=True),
                     th.Property("encoding", th.StringType, required=False),
+                    th.Property("delimiter", th.StringType, required=False),
+                    th.Property("doublequote", th.BooleanType, required=False),
+                    th.Property("escapechar", th.StringType, required=False),
+                    th.Property("quotechar", th.StringType, required=False),
+                    th.Property("skipinitialspace", th.BooleanType, required=False),
+                    th.Property("strict", th.BooleanType, required=False),
                 )
             ),
             description="An array of csv file stream settings.",

diff --git a/tap_csv/tests/test_core.py b/tap_csv/tests/test_core.py
@@ -42,3 +42,27 @@ def test_standard_tap_tests_encoding():
     tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG)
     for test in tests:
         test()
+
+
+# Run standard built-in tap tests from the SDK, with different CSV dialect settings:
+def test_standard_tap_tests_csv_dialect():
+    """Run standard built-in tap tests from the SDK, with different CSV dialect settings."""
+    test_data_dir = os.path.dirname(os.path.abspath(__file__))
+    SAMPLE_CONFIG = {
+        "files": [
+            {
+                "entity": "test",
+                "path": f"{test_data_dir}/data/alphabet_encoding.csv",
+                "keys": [],
+                "delimiter": ",",
+                "doublequote": True,
+                "escapechar": "^",
+                "quotechar": "\"",
+                "skipinitialspace": True,
+                "strict": True
+            }
+        ]
+    }
+    tests = get_standard_tap_tests(TapCSV, config=SAMPLE_CONFIG)
+    for test in tests:
+        test()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -130,3 +130,4 @@ dmypy.json

		# Pyre type checker
		.pyre/