From 8ac81c620527c8066503cd226f7a028c73ccc9ae Mon Sep 17 00:00:00 2001 From: Darek Date: Wed, 16 Oct 2024 15:40:26 -0400 Subject: [PATCH] Setting inferSchemaLength = 0 when null (#285) Setting inferSchemaLength = 0 when null to close #279 --- __tests__/io.test.ts | 10 ++++++++-- polars/io.ts | 11 +++++++++++ polars/lazy/expr/string.ts | 2 +- polars/series/string.ts | 2 +- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/__tests__/io.test.ts b/__tests__/io.test.ts index 7985dfce..14de9db3 100644 --- a/__tests__/io.test.ts +++ b/__tests__/io.test.ts @@ -24,7 +24,7 @@ describe("read:csv", () => { expect(df.shape).toEqual({ height: 27, width: 4 }); }); it("can read from a csv file with inferSchemaLength = 0 option", () => { - const df = pl.readCSV(csvpath, { inferSchemaLength: 0 }); + let df = pl.readCSV(csvpath, { inferSchemaLength: 0 }); const expected = `shape: (1, 4) ┌────────────┬──────────┬────────┬──────────┐ │ category ┆ calories ┆ fats_g ┆ sugars_g │ @@ -34,6 +34,8 @@ describe("read:csv", () => { │ vegetables ┆ 45 ┆ 0.5 ┆ 2 │ └────────────┴──────────┴────────┴──────────┘`; expect(df.head(1).toString()).toEqual(expected); + df = pl.readCSV(csvpath, { inferSchemaLength: null }); + expect(df.head(1).toString()).toEqual(expected); }); it("can read from a csv file with options", () => { const df = pl.readCSV(csvpath, { hasHeader: false, skipRows: 1, nRows: 4 }); @@ -154,7 +156,11 @@ describe("read:json", () => { expect(df.shape).toEqual({ height: 27, width: 4 }); }); it("can specify read options", () => { - const df = pl.readJSON(jsonpath, { batchSize: 10, inferSchemaLength: 100 }); + let df = pl.readJSON(jsonpath, { batchSize: 10, inferSchemaLength: 100 }); + expect(df.shape).toEqual({ height: 27, width: 4 }); + df = pl.readJSON(jsonpath, { batchSize: 10, inferSchemaLength: null }); + expect(df.shape).toEqual({ height: 27, width: 4 }); + df = pl.readJSON(jsonpath, { batchSize: 10, inferSchemaLength: 0 }); expect(df.shape).toEqual({ height: 27, width: 
4 }); }); it("can read from a json buffer", () => { diff --git a/polars/io.ts b/polars/io.ts index 7c02b960..37ea88ef 100644 --- a/polars/io.ts +++ b/polars/io.ts @@ -186,6 +186,9 @@ export function readCSV(pathOrBody, options?) { options = { ...readCsvDefaultOptions, ...options }; const extensions = [".tsv", ".csv"]; + // Fall back to 0 when inferSchemaLength is `null` or undefined + options.inferSchemaLength = options.inferSchemaLength ?? 0; + if (Buffer.isBuffer(pathOrBody)) { return _DataFrame(pli.readCsv(pathOrBody, options)); } @@ -275,6 +278,8 @@ export function scanCSV( export function scanCSV(path, options?) { options = { ...scanCsvDefaultOptions, ...options }; + // Fall back to 0 when inferSchemaLength is `null` or undefined + options.inferSchemaLength = options.inferSchemaLength ?? 0; return _LazyDataFrame(pli.scanCsv(path, options)); } /** @@ -320,6 +325,10 @@ export function readJSON( options = { ...readJsonDefaultOptions, ...options }; const method = options.format === "lines" ? pli.readJsonLines : pli.readJson; const extensions = [".ndjson", ".json", ".jsonl"]; + + // Fall back to 0 when inferSchemaLength is `null` or undefined + options.inferSchemaLength = options.inferSchemaLength ?? 0; + if (Buffer.isBuffer(pathOrBody)) { return _DataFrame(pli.readJson(pathOrBody, options)); } @@ -382,6 +391,8 @@ export function scanJson( export function scanJson(path: string, options?: Partial) { options = { ...readJsonDefaultOptions, ...options }; + // Fall back to 0 when inferSchemaLength is `null` or undefined + options.inferSchemaLength = options.inferSchemaLength ?? 0; return _LazyDataFrame(pli.scanJson(path, options)); } diff --git a/polars/lazy/expr/string.ts b/polars/lazy/expr/string.ts index f4718f55..239fa4d8 100644 --- a/polars/lazy/expr/string.ts +++ b/polars/lazy/expr/string.ts @@ -164,7 +164,7 @@ export interface StringNamespace extends StringFunctions { * @see https://goessner.net/articles/JsonPath/ * @param jsonPath - A valid JSON path query string * @param dtype - The dtype to cast the extracted value to. If None, the dtype will be inferred from the JSON value.
- * @param inferSchemaLength - How many rows to parse to determine the schema. If ``None`` all rows are used. + * @param inferSchemaLength - How many rows to parse to determine the schema. If `null` all rows are used. * @returns Utf8 array. Contain null if original value is null or the `jsonPath` return nothing. * @example * ``` diff --git a/polars/series/string.ts b/polars/series/string.ts index a2ad7baa..a03b4d07 100644 --- a/polars/series/string.ts +++ b/polars/series/string.ts @@ -132,7 +132,7 @@ export interface StringNamespace extends StringFunctions { * @see https://goessner.net/articles/JsonPath/ * @param jsonPath - A valid JSON path query string * @param dtype - The dtype to cast the extracted value to. If None, the dtype will be inferred from the JSON value. - * @param inferSchemaLength - How many rows to parse to determine the schema. If ``None`` all rows are used. + * @param inferSchemaLength - How many rows to parse to determine the schema. If `null` all rows are used. * @returns Utf8 array. Contain null if original value is null or the `jsonPath` return nothing. * @example * ```