From 94621ffb6e715d0f8d31a50a0acce70dc1823db3 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Thu, 7 Sep 2023 15:33:03 +0100 Subject: [PATCH 01/22] Add some streaming related docstrings --- py-polars/polars/io/csv/functions.py | 9 +++-- py-polars/polars/io/parquet/functions.py | 6 ++-- py-polars/polars/lazyframe/frame.py | 42 ++++++++++++++++++++---- 3 files changed, 46 insertions(+), 11 deletions(-) diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index 177c8828cffe..808c8de677df 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -133,7 +133,8 @@ def read_csv( ``utf8-lossy``, the input is first decoded in memory with python. Defaults to ``utf8``. low_memory - Reduce memory usage at expense of performance. + Reduce memory usage at expense of performance when rechunking into + a single array. To work with larger than-memory datasets use streaming mode. rechunk Make sure that all columns are contiguous in memory by aggregating the chunks into a single array. @@ -502,7 +503,8 @@ def read_csv_batched( ``utf8-lossy``, the input is first decoded in memory with python. Defaults to ``utf8``. low_memory - Reduce memory usage at expense of performance. + Reduce memory usage at expense of performance when rechunking into + a single array. To work with larger than-memory datasets use streaming mode. rechunk Make sure that all columns are contiguous in memory by aggregating the chunks into a single array. @@ -781,7 +783,8 @@ def scan_csv( Lossy means that invalid utf8 values are replaced with ``�`` characters. Defaults to "utf8". low_memory - Reduce memory usage in expense of performance. + Reduce memory usage at expense of performance when rechunking into + a single array. To work with larger than-memory datasets use streaming mode. rechunk Reallocate to contiguous memory when all chunks/ files are parsed. skip_rows_after_header diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 26d660c42fe7..f1b85a5bb544 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -80,7 +80,8 @@ def read_parquet( row_count_offset Offset to start the row_count column (only use if the name is set). low_memory - Reduce memory pressure at the expense of performance. + Reduce memory usage at expense of performance when rechunking into + a single array. To work with larger than-memory datasets use streaming mode. pyarrow_options Keyword arguments for `pyarrow.parquet.read_table `_. @@ -215,7 +216,8 @@ def scan_parquet( particular storage connection. e.g. host, port, username, password, etc. low_memory - Reduce memory pressure at the expense of performance. + Reduce memory usage at expense of performance when rechunking into + a single array. To work with larger than-memory datasets use streaming mode. use_statistics Use statistics in the parquet to determine if pages can be skipped from reading. diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index e7b3757a5a10..9bd4d23a5ad7 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -1624,11 +1624,29 @@ def collect( **kwargs: Any, ) -> DataFrame: """ - Collect into a DataFrame. + Collect a LazyFrame into a DataFrame. - Note: use :func:`fetch` if you want to run your query on the first `n` rows + Use :func:`fetch` if you want to run your query on the first `n` rows only. 
This can be a huge time saver in debugging queries. + By default all query optimizations are applied. Use the arguments to collect to turn off + particular optimizations. + + If streaming is False the entire query is processed in a single batch. + If streaming is True Polars tries to process the query in batches for + larger than memory datasets. Use :func:`explain` to see if Polars can process the query + in streaming mode. Use :func:`polars.set_streaming_chunk_size` to set the size of the + batches. + + See Also + -------- + polars.collect_all : Collect multiple LazyFrames at the same time. + polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. + polars.explain : Print the query plan that is evaluated with collect. + polars.set_streaming_chunk_size : Set the size of batches when streaming is used. + profile : Collect the LazyFrame and time each node in the computation graph. + + Parameters ---------- type_coercion @@ -1676,6 +1694,18 @@ def collect( │ b ┆ 11 ┆ 10 │ │ c ┆ 6 ┆ 1 │ └─────┴─────┴─────┘ + Collect in streaming mode + >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect(streaming=True) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ a ┆ 4 ┆ 10 │ + │ b ┆ 11 ┆ 10 │ + │ c ┆ 6 ┆ 1 │ + └─────┴─────┴─────┘ """ eager = kwargs.get("eager", False) @@ -1830,7 +1860,7 @@ def sink_parquet( slice_pushdown: bool = True, ) -> DataFrame: """ - Persists a LazyFrame at the provided path. + Collect a LazyFrame and write the output in streaming mode to a Parquet file at the provided path. This allows streaming results that are larger than RAM to be written to disk. @@ -1926,7 +1956,7 @@ def sink_ipc( slice_pushdown: bool = True, ) -> DataFrame: """ - Persists a LazyFrame at the provided path. + Collect a LazyFrame and write the output in streaming mode to an IPC file at the provided path. This allows streaming results that are larger than RAM to be written to disk. @@ -2009,7 +2039,7 @@ def sink_csv( slice_pushdown: bool = True, ) -> DataFrame: """ - Persists a LazyFrame at the provided path. + Collect a LazyFrame and write the output in streaming mode to a CSV file at the provided path. This allows streaming results that are larger than RAM to be written to disk. @@ -2629,7 +2659,7 @@ def group_by( maintain_order Ensure that the order of the groups is consistent with the input data. This is slower than a default group by. - Settings this to ``True`` blocks the possibility + Setting this to ``True`` blocks the possibility to run on the streaming engine. Examples From a03b7c09edb98e43d6ca996626815e65afabb91a Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Thu, 7 Sep 2023 16:39:37 +0100 Subject: [PATCH 02/22] Fix lints --- py-polars/polars/lazyframe/frame.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 9bd4d23a5ad7..1a783e729a97 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -1629,13 +1629,14 @@ def collect( Use :func:`fetch` if you want to run your query on the first `n` rows only. This can be a huge time saver in debugging queries. - By default all query optimizations are applied. Use the arguments to collect to turn off - particular optimizations. + By default all query optimizations are applied. Use the arguments + to collect to turn off particular optimizations. 
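For instance, a minimal sketch (the frame and column names here are made up, not part of the original docstring) of collecting with individual optimizations switched off might look like:

```python
import polars as pl

# a small, hypothetical LazyFrame purely for illustration
lf = pl.LazyFrame({"a": ["x", "y", "z"], "b": [1, 2, 3]})

# run the query with two individual optimizations switched off;
# the keyword names mirror the parameters documented below
df = lf.filter(pl.col("b") > 1).collect(
    predicate_pushdown=False,
    projection_pushdown=False,
)
```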
If streaming is False the entire query is processed in a single batch. If streaming is True Polars tries to process the query in batches for - larger than memory datasets. Use :func:`explain` to see if Polars can process the query - in streaming mode. Use :func:`polars.set_streaming_chunk_size` to set the size of the + larger than memory datasets. Use :func:`explain` to see if Polars + can process the query in streaming mode. + Use :func:`polars.set_streaming_chunk_size` to set the size of the batches. See Also @@ -1643,7 +1644,7 @@ def collect( polars.collect_all : Collect multiple LazyFrames at the same time. polars.collect_all_async: Collect multiple LazyFrames at the same time lazily. polars.explain : Print the query plan that is evaluated with collect. - polars.set_streaming_chunk_size : Set the size of batches when streaming is used. + polars.set_streaming_chunk_size : Set the size of streaming batches. profile : Collect the LazyFrame and time each node in the computation graph. @@ -1694,8 +1695,14 @@ def collect( │ b ┆ 11 ┆ 10 │ │ c ┆ 6 ┆ 1 │ └─────┴─────┴─────┘ + Collect in streaming mode - >>> lf.group_by("a", maintain_order=True).agg(pl.all().sum()).collect(streaming=True) + + >>> ( + ... lf.group_by("a", maintain_order=True) + ... .agg(pl.all().sum()) + ... .collect(streaming=True) + ... ) shape: (3, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -1860,7 +1867,7 @@ def sink_parquet( slice_pushdown: bool = True, ) -> DataFrame: """ - Collect a LazyFrame and write the output in streaming mode to a Parquet file at the provided path. + Collect and write a LazyFrame in streaming mode to a Parquet file at the path. This allows streaming results that are larger than RAM to be written to disk. @@ -1956,7 +1963,7 @@ def sink_ipc( slice_pushdown: bool = True, ) -> DataFrame: """ - Collect a LazyFrame and write the output in streaming mode to an IPC file at the provided path. + Collect and write a LazyFrame in streaming mode to an IPC file at the path. This allows streaming results that are larger than RAM to be written to disk. @@ -2039,7 +2046,7 @@ def sink_csv( slice_pushdown: bool = True, ) -> DataFrame: """ - Collect a LazyFrame and write the output in streaming mode to a CSV file at the provided path. + Collect and write a LazyFrame in streaming mode to a CSV file at the path. This allows streaming results that are larger than RAM to be written to disk. From 340fca253171c7da81eceb25ad2b60b8b30bb1ef Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Wed, 13 Sep 2023 09:58:50 +0100 Subject: [PATCH 03/22] update sink strings --- py-polars/polars/lazyframe/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 1a783e729a97..c20cb4448c72 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -1867,7 +1867,7 @@ def sink_parquet( slice_pushdown: bool = True, ) -> DataFrame: """ - Collect and write a LazyFrame in streaming mode to a Parquet file at the path. + Evaluate the query in streaming mode and write to a Parquet file at the path. This allows streaming results that are larger than RAM to be written to disk. @@ -1963,7 +1963,7 @@ def sink_ipc( slice_pushdown: bool = True, ) -> DataFrame: """ - Collect and write a LazyFrame in streaming mode to an IPC file at the path. + Evaluate the query in streaming mode and write to an IPC file at the path. This allows streaming results that are larger than RAM to be written to disk. 
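As a hedged sketch of that workflow (the paths and column name below are hypothetical), a query can be scanned lazily and streamed straight to an IPC file without ever building the full DataFrame in memory:

```python
import polars as pl

# stream a larger-than-RAM query straight to disk; the file paths
# ("big.csv", "out.arrow") are placeholders for illustration only
(
    pl.scan_csv("big.csv")
    .filter(pl.col("value") > 0)
    .sink_ipc("out.arrow")
)
```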
@@ -2046,7 +2046,7 @@ def sink_csv( slice_pushdown: bool = True, ) -> DataFrame: """ - Collect and write a LazyFrame in streaming mode to a CSV file at the path. + Evaluate the query in streaming mode and write to a CSV file at the path. This allows streaming results that are larger than RAM to be written to disk. From ac2707cc82e3aa67d2189afa637ecfb79ad7234c Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Sun, 17 Sep 2023 19:07:29 +0100 Subject: [PATCH 04/22] add config example --- py-polars/polars/config.py | 6 ++++++ py-polars/polars/lazyframe/frame.py | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index 781c1d15b4cf..cc5ef8fc51ec 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -364,6 +364,12 @@ def set_fmt_float(cls, fmt: FloatFmt | None = "mixed") -> type[Config]: fmt : {"mixed", "full"} How to format floating point numbers + Examples + -------- + >>> with pl.Config(set_fmt_float="full"): + s = pl.Series([1.2304980958725870923]) + print(s) + """ _set_float_fmt(fmt="mixed" if fmt is None else fmt) return cls diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 888566149bd2..2b6e1e24b4ff 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -1648,7 +1648,6 @@ def collect( polars.set_streaming_chunk_size : Set the size of streaming batches. profile : Collect the LazyFrame and time each node in the computation graph. - Parameters ---------- type_coercion From abe00e19c63cdebfc6264f7b2e619a11ba374744 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Fri, 22 Sep 2023 14:57:34 +0100 Subject: [PATCH 05/22] Add structify example --- py-polars/polars/config.py | 45 +++++++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index cc5ef8fc51ec..2986dae9cff3 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -347,7 +347,33 @@ def set_ascii_tables(cls, active: bool | None = True) -> type[Config]: @classmethod def set_auto_structify(cls, active: bool | None = False) -> type[Config]: - """Allow multi-output expressions to be automatically turned into Structs.""" + """ + Allow multi-output expressions to be automatically turned into Structs. + + Examples + -------- + >>> df = pl.DataFrame({"v":range(10),"v2":range(10,20)}) + >>> with pl.Config(set_auto_structify=True): + ... out = df.select(pl.all()) + >>> out + shape: (10, 1) + ┌───────────┐ + │ v │ + │ --- │ + │ struct[2] │ + ╞═══════════╡ + │ {0,10} │ + │ {1,11} │ + │ {2,12} │ + │ {3,13} │ + │ … │ + │ {6,16} │ + │ {7,17} │ + │ {8,18} │ + │ {9,19} │ + └───────────┘ + + """ if active is None: os.environ.pop("POLARS_AUTO_STRUCTIFY", None) else: @@ -366,9 +392,22 @@ def set_fmt_float(cls, fmt: FloatFmt | None = "mixed") -> type[Config]: Examples -------- + >>> s = pl.Series([1.2304980958725870923]) + >>> with pl.Config(set_fmt_float="mixed"): + ... print(s) + shape: (1,) + Series: '' [f64] + [ + 1.230498 + ] + >>> with pl.Config(set_fmt_float="full"): - s = pl.Series([1.2304980958725870923]) - print(s) + ... 
print(s) + shape: (1,) + Series: '' [f64] + [ + 1.230498095872587 + ] """ _set_float_fmt(fmt="mixed" if fmt is None else fmt) From 489cd722d1487d57c58b976b714377730e98bf06 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Mon, 25 Sep 2023 14:04:54 +0100 Subject: [PATCH 06/22] checking tests --- py-polars/polars/dataframe/frame.py | 2 +- py-polars/polars/lazyframe/frame.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 97a7f8ae639a..a16b96bfd73e 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -6532,7 +6532,7 @@ def clear(self, n: int = 0) -> Self: def clone(self) -> Self: """ - Cheap deepcopy/clone. + Create a copy of a DataFrame. This is a cheap operation that does not copy data. See Also -------- diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 06f7e783a94d..25b7ff13f38f 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2480,7 +2480,7 @@ def clear(self, n: int = 0) -> LazyFrame: def clone(self) -> Self: """ - Very cheap deepcopy/clone. + Create a copy of a LazyFrame. This is a cheap operation that does not copy data. See Also -------- From eda8aa8282e034ff4b8b410f9ff0ccebc4719ffe Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Thu, 28 Sep 2023 19:49:33 +0100 Subject: [PATCH 07/22] Add scan parquet options --- py-polars/polars/io/parquet/functions.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index 33cc7c023f4f..a504ad82ec29 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -238,12 +238,22 @@ def scan_parquet( Examples -------- >>> source = "s3://bucket/*.parquet" + >>> pl.scan_parquet(source) # doctest: +SKIP >>> storage_options = { ... "aws_access_key_id": "", ... "aws_secret_access_key": "", ... } >>> pl.scan_parquet(source, storage_options=storage_options) # doctest: +SKIP + If you get a missing region error then you can set the region in the storage options: + >>> source = "s3://bucket/*.parquet" + >>> storage_options = { + ... "aws_access_key_id": "", + ... "aws_secret_access_key": "", + ... "region": "us-east-1", + ... 
} + >>> pl.scan_parquet(source, storage_options=storage_options) # doctest: +SKIP + See Also -------- read_parquet From c56e2b9c0d0b59ed3e5ae861b979c076c92705cf Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Fri, 29 Sep 2023 15:10:31 +0100 Subject: [PATCH 08/22] update docs --- docs/src/python/user-guide/io/aws.py | 28 ++++++++++++++++++----- docs/user-guide/io/aws.md | 29 +++++++++++++++++++----- py-polars/polars/io/parquet/functions.py | 2 +- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/docs/src/python/user-guide/io/aws.py b/docs/src/python/user-guide/io/aws.py index c8bfa94941d2..c8d51af22e8e 100644 --- a/docs/src/python/user-guide/io/aws.py +++ b/docs/src/python/user-guide/io/aws.py @@ -1,14 +1,30 @@ """ -# --8<-- [start:bucket] +# --8<-- [start:read_parquet] import polars as pl -import pyarrow.parquet as pq -import s3fs -fs = s3fs.S3FileSystem() bucket = "" path = "" -dataset = pq.ParquetDataset(f"s3://{bucket}/{path}", filesystem=fs) -df = pl.from_arrow(dataset.read()) +df = pl.read_parquet(f"s3://{bucket}/{path}") # --8<-- [end:bucket] + +# --8<-- [start:scan_parquet] +import polars as pl + +bucket = "" +path = "" + +df = pl.scan_parquet(f"s3://{bucket}/{path}") +# --8<-- [end:scan_parquet] + +# --8<-- [start:scan_parquet_query] +import polars as pl + +bucket = "" +path = "" + +df = pl.scan_parquet(f"s3://{bucket}/{path}").filter(pl.col("id") < 100).select("id","value") +# --8<-- [end:scan_parquet_query] + + """ diff --git a/docs/user-guide/io/aws.md b/docs/user-guide/io/aws.md index 27c9cfeaf453..4f643cffe1f2 100644 --- a/docs/user-guide/io/aws.md +++ b/docs/user-guide/io/aws.md @@ -2,7 +2,12 @@ --8<-- "docs/_build/snippets/under_construction.md" -To read from or write to an AWS bucket, additional dependencies are needed in Rust: +To read from or write to an AWS bucket, additional dependencies may be needed: +=== ":fontawesome-brands-python: Python" + +```shell +$ pip install fsspec +``` === ":fontawesome-brands-rust: Rust" @@ -10,11 +15,23 @@ To read from or write to an AWS bucket, additional dependencies are needed in Ru $ cargo add aws_sdk_s3 aws_config tokio --features tokio/full ``` -In the next few snippets we'll demonstrate interacting with a `Parquet` file -located on an AWS bucket. - ## Read -Load a `.parquet` file using: +We can read a `.parquet` file in eager mode from an AWS bucket: + +{{code_block('user-guide/io/aws','read_parquet',[])}} + +This downloads the file to a temporary location and reads it from there. + +## Scan + +We can scan a `.parquet` file in lazy mode from an AWS bucket: + +{{code_block('user-guide/io/aws','scan_parquet',[])}} + +This creates a `LazyFrame` without downloading the file. We have access to file metadata such as the schema. + +If we create a lazy query with predicate and projection pushdowns the query optimiser will apply them before the file is downloaded + +{{code_block('user-guide/io/aws','scan_parquet_query',[])}} -{{code_block('user-guide/io/aws','bucket',[])}} diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index a504ad82ec29..7214ac28e1db 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -239,12 +239,12 @@ def scan_parquet( -------- >>> source = "s3://bucket/*.parquet" >>> pl.scan_parquet(source) # doctest: +SKIP + Pass in storage options to connect to the cloud provider: >>> storage_options = { ... "aws_access_key_id": "", ... "aws_secret_access_key": "", ... 
} >>> pl.scan_parquet(source, storage_options=storage_options) # doctest: +SKIP - If you get a missing region error then you can set the region in the storage options: >>> source = "s3://bucket/*.parquet" >>> storage_options = { From f9044cd8f4e741c4eeeae118920423714a67e96b Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Tue, 3 Oct 2023 13:03:24 +0100 Subject: [PATCH 09/22] Update user guide docs for IO --- docs/src/python/user-guide/io/aws.py | 30 --------- .../src/python/user-guide/io/cloud-storage.py | 63 +++++++++++++++++++ docs/src/python/user-guide/io/database.py | 32 +++++++--- .../io/{aws.rs => cloud-storage.rs} | 17 ++++- docs/user-guide/io/aws.md | 37 ----------- docs/user-guide/io/cloud-storage.md | 47 ++++++++++++++ docs/user-guide/io/csv.md | 2 + docs/user-guide/io/database.md | 21 +++++-- docs/user-guide/io/json_file.md | 9 ++- docs/user-guide/io/parquet.md | 22 ++++--- mkdocs.yml | 2 +- 11 files changed, 186 insertions(+), 96 deletions(-) delete mode 100644 docs/src/python/user-guide/io/aws.py create mode 100644 docs/src/python/user-guide/io/cloud-storage.py rename docs/src/rust/user-guide/io/{aws.rs => cloud-storage.rs} (67%) delete mode 100644 docs/user-guide/io/aws.md create mode 100644 docs/user-guide/io/cloud-storage.md diff --git a/docs/src/python/user-guide/io/aws.py b/docs/src/python/user-guide/io/aws.py deleted file mode 100644 index c8d51af22e8e..000000000000 --- a/docs/src/python/user-guide/io/aws.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -# --8<-- [start:read_parquet] -import polars as pl - -bucket = "" -path = "" - -df = pl.read_parquet(f"s3://{bucket}/{path}") -# --8<-- [end:bucket] - -# --8<-- [start:scan_parquet] -import polars as pl - -bucket = "" -path = "" - -df = pl.scan_parquet(f"s3://{bucket}/{path}") -# --8<-- [end:scan_parquet] - -# --8<-- [start:scan_parquet_query] -import polars as pl - -bucket = "" -path = "" - -df = pl.scan_parquet(f"s3://{bucket}/{path}").filter(pl.col("id") < 100).select("id","value") -# --8<-- [end:scan_parquet_query] - - -""" diff --git a/docs/src/python/user-guide/io/cloud-storage.py b/docs/src/python/user-guide/io/cloud-storage.py new file mode 100644 index 000000000000..0f968e15f97b --- /dev/null +++ b/docs/src/python/user-guide/io/cloud-storage.py @@ -0,0 +1,63 @@ +""" +# --8<-- [start:read_parquet] +import polars as pl + +source = "s3://bucket/*.parquet" + +df = pl.read_parquet(source) +# --8<-- [end:read_parquet] + +# --8<-- [start:scan_parquet] +import polars as pl + +source = "s3://bucket/*.parquet" + +storage_options = { + "aws_access_key_id": "", + "aws_secret_access_key": "", + "aws_region": "us-east-1", +} +df = pl.scan_parquet(source, storage_options=storage_options) +# --8<-- [end:scan_parquet] + +# --8<-- [start:scan_parquet_query] +import polars as pl + +source = "s3://bucket/*.parquet" + + +df = pl.scan_parquet(source).filter(pl.col("id") < 100).select("id","value").collect() +# --8<-- [end:scan_parquet_query] + +# --8<-- [start:scan_pyarrow_dataset] +import polars as pl +import pyarrow.dataset as ds + +dset = ds.dataset("s3://my-partitioned-folder/", format="parquet") +( + pl.scan_pyarrow_dataset(dset) + .filter("foo" == "a") + .select(["foo", "bar"]) + .collect() +) +# --8<-- [end:scan_pyarrow_dataset] + +# --8<-- [start:write_parquet] + +import polars as pl +import s3fs + +df = pl.DataFrame({ + "foo": ["a", "b", "c", "d", "d"], + "bar": [1, 2, 3, 4, 5], +}) + +fs = s3fs.S3FileSystem() +destination = "s3://bucket/my_file.parquet" + +# write parquet +with fs.open(destination, mode='wb') as f: + 
df.write_parquet(f) +# --8<-- [end:write_parquet] + +""" diff --git a/docs/src/python/user-guide/io/database.py b/docs/src/python/user-guide/io/database.py index 97e8f659de73..b37045719995 100644 --- a/docs/src/python/user-guide/io/database.py +++ b/docs/src/python/user-guide/io/database.py @@ -1,32 +1,44 @@ """ -# --8<-- [start:read] +# --8<-- [start:read_uri] import polars as pl -connection_uri = "postgres://username:password@server:port/database" +uri = "postgres://username:password@server:port/database" query = "SELECT * FROM foo" -pl.read_database(query=query, connection_uri=connection_uri) -# --8<-- [end:read] +pl.read_database_uri(query=query, uri=uri) +# --8<-- [end:read_uri] + +# --8<-- [start:read_cursor] +import polars as pl +from sqlalchemy import create_engine + +conn = create_engine(f"sqlite:///test.db") + +query = "SELECT * FROM foo" + +pl.read_database(query=query, connection=conn.connect()) +# --8<-- [end:read_cursor] + # --8<-- [start:adbc] -connection_uri = "postgres://username:password@server:port/database" +uri = "postgres://username:password@server:port/database" query = "SELECT * FROM foo" -pl.read_database(query=query, connection_uri=connection_uri, engine="adbc") +pl.read_database_uri(query=query, uri=uri, engine="adbc") # --8<-- [end:adbc] # --8<-- [start:write] -connection_uri = "postgres://username:password@server:port/database" +uri = "postgres://username:password@server:port/database" df = pl.DataFrame({"foo": [1, 2, 3]}) -df.write_database(table_name="records", connection_uri=connection_uri) +df.write_database(table_name="records", uri=uri) # --8<-- [end:write] # --8<-- [start:write_adbc] -connection_uri = "postgres://username:password@server:port/database" +uri = "postgres://username:password@server:port/database" df = pl.DataFrame({"foo": [1, 2, 3]}) -df.write_database(table_name="records", connection_uri=connection_uri, engine="adbc") +df.write_database(table_name="records", uri=uri, engine="adbc") # --8<-- [end:write_adbc] """ diff --git a/docs/src/rust/user-guide/io/aws.rs b/docs/src/rust/user-guide/io/cloud-storage.rs similarity index 67% rename from docs/src/rust/user-guide/io/aws.rs rename to docs/src/rust/user-guide/io/cloud-storage.rs index 0a1924d9d294..4118e520628d 100644 --- a/docs/src/rust/user-guide/io/aws.rs +++ b/docs/src/rust/user-guide/io/cloud-storage.rs @@ -1,5 +1,5 @@ """ -# --8<-- [start:bucket] +# --8<-- [start:read_parquet] use aws_sdk_s3::Region; use aws_config::meta::region::RegionProviderChain; @@ -28,5 +28,18 @@ async fn main() { println!("{:?}", df); } -# --8<-- [end:bucket] +# --8<-- [end:read_parquet] + +# --8<-- [start:scan_parquet] +# --8<-- [end:scan_parquet] + +# --8<-- [start:scan_parquet_query] +# --8<-- [end:scan_parquet_query] + +# --8<-- [start:scan_pyarrow_dataset] +# --8<-- [end:scan_pyarrow_dataset] + +# --8<-- [start:write_parquet] +# --8<-- [end:write_parquet] + """ diff --git a/docs/user-guide/io/aws.md b/docs/user-guide/io/aws.md deleted file mode 100644 index 4f643cffe1f2..000000000000 --- a/docs/user-guide/io/aws.md +++ /dev/null @@ -1,37 +0,0 @@ -# AWS - ---8<-- "docs/_build/snippets/under_construction.md" - -To read from or write to an AWS bucket, additional dependencies may be needed: -=== ":fontawesome-brands-python: Python" - -```shell -$ pip install fsspec -``` - -=== ":fontawesome-brands-rust: Rust" - -```shell -$ cargo add aws_sdk_s3 aws_config tokio --features tokio/full -``` - -## Read - -We can read a `.parquet` file in eager mode from an AWS bucket: - 
-{{code_block('user-guide/io/aws','read_parquet',[])}} - -This downloads the file to a temporary location and reads it from there. - -## Scan - -We can scan a `.parquet` file in lazy mode from an AWS bucket: - -{{code_block('user-guide/io/aws','scan_parquet',[])}} - -This creates a `LazyFrame` without downloading the file. We have access to file metadata such as the schema. - -If we create a lazy query with predicate and projection pushdowns the query optimiser will apply them before the file is downloaded - -{{code_block('user-guide/io/aws','scan_parquet_query',[])}} - diff --git a/docs/user-guide/io/cloud-storage.md b/docs/user-guide/io/cloud-storage.md new file mode 100644 index 000000000000..c6b8efaadc86 --- /dev/null +++ b/docs/user-guide/io/cloud-storage.md @@ -0,0 +1,47 @@ +# Cloud storage + +Polars can read and write to AWS S3, Azure Blob Storage and Google Cloud Storage. The API is the same for all three storage providers. + +To read from cloud storage, additional dependencies may be needed depending on the use case and cloud storage provider: +=== ":fontawesome-brands-python: Python" + + ```shell + $ pip install fsspec s3fs adlfs gcsfs + ``` + +=== ":fontawesome-brands-rust: Rust" + + ```shell + $ cargo add aws_sdk_s3 aws_config tokio --features tokio/full + ``` + +## Reading from cloud storage + +Polars can read a CSV, IPC or Parquet file in eager mode from cloud storage. + +{{code_block('user-guide/io/cloud-storage','read_parquet',[read_parquet,read_csv,read_ipc])}} + +This eager query downloads the file to a buffer in memory and creates a `DataFrame` from there. Polars uses `fsspec` to manage this download internally for all cloud storage providers. + +## Scanning from cloud storage with query optimisation + +Polars can scan a Parquet file in lazy mode from cloud storage. We may need to provide further details beyond the source url such as authentication details or storage region. Polars looks for these as environment variables but we can also do this manually by passing a `dict` as the `storage_options` argument. + +{{code_block('user-guide/io/cloud-storage','scan_parquet',[scan_parquet])}} + +This query creates a `LazyFrame` without downloading the file. In the `LazyFrame` we have access to file metadata such as the schema. Polars uses the `object_store.rs` library internally to manage the interface with the cloud storage providers and so no extra dependencies are required in Python to scan a cloud Parquet file. + +If we create a lazy query with [predicate and projection pushdowns](/polars/user-guide/lazy/optimizations/) the query optimiser will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`. + +{{code_block('user-guide/io/cloud-storage','scan_parquet_query',[])}} + +## Scanning with PyArrow +We can also scan from cloud storage using PyArrow. This is particularly useful for partitioned datasets such as Hive partitioning. + +We first create a PyArrow dataset and then create a `LazyFrame` from the dataset. +{{code_block('user-guide/io/cloud-storage','scan_pyarrow_dataset',[scan_pyarrow_dataset])}} + +## Writing to cloud storage +We can write a `DataFrame` to cloud storage in Python using s3fs for S3, adlfs for Azure Blob Storage and gcsfs for Google Cloud Storage. In this example we write a Parquet file to S3. 
+ +{{code_block('user-guide/io/cloud-storage','write_parquet',[write_parquet])}} diff --git a/docs/user-guide/io/csv.md b/docs/user-guide/io/csv.md index eeb209dfb34e..dc3304b91d12 100644 --- a/docs/user-guide/io/csv.md +++ b/docs/user-guide/io/csv.md @@ -10,6 +10,8 @@ Writing a CSV file is similar with the `write_csv` function: {{code_block('user-guide/io/csv','write',['write_csv'])}} +For reading and writing a CSV from a cloud storage provider, see [Cloud storage](cloud-storage.md). + ## Scan `Polars` allows you to _scan_ a CSV input. Scanning delays the actual parsing of the diff --git a/docs/user-guide/io/database.md b/docs/user-guide/io/database.md index 4444e7be799e..0385b3f51e9f 100644 --- a/docs/user-guide/io/database.md +++ b/docs/user-guide/io/database.md @@ -2,19 +2,30 @@ ## Read from a database -We can read from a database with Polars using the `pl.read_database` function. To use this function you need an SQL query string and a connection string called a `connection_uri`. +Polars can read from a database using either the `pl.read_database_uri` and `pl.read_database` functions. -For example, the following snippet shows the general patterns for reading all columns from the `foo` table in a Postgres database: +### Difference between the `read_database` functions -{{code_block('user-guide/io/database','read',['read_database_connectorx'])}} +Use `pl.read_database_uri` if you want to specify the database connection with a connection string called a `uri`. For example, the following snippet shows a query to read all columns from the `foo` table in a Postgres database where we use the `uri` to connect: + +{{code_block('user-guide/io/database','read_uri',['read_database_uri'])}} + +On the other hand use `pl.read_database` if you want to connect via a connection engine created with a library like SQLAlchemy. +{{code_block('user-guide/io/database','read_cursor',['read_database'])}} + +Note that `pl.read_database_uri` is likely to be faster than `pl.read_database` if you are using a SQLAlchemy or DBAPI2 connection as these connections may load the data row-wise into Python before copying the data again to the column-wise Apache Arrow format. ### Engines -Polars doesn't manage connections and data transfer from databases by itself. Instead external libraries (known as _engines_) handle this. At present Polars can use two engines to read from databases: +Polars doesn't manage connections and data transfer from databases by itself. Instead external libraries (known as _engines_) handle this. + +If you use `pl.read_database` then you specify the engine when you make the connection object. If you use `pl.read_database_uri` then you can specify one of two engines to read from the database: - [ConnectorX](https://github.com/sfu-db/connector-x) and - [ADBC](https://arrow.apache.org/docs/format/ADBC.html) +Both engines have native support for Apache Arrow and so can read data directly into a Polars `DataFrame` without copying the data. + #### ConnectorX ConnectorX is the default engine and [supports numerous databases](https://github.com/sfu-db/connector-x#sources) including Postgres, Mysql, SQL Server and Redshift. ConnectorX is written in Rust and stores data in Arrow format to allow for zero-copy to Polars. @@ -27,7 +38,7 @@ $ pip install connectorx #### ADBC -ADBC (Arrow Database Connectivity) is an engine supported by the Apache Arrow project. ADBC aims to be both an API standard for connecting to databases and libraries implementing this standard in a range of languages. 
+ADBC (Arrow Database Connectivity) is a new engine supported by the Apache Arrow project. It is still early days for ADBC so support for different databases is still limited. At present drivers for ADBC are only available for [Postgres and SQLite](https://arrow.apache.org/adbc/0.1.0/driver/cpp/index.html). To install ADBC you need to install the driver for your database. For example to install the driver for SQLite you run diff --git a/docs/user-guide/io/json_file.md b/docs/user-guide/io/json_file.md index 352904829c7b..5f62759f81a5 100644 --- a/docs/user-guide/io/json_file.md +++ b/docs/user-guide/io/json_file.md @@ -1,6 +1,7 @@ # JSON files +Polars can read and write both standard JSON and newline-delimited JSON (NDJSON). -## Read & write +## Read ### JSON @@ -12,13 +13,15 @@ Reading a JSON file should look familiar: JSON objects that are delimited by newlines can be read into polars in a much more performant way than standard json. +Polars can read an ND-JSON file into a `DataFrame` using the `read_ndjson` function: + {{code_block('user-guide/io/json-file','readnd',['read_ndjson'])}} ## Write - {{code_block('user-guide/io/json-file','write',['write_json','write_ndjson'])}} -## Scan + +## Scan NDJSON `Polars` allows you to _scan_ a JSON input **only for newline delimited json**. Scanning delays the actual parsing of the file and instead returns a lazy computation holder called a `LazyFrame`. diff --git a/docs/user-guide/io/parquet.md b/docs/user-guide/io/parquet.md index 71a5399bb393..e31481fcbdd9 100644 --- a/docs/user-guide/io/parquet.md +++ b/docs/user-guide/io/parquet.md @@ -1,24 +1,30 @@ # Parquet -Loading or writing [`Parquet` files](https://parquet.apache.org/) is lightning fast. -`Pandas` uses [`PyArrow`](https://arrow.apache.org/docs/python/) -`Python` bindings -exposed by `Arrow`- to load `Parquet` files into memory, but it has to copy that data into -`Pandas` memory. With `Polars` there is no extra cost due to -copying as we read `Parquet` directly into `Arrow` memory and _keep it there_. +Loading or writing [`Parquet` files](https://parquet.apache.org/) is lightning fast as the layout of data in a Polars `DataFrame` in memory mirrors the layout of a Parquet file on disk in many respects. -## Read +Unlike CSV, Parquet is a columnar format. This means that the data is stored in columns rather than rows. This is a more efficient way of storing data as it allows for better compression and faster access to data. +## Read +We can read a `Parquet` file into a `DataFrame` using the `read_parquet` function: {{code_block('user-guide/io/parquet','read',['read_parquet'])}} +For reading a Parquet file from a cloud storage provider, see [Cloud storage](cloud-storage.md/#reading-from-cloud-storage). + + ## Write {{code_block('user-guide/io/parquet','write',['write_parquet'])}} +For writing a Parquet file to a cloud storage provider, see [Cloud storage](cloud-storage.md/#writing-to-cloud-storage). + ## Scan -`Polars` allows you to _scan_ a `Parquet` input. Scanning delays the actual parsing of the -file and instead returns a lazy computation holder called a `LazyFrame`. +`Polars` allows you to _scan_ a `Parquet` input. Scanning delays the actual parsing of the file and instead returns a lazy computation holder called a `LazyFrame`. {{code_block('user-guide/io/parquet','scan',['scan_parquet'])}} If you want to know why this is desirable, you can read more about those `Polars` optimizations [here](../concepts/lazy-vs-eager.md). 
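To make the benefit of scanning concrete, here is a small hypothetical sketch (the file and column names are invented) in which the filter and column selection are pushed into the scan, so only the needed data is ever read:

```python
import polars as pl

# "data.parquet", "id" and "value" are placeholder names; only the rows
# matching the predicate and the two selected columns are materialised
df = (
    pl.scan_parquet("data.parquet")
    .filter(pl.col("id") < 100)
    .select(["id", "value"])
    .collect()
)
```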
+ +When we scan a `Parquet` file stored in the cloud, we can also apply predicate and projection pushdowns. This can significantly reduce the amount of data that needs to be downloaded. For scanning a Parquet file in the cloud, see [Cloud storage](cloud-storage.md/#scanning-from-cloud-storage-with-query-optimisation). + + diff --git a/mkdocs.yml b/mkdocs.yml index 425033e2eb19..541683d685e3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -64,7 +64,7 @@ nav: - user-guide/io/json_file.md - user-guide/io/multiple.md - user-guide/io/database.md - - user-guide/io/aws.md + - user-guide/io/cloud-storage.md - user-guide/io/bigquery.md - SQL: - user-guide/sql/intro.md From 020a21753bbb20e488b91bdac372405921d377fb Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Tue, 3 Oct 2023 13:03:44 +0100 Subject: [PATCH 10/22] add python JSON --- docs/src/python/user-guide/io/json-file.py | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 docs/src/python/user-guide/io/json-file.py diff --git a/docs/src/python/user-guide/io/json-file.py b/docs/src/python/user-guide/io/json-file.py new file mode 100644 index 000000000000..8e6ba3955dc4 --- /dev/null +++ b/docs/src/python/user-guide/io/json-file.py @@ -0,0 +1,24 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +""" +# --8<-- [start:read] +df = pl.read_json("docs/data/path.json") +# --8<-- [end:read] + +# --8<-- [start:readnd] +df = pl.read_ndjson("docs/data/path.json") +# --8<-- [end:readnd] + +""" + +# --8<-- [start:write] +df = pl.DataFrame({"foo": [1, 2, 3], "bar": [None, "bak", "baz"]}) +df.write_json("docs/data/path.json") +# --8<-- [end:write] + +# --8<-- [start:scan] +df = pl.scan_ndjson("docs/data/path.json") +# --8<-- [end:scan] From d5a49a5f03f4fb071f64f43be9faca06050e2b27 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Wed, 4 Oct 2023 07:17:20 +0100 Subject: [PATCH 11/22] Update contributing for user guide --- CONTRIBUTING.md | 49 +++++++++++++++++++++++- py-polars/polars/config.py | 5 ++- py-polars/polars/io/parquet/functions.py | 2 +- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4040af393985..2282998898d2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -151,8 +151,53 @@ The most important components of Polars documentation are the [user guide](https ### User guide -The user guide is maintained in the `docs` folder. -Further contributing information will be added shortly. +The user guide is maintained in the `docs/user-guide` folder. Before creating a PR first raise an issue to discuss what you feel is missing or could be improved. + +#### Building and serving the user guide +The user guide is built using [MkDocs](https://www.mkdocs.org/). You install the dependencies for building the user guide by running `make requirements` in the root of the repo. + +Run `mkdocs serve` to build and serve the user guide so you can view it locally and see updates as you make changes. + +#### Creating a new user guide page +Each user guide page is based on a `.md` markdown file. This file must be listed in `mkdocs.yml`. 
+ +#### Adding a shell code block +To add a code block with code to be run in a shell with tabs for Python and Rust use the following format +` +=== ":fontawesome-brands-python: Python" + + ```shell + $ pip install fsspec + ``` + +=== ":fontawesome-brands-rust: Rust" + + ```shell + $ cargo add aws_sdk_s3 + ``` +` +#### Adding a code block +The snippets for Python and Rust code blocks are in the `docs/src/python/` and `docs/src/rust/` directories respectively. To add a code snippet with Python or Rust code to a `.md` page use the following format +` +{{code_block('user-guide/io/cloud-storage','read_parquet',[read_parquet,read_csv])}} +` +- The first argument is a path to either or both files called `docs/src/python/user-guide/io/cloud-storage.py` and `docs/src/rust/user-guide/io/cloud-storage.rs`. +- The second argument is the name given at the start and end of each snippet in the `.py` or `.rs` file +- The third argument is a list of links to functions in the API docs. For each element of the list there must be a corresponding entry in `docs/_build/API_REFERENCE_LINKS.yml` + +If the corresponding `.py` and `.rs` snippet files both exist then each snippet named in the second argument to `code_block` above must exist or the build will fail. An empty snippet should be added to the `.py` or `.rs` file if the snippet is not needed. + +Each snippet is formatted as follows: +` +# --8<-- [start:read_parquet] +import polars as pl + +source = "s3://bucket/*.parquet" + +df = pl.read_parquet(source) +# --8<-- [end:read_parquet] +` +The snippet is delimited by `--8<-- [start:]` and `--8<-- [end:]`. The snippet name must match the name given in the second argument to `code_block` above. ### API reference diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index 4baa59974778..05fa3f0229b9 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -404,9 +404,10 @@ def set_auto_structify(cls, active: bool | None = False) -> type[Config]: Examples -------- - >>> df = pl.DataFrame({"v":range(10),"v2":range(10,20)}) + >>> df = pl.DataFrame({"v": range(10), "v2": range(10, 20)}) >>> with pl.Config(set_auto_structify=True): ... out = df.select(pl.all()) + ... >>> out shape: (10, 1) ┌───────────┐ @@ -447,6 +448,7 @@ def set_fmt_float(cls, fmt: FloatFmt | None = "mixed") -> type[Config]: >>> s = pl.Series([1.2304980958725870923]) >>> with pl.Config(set_fmt_float="mixed"): ... print(s) + ... shape: (1,) Series: '' [f64] [ @@ -455,6 +457,7 @@ def set_fmt_float(cls, fmt: FloatFmt | None = "mixed") -> type[Config]: >>> with pl.Config(set_fmt_float="full"): ... print(s) + ... shape: (1,) Series: '' [f64] [ diff --git a/py-polars/polars/io/parquet/functions.py b/py-polars/polars/io/parquet/functions.py index bb6d01490098..c67afc79be48 100644 --- a/py-polars/polars/io/parquet/functions.py +++ b/py-polars/polars/io/parquet/functions.py @@ -249,7 +249,7 @@ def scan_parquet( ... "aws_region": "us-east-1", ... } >>> pl.scan_parquet(source, storage_options=storage_options) # doctest: +SKIP - If you get a missing region error then you can set the region in the storage options: + If you get a missing region error then set the region in the storage options: >>> source = "s3://bucket/*.parquet" >>> storage_options = { ... 
"aws_access_key_id": "", From a9d99f829605e70c665715b9ecff9626ec335972 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Wed, 4 Oct 2023 07:27:30 +0100 Subject: [PATCH 12/22] Run dprint --- docs/user-guide/io/cloud-storage.md | 2 ++ docs/user-guide/io/database.md | 2 +- docs/user-guide/io/json_file.md | 3 ++- docs/user-guide/io/parquet.md | 6 ++---- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/user-guide/io/cloud-storage.md b/docs/user-guide/io/cloud-storage.md index c6b8efaadc86..b30006812196 100644 --- a/docs/user-guide/io/cloud-storage.md +++ b/docs/user-guide/io/cloud-storage.md @@ -36,12 +36,14 @@ If we create a lazy query with [predicate and projection pushdowns](/polars/user {{code_block('user-guide/io/cloud-storage','scan_parquet_query',[])}} ## Scanning with PyArrow + We can also scan from cloud storage using PyArrow. This is particularly useful for partitioned datasets such as Hive partitioning. We first create a PyArrow dataset and then create a `LazyFrame` from the dataset. {{code_block('user-guide/io/cloud-storage','scan_pyarrow_dataset',[scan_pyarrow_dataset])}} ## Writing to cloud storage + We can write a `DataFrame` to cloud storage in Python using s3fs for S3, adlfs for Azure Blob Storage and gcsfs for Google Cloud Storage. In this example we write a Parquet file to S3. {{code_block('user-guide/io/cloud-storage','write_parquet',[write_parquet])}} diff --git a/docs/user-guide/io/database.md b/docs/user-guide/io/database.md index 0385b3f51e9f..4b18c48f85e0 100644 --- a/docs/user-guide/io/database.md +++ b/docs/user-guide/io/database.md @@ -2,7 +2,7 @@ ## Read from a database -Polars can read from a database using either the `pl.read_database_uri` and `pl.read_database` functions. +Polars can read from a database using either the `pl.read_database_uri` and `pl.read_database` functions. ### Difference between the `read_database` functions diff --git a/docs/user-guide/io/json_file.md b/docs/user-guide/io/json_file.md index 5f62759f81a5..0f4e4fae6195 100644 --- a/docs/user-guide/io/json_file.md +++ b/docs/user-guide/io/json_file.md @@ -1,4 +1,5 @@ # JSON files + Polars can read and write both standard JSON and newline-delimited JSON (NDJSON). ## Read @@ -18,8 +19,8 @@ Polars can read an ND-JSON file into a `DataFrame` using the `read_ndjson` funct {{code_block('user-guide/io/json-file','readnd',['read_ndjson'])}} ## Write -{{code_block('user-guide/io/json-file','write',['write_json','write_ndjson'])}} +{{code_block('user-guide/io/json-file','write',['write_json','write_ndjson'])}} ## Scan NDJSON diff --git a/docs/user-guide/io/parquet.md b/docs/user-guide/io/parquet.md index e31481fcbdd9..ac071f507d56 100644 --- a/docs/user-guide/io/parquet.md +++ b/docs/user-guide/io/parquet.md @@ -2,15 +2,15 @@ Loading or writing [`Parquet` files](https://parquet.apache.org/) is lightning fast as the layout of data in a Polars `DataFrame` in memory mirrors the layout of a Parquet file on disk in many respects. -Unlike CSV, Parquet is a columnar format. This means that the data is stored in columns rather than rows. This is a more efficient way of storing data as it allows for better compression and faster access to data. +Unlike CSV, Parquet is a columnar format. This means that the data is stored in columns rather than rows. This is a more efficient way of storing data as it allows for better compression and faster access to data. 
## Read + We can read a `Parquet` file into a `DataFrame` using the `read_parquet` function: {{code_block('user-guide/io/parquet','read',['read_parquet'])}} For reading a Parquet file from a cloud storage provider, see [Cloud storage](cloud-storage.md/#reading-from-cloud-storage). - ## Write {{code_block('user-guide/io/parquet','write',['write_parquet'])}} @@ -26,5 +26,3 @@ For writing a Parquet file to a cloud storage provider, see [Cloud storage](clou If you want to know why this is desirable, you can read more about those `Polars` optimizations [here](../concepts/lazy-vs-eager.md). When we scan a `Parquet` file stored in the cloud, we can also apply predicate and projection pushdowns. This can significantly reduce the amount of data that needs to be downloaded. For scanning a Parquet file in the cloud, see [Cloud storage](cloud-storage.md/#scanning-from-cloud-storage-with-query-optimisation). - - From 4ea1b8736aeec252036902fdd1c638bdc2872b14 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Wed, 4 Oct 2023 07:29:06 +0100 Subject: [PATCH 13/22] Run dprint --- CONTRIBUTING.md | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2282998898d2..2590dcf8703e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -154,14 +154,17 @@ The most important components of Polars documentation are the [user guide](https The user guide is maintained in the `docs/user-guide` folder. Before creating a PR first raise an issue to discuss what you feel is missing or could be improved. #### Building and serving the user guide + The user guide is built using [MkDocs](https://www.mkdocs.org/). You install the dependencies for building the user guide by running `make requirements` in the root of the repo. Run `mkdocs serve` to build and serve the user guide so you can view it locally and see updates as you make changes. #### Creating a new user guide page -Each user guide page is based on a `.md` markdown file. This file must be listed in `mkdocs.yml`. + +Each user guide page is based on a `.md` markdown file. This file must be listed in `mkdocs.yml`. #### Adding a shell code block + To add a code block with code to be run in a shell with tabs for Python and Rust use the following format ` === ":fontawesome-brands-python: Python" @@ -175,12 +178,14 @@ To add a code block with code to be run in a shell with tabs for Python and Rust ```shell $ cargo add aws_sdk_s3 ``` + ` + #### Adding a code block + The snippets for Python and Rust code blocks are in the `docs/src/python/` and `docs/src/rust/` directories respectively. To add a code snippet with Python or Rust code to a `.md` page use the following format -` -{{code_block('user-guide/io/cloud-storage','read_parquet',[read_parquet,read_csv])}} -` +`{{code_block('user-guide/io/cloud-storage','read_parquet',[read_parquet,read_csv])}}` + - The first argument is a path to either or both files called `docs/src/python/user-guide/io/cloud-storage.py` and `docs/src/rust/user-guide/io/cloud-storage.rs`. - The second argument is the name given at the start and end of each snippet in the `.py` or `.rs` file - The third argument is a list of links to functions in the API docs. 
For each element of the list there must be a corresponding entry in `docs/_build/API_REFERENCE_LINKS.yml` @@ -189,15 +194,24 @@ If the corresponding `.py` and `.rs` snippet files both exist then each snippet Each snippet is formatted as follows: ` + # --8<-- [start:read_parquet] + import polars as pl source = "s3://bucket/*.parquet" df = pl.read_parquet(source) + # --8<-- [end:read_parquet] -` -The snippet is delimited by `--8<-- [start:]` and `--8<-- [end:]`. The snippet name must match the name given in the second argument to `code_block` above. + +`The snippet is delimited by`--8<-- [start:]`and`--8<-- [end:]`. The snippet name must match the name given in the second argument to`code_block` above. + +#### Linting + +Before committing install `dprint` (see above) and run +`dprint fmt` +from the `docs` directory to link the markdown files. ### API reference From 4308a500649e52f25d12aa96a971438b7b71fad0 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Wed, 4 Oct 2023 08:59:44 +0100 Subject: [PATCH 14/22] Fix link --- docs/user-guide/io/cloud-storage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/io/cloud-storage.md b/docs/user-guide/io/cloud-storage.md index b30006812196..ca2aa9dac045 100644 --- a/docs/user-guide/io/cloud-storage.md +++ b/docs/user-guide/io/cloud-storage.md @@ -31,7 +31,7 @@ Polars can scan a Parquet file in lazy mode from cloud storage. We may need to p This query creates a `LazyFrame` without downloading the file. In the `LazyFrame` we have access to file metadata such as the schema. Polars uses the `object_store.rs` library internally to manage the interface with the cloud storage providers and so no extra dependencies are required in Python to scan a cloud Parquet file. -If we create a lazy query with [predicate and projection pushdowns](/polars/user-guide/lazy/optimizations/) the query optimiser will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`. +If we create a lazy query with [predicate and projection pushdowns](../../lazy/optimizations/) the query optimiser will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`. 
{{code_block('user-guide/io/cloud-storage','scan_parquet_query',[])}} From 28b4fd280ae056798cb4351a19ca1bacb9505c99 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Wed, 4 Oct 2023 09:12:58 +0100 Subject: [PATCH 15/22] fix api link --- docs/_build/API_REFERENCE_LINKS.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml index d68415e84f5f..abf0b3c63d4d 100644 --- a/docs/_build/API_REFERENCE_LINKS.yml +++ b/docs/_build/API_REFERENCE_LINKS.yml @@ -12,8 +12,6 @@ python: write_csv: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_csv.html read_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_json.html write_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_json.html - read_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_parquet.html - write_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_parquet.html min: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.min.html max: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.max.html value_counts: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.value_counts.html @@ -65,6 +63,7 @@ python: write_database: name: write_database link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_database.html + read_database_uri: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_database_uri.html read_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_parquet.html write_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_parquet.html scan_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_parquet.html @@ -73,6 +72,7 @@ python: write_ndjson: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_ndjson.html write_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_json.html scan_ndjson: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_ndjson.html + scan_pyarrow_dataset: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_pyarrow_dataset.html from_arrow: name: from_arrow link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.from_arrow.html From 0a1e0b5c92edf19a5cfb0ef091401a4cb3c0a572 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Wed, 4 Oct 2023 09:15:23 +0100 Subject: [PATCH 16/22] Update links --- docs/_build/API_REFERENCE_LINKS.yml | 2 +- docs/user-guide/io/database.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml index abf0b3c63d4d..4edeee90c58d 100644 --- a/docs/_build/API_REFERENCE_LINKS.yml +++ b/docs/_build/API_REFERENCE_LINKS.yml @@ -197,7 +197,7 @@ rust: feature_flags: ['json'] read_ndjson: name: JsonLineReader - link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/ndjson_core/ndjson/struct.JsonLineReader.html + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/ndjson/core/struct.JsonLineReader.html feature_flags: ['json'] write_json: name: JsonWriter diff --git a/docs/user-guide/io/database.md b/docs/user-guide/io/database.md index 
4b18c48f85e0..a9922d8fcf76 100644 --- a/docs/user-guide/io/database.md +++ b/docs/user-guide/io/database.md @@ -48,7 +48,7 @@ $ pip install adbc-driver-sqlite As ADBC is not the default engine you must specify the engine as an argument to `pl.read_database` -{{code_block('user-guide/io/database','adbc',['read_database'])}} +{{code_block('user-guide/io/database','adbc',['read_database_uri'])}} ## Write to a database From f4d7c7f5dfa09a46363c8cc026fbbbb3754c5bc5 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Wed, 4 Oct 2023 09:23:07 +0100 Subject: [PATCH 17/22] Add API links --- docs/_build/API_REFERENCE_LINKS.yml | 5 +++++ docs/user-guide/io/cloud-storage.md | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml index 4edeee90c58d..a0c31f1cc8e3 100644 --- a/docs/_build/API_REFERENCE_LINKS.yml +++ b/docs/_build/API_REFERENCE_LINKS.yml @@ -12,6 +12,7 @@ python: write_csv: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_csv.html read_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_json.html write_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_json.html + read_ipc: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_ipc.html min: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.min.html max: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.max.html value_counts: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.value_counts.html @@ -223,6 +224,10 @@ rust: name: scan_parquet link: https://pola-rs.github.io/polars/docs/rust/dev/polars/prelude/struct.LazyFrame.html#method.scan_parquet feature_flags: ['parquet'] + read_ipc: + name: IpcReader + link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/prelude/struct.IpcReader.html + feature_flags: ['ipc'] min: https://pola-rs.github.io/polars/docs/rust/dev/polars/series/struct.Series.html#method.min max: https://pola-rs.github.io/polars/docs/rust/dev/polars/series/struct.Series.html#method.max struct: diff --git a/docs/user-guide/io/cloud-storage.md b/docs/user-guide/io/cloud-storage.md index ca2aa9dac045..69e01750c6f2 100644 --- a/docs/user-guide/io/cloud-storage.md +++ b/docs/user-guide/io/cloud-storage.md @@ -31,7 +31,7 @@ Polars can scan a Parquet file in lazy mode from cloud storage. We may need to p This query creates a `LazyFrame` without downloading the file. In the `LazyFrame` we have access to file metadata such as the schema. Polars uses the `object_store.rs` library internally to manage the interface with the cloud storage providers and so no extra dependencies are required in Python to scan a cloud Parquet file. -If we create a lazy query with [predicate and projection pushdowns](../../lazy/optimizations/) the query optimiser will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`. +If we create a lazy query with [predicate and projection pushdowns](../lazy/optimizations.md) the query optimiser will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`. 
{{code_block('user-guide/io/cloud-storage','scan_parquet_query',[])}} From e770381306756d5129295decea1aab8321f063ae Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Fri, 6 Oct 2023 16:53:13 +0100 Subject: [PATCH 18/22] better float format example --- py-polars/polars/config.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index 05fa3f0229b9..6742b0a06dcc 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -442,26 +442,33 @@ def set_fmt_float(cls, fmt: FloatFmt | None = "mixed") -> type[Config]: ---------- fmt : {"mixed", "full"} How to format floating point numbers. + Mixed limits the number of decimal places and uses + scientific notation for large/small values. Full prints the + full precision of the floating point number. Examples -------- - >>> s = pl.Series([1.2304980958725870923]) + >>> s = pl.Series([1.2304980958725870923,1e6,1e-8]) >>> with pl.Config(set_fmt_float="mixed"): ... print(s) ... - shape: (1,) + shape: (3,) Series: '' [f64] [ 1.230498 + 1e6, + 1.0000e-8 ] >>> with pl.Config(set_fmt_float="full"): ... print(s) ... - shape: (1,) + shape: (3,) Series: '' [f64] [ - 1.230498095872587 + 1.230498095872587, + 1000000, + 0.00000001 ] """ From 05e9057132578d000f3ec6a9c0b3b5476157c347 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Mon, 9 Oct 2023 20:51:05 +0100 Subject: [PATCH 19/22] update examples --- py-polars/polars/config.py | 2 +- py-polars/polars/io/csv/functions.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index 6742b0a06dcc..e41da17c695d 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -404,7 +404,7 @@ def set_auto_structify(cls, active: bool | None = False) -> type[Config]: Examples -------- - >>> df = pl.DataFrame({"v": range(10), "v2": range(10, 20)}) + >>> df = pl.DataFrame({"v": [1,2,3], "v2": [4,5,6]}) >>> with pl.Config(set_auto_structify=True): ... out = df.select(pl.all()) ... diff --git a/py-polars/polars/io/csv/functions.py b/py-polars/polars/io/csv/functions.py index f9cbba71c936..2aff00a3053f 100644 --- a/py-polars/polars/io/csv/functions.py +++ b/py-polars/polars/io/csv/functions.py @@ -134,7 +134,7 @@ def read_csv( python. Defaults to ``utf8``. low_memory Reduce memory usage at expense of performance when rechunking into - a single array. To work with larger than-memory datasets use streaming mode. + a single array by doing rechunking in serial instead of parallel. rechunk Make sure that all columns are contiguous in memory by aggregating the chunks into a single array. @@ -181,6 +181,8 @@ def read_csv( all data will be stored continuously in memory. Set `rechunk=False` if you are benchmarking the csv-reader. A `rechunk` is an expensive operation. + To work with larger than-memory CSV files use pl.scan_csv and evaluate the + query in streaming mode. 
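(Editor's sketch of the streaming pattern that note points to, not part of the patch; the file and column names are invented. The idea is to scan rather than read, then evaluate the query in batches.)

```python
import polars as pl

# Hypothetical file and columns: aggregate a CSV that does not fit in memory.
result = (
    pl.scan_csv("very_large_file.csv")
    .group_by("category")
    .agg(pl.col("value").sum())
    .collect(streaming=True)  # evaluate in batches instead of loading the whole file
)
print(result)
```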
""" _check_arg_is_1byte("separator", separator, can_be_empty=False) From 1f938b2e59413dc1e08133d9cae606e9c0e6b5be Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Mon, 9 Oct 2023 21:16:59 +0100 Subject: [PATCH 20/22] changes from sdg comments --- docs/_build/API_REFERENCE_LINKS.yml | 1 + docs/src/python/user-guide/io/cloud-storage.py | 5 +++++ docs/user-guide/io/cloud-storage.md | 1 + docs/user-guide/io/csv.md | 6 ++++-- docs/user-guide/io/database.md | 2 +- docs/user-guide/io/json_file.md | 2 +- docs/user-guide/io/parquet.md | 1 - py-polars/polars/config.py | 14 ++++---------- 8 files changed, 17 insertions(+), 15 deletions(-) diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml index a0c31f1cc8e3..553e1275c42e 100644 --- a/docs/_build/API_REFERENCE_LINKS.yml +++ b/docs/_build/API_REFERENCE_LINKS.yml @@ -13,6 +13,7 @@ python: read_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_json.html write_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_json.html read_ipc: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_ipc.html + read_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_parquet.html min: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.min.html max: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.max.html value_counts: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.value_counts.html diff --git a/docs/src/python/user-guide/io/cloud-storage.py b/docs/src/python/user-guide/io/cloud-storage.py index 0f968e15f97b..0a01ae101ba6 100644 --- a/docs/src/python/user-guide/io/cloud-storage.py +++ b/docs/src/python/user-guide/io/cloud-storage.py @@ -1,3 +1,8 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + """ # --8<-- [start:read_parquet] import polars as pl diff --git a/docs/user-guide/io/cloud-storage.md b/docs/user-guide/io/cloud-storage.md index 69e01750c6f2..863f32097e3e 100644 --- a/docs/user-guide/io/cloud-storage.md +++ b/docs/user-guide/io/cloud-storage.md @@ -3,6 +3,7 @@ Polars can read and write to AWS S3, Azure Blob Storage and Google Cloud Storage. The API is the same for all three storage providers. To read from cloud storage, additional dependencies may be needed depending on the use case and cloud storage provider: + === ":fontawesome-brands-python: Python" ```shell diff --git a/docs/user-guide/io/csv.md b/docs/user-guide/io/csv.md index dc3304b91d12..b653f36823ac 100644 --- a/docs/user-guide/io/csv.md +++ b/docs/user-guide/io/csv.md @@ -1,11 +1,13 @@ # CSV -## Read & write +## Read Reading a CSV file should look familiar: -{{code_block('user-guide/io/csv','read',['read_csv'])}} +{{code_block +('user-guide/io/csv','read',['read_csv'])}} +## Write Writing a CSV file is similar with the `write_csv` function: {{code_block('user-guide/io/csv','write',['write_csv'])}} diff --git a/docs/user-guide/io/database.md b/docs/user-guide/io/database.md index a9922d8fcf76..ea1c9167fcc9 100644 --- a/docs/user-guide/io/database.md +++ b/docs/user-guide/io/database.md @@ -38,7 +38,7 @@ $ pip install connectorx #### ADBC -ADBC (Arrow Database Connectivity) is a new engine supported by the Apache Arrow project. +ADBC (Arrow Database Connectivity) is an engine supported by the Apache Arrow project. 
ADBC aims to be both an API standard for connecting to databases and libraries implementing this standard in a range of languages. It is still early days for ADBC so support for different databases is still limited. At present drivers for ADBC are only available for [Postgres and SQLite](https://arrow.apache.org/adbc/0.1.0/driver/cpp/index.html). To install ADBC you need to install the driver for your database. For example to install the driver for SQLite you run diff --git a/docs/user-guide/io/json_file.md b/docs/user-guide/io/json_file.md index 0f4e4fae6195..530fa176b3e3 100644 --- a/docs/user-guide/io/json_file.md +++ b/docs/user-guide/io/json_file.md @@ -22,7 +22,7 @@ Polars can read an ND-JSON file into a `DataFrame` using the `read_ndjson` funct {{code_block('user-guide/io/json-file','write',['write_json','write_ndjson'])}} -## Scan NDJSON +## Scan `Polars` allows you to _scan_ a JSON input **only for newline delimited json**. Scanning delays the actual parsing of the file and instead returns a lazy computation holder called a `LazyFrame`. diff --git a/docs/user-guide/io/parquet.md b/docs/user-guide/io/parquet.md index ac071f507d56..886fb73bcb99 100644 --- a/docs/user-guide/io/parquet.md +++ b/docs/user-guide/io/parquet.md @@ -15,7 +15,6 @@ For reading a Parquet file from a cloud storage provider, see [Cloud storage](cl {{code_block('user-guide/io/parquet','write',['write_parquet'])}} -For writing a Parquet file to a cloud storage provider, see [Cloud storage](cloud-storage.md/#writing-to-cloud-storage). ## Scan diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index e41da17c695d..b4a17a35aac9 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -409,21 +409,15 @@ def set_auto_structify(cls, active: bool | None = False) -> type[Config]: ... out = df.select(pl.all()) ... >>> out - shape: (10, 1) + shape: (3, 1) ┌───────────┐ │ v │ │ --- │ │ struct[2] │ ╞═══════════╡ - │ {0,10} │ - │ {1,11} │ - │ {2,12} │ - │ {3,13} │ - │ … │ - │ {6,16} │ - │ {7,17} │ - │ {8,18} │ - │ {9,19} │ + │ {1,4} │ + │ {2,5} │ + │ {3,6} │ └───────────┘ """ From cb2ba8a223e3bc6d696c98b466255b2952ee9175 Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Mon, 9 Oct 2023 21:20:46 +0100 Subject: [PATCH 21/22] linting --- docs/user-guide/io/csv.md | 1 + docs/user-guide/io/parquet.md | 1 - py-polars/polars/config.py | 4 ++-- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user-guide/io/csv.md b/docs/user-guide/io/csv.md index b653f36823ac..52cd2be8e7e6 100644 --- a/docs/user-guide/io/csv.md +++ b/docs/user-guide/io/csv.md @@ -8,6 +8,7 @@ Reading a CSV file should look familiar: ('user-guide/io/csv','read',['read_csv'])}} ## Write + Writing a CSV file is similar with the `write_csv` function: {{code_block('user-guide/io/csv','write',['write_csv'])}} diff --git a/docs/user-guide/io/parquet.md b/docs/user-guide/io/parquet.md index 886fb73bcb99..478b607cb8fe 100644 --- a/docs/user-guide/io/parquet.md +++ b/docs/user-guide/io/parquet.md @@ -15,7 +15,6 @@ For reading a Parquet file from a cloud storage provider, see [Cloud storage](cl {{code_block('user-guide/io/parquet','write',['write_parquet'])}} - ## Scan `Polars` allows you to _scan_ a `Parquet` input. Scanning delays the actual parsing of the file and instead returns a lazy computation holder called a `LazyFrame`. 
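(Editor's sketch of that laziness, not part of the patch; the path and column names are made up. Creating the `LazyFrame` touches at most the Parquet metadata, and the data itself is parsed only when `collect` runs.)

```python
import polars as pl

# Hypothetical path and columns.
lf = pl.scan_parquet("docs/data/path.parquet")

print(lf.schema)  # resolved from the file metadata, no full read needed

df = (
    lf.filter(pl.col("id") < 100)
    .select(["id", "name"])
    .collect()  # the Parquet data is only parsed at this point
)
```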
diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index b4a17a35aac9..2cad649e61b1 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -404,7 +404,7 @@ def set_auto_structify(cls, active: bool | None = False) -> type[Config]: Examples -------- - >>> df = pl.DataFrame({"v": [1,2,3], "v2": [4,5,6]}) + >>> df = pl.DataFrame({"v": [1, 2, 3], "v2": [4, 5, 6]}) >>> with pl.Config(set_auto_structify=True): ... out = df.select(pl.all()) ... @@ -442,7 +442,7 @@ def set_fmt_float(cls, fmt: FloatFmt | None = "mixed") -> type[Config]: Examples -------- - >>> s = pl.Series([1.2304980958725870923,1e6,1e-8]) + >>> s = pl.Series([1.2304980958725870923, 1e6, 1e-8]) >>> with pl.Config(set_fmt_float="mixed"): ... print(s) ... From c4e1b2dc700dbbded048e7af65db0b0bb7f401cb Mon Sep 17 00:00:00 2001 From: Liam Brannigan Date: Mon, 9 Oct 2023 21:53:32 +0100 Subject: [PATCH 22/22] remove commas --- py-polars/polars/config.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py index 2cad649e61b1..9dad6f8dc4ab 100644 --- a/py-polars/polars/config.py +++ b/py-polars/polars/config.py @@ -449,9 +449,9 @@ def set_fmt_float(cls, fmt: FloatFmt | None = "mixed") -> type[Config]: shape: (3,) Series: '' [f64] [ - 1.230498 - 1e6, - 1.0000e-8 + 1.230498 + 1e6 + 1.0000e-8 ] >>> with pl.Config(set_fmt_float="full"): @@ -460,9 +460,9 @@ def set_fmt_float(cls, fmt: FloatFmt | None = "mixed") -> type[Config]: shape: (3,) Series: '' [f64] [ - 1.230498095872587, - 1000000, - 0.00000001 + 1.230498095872587 + 1000000 + 0.00000001 ] """
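(Closing editor's note on the formatting examples above, not part of the patch: the option can be applied either temporarily through the `Config` context manager, which restores the previous value on exit so the change does not leak outside the block, or globally until the configuration is reset.)

```python
import polars as pl

s = pl.Series([1.2304980958725870923, 1e6, 1e-8])

# Scoped: "full" formatting applies only inside the block.
with pl.Config(set_fmt_float="full"):
    print(s)

# Global: stays in effect until defaults are restored.
pl.Config.set_fmt_float("full")
print(s)
pl.Config.restore_defaults()
```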