
docs(python): Various improvements to docstrings and user guide #10981

Status: Closed

Commits (37)
94621ff
Add some streaming related docstrings
Sep 7, 2023
a03b7c0
Fix lints
Sep 7, 2023
340fca2
update sink strings
Sep 13, 2023
07a4dd4
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Sep 13, 2023
ac2707c
add config example
Sep 17, 2023
c7251fc
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Sep 17, 2023
abe00e1
Add structify example
Sep 22, 2023
52999b9
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Sep 22, 2023
489cd72
checking tests
Sep 25, 2023
0ad6fcd
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Sep 25, 2023
44ff2fb
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Sep 27, 2023
c88b4fd
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Sep 27, 2023
9e4813c
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Sep 27, 2023
2838d33
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Sep 27, 2023
e9b567a
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Sep 28, 2023
eda8aa8
Add scan parquet options
Sep 28, 2023
c56e2b9
update docs
Sep 29, 2023
44a9d92
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Sep 29, 2023
ced5b3f
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Oct 2, 2023
620a5fb
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Oct 3, 2023
f9044cd
Update user guide docs for IO
Oct 3, 2023
020a217
add python JSON
Oct 3, 2023
d5a49a5
Update contributing for user guide
Oct 4, 2023
a9d99f8
Run dprint
Oct 4, 2023
4ea1b87
Run dprint
Oct 4, 2023
4308a50
Fix link
Oct 4, 2023
28b4fd2
fix api link
Oct 4, 2023
0a1e0b5
Update links
Oct 4, 2023
f4d7c7f
Add API links
Oct 4, 2023
495ff0e
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Oct 5, 2023
1a8374e
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Oct 6, 2023
e770381
better float format example
Oct 6, 2023
05e9057
update examples
Oct 9, 2023
0661da2
Merge branch 'main' of github.com:pola-rs/polars into low-memory-docs…
Oct 9, 2023
1f938b2
changes from sdg comments
Oct 9, 2023
cb2ba8a
linting
Oct 9, 2023
c4e1b2d
remove commas
Oct 9, 2023
9 changes: 6 additions & 3 deletions py-polars/polars/io/csv/functions.py
@@ -133,7 +133,8 @@ def read_csv(
``utf8-lossy``, the input is first decoded in memory with
python. Defaults to ``utf8``.
low_memory
Reduce memory usage at expense of performance.
Reduce memory usage at the expense of performance when rechunking into
a single array. To work with larger-than-memory datasets, use streaming mode.
rechunk
Make sure that all columns are contiguous in memory by
aggregating the chunks into a single array.
@@ -502,7 +503,8 @@ def read_csv_batched(
``utf8-lossy``, the input is first decoded in memory with
python. Defaults to ``utf8``.
low_memory
Reduce memory usage at expense of performance.
Reduce memory usage at the expense of performance when rechunking into
a single array. To work with larger-than-memory datasets, use streaming mode.
rechunk
Make sure that all columns are contiguous in memory by
aggregating the chunks into a single array.
@@ -781,7 +783,8 @@ def scan_csv(
Lossy means that invalid utf8 values are replaced with ``�``
characters. Defaults to "utf8".
low_memory
Reduce memory usage in expense of performance.
Reduce memory usage at the expense of performance when rechunking into
a single array. To work with larger-than-memory datasets, use streaming mode.
rechunk
Reallocate to contiguous memory when all chunks/ files are parsed.
skip_rows_after_header
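For context, a minimal sketch of the pattern these docstrings point to; the file path and column name are hypothetical:

```python
import polars as pl

# Build a lazy query; low_memory trades some speed for lower memory use.
lf = pl.scan_csv("data.csv", low_memory=True)

# For larger-than-memory inputs, collect in streaming mode so the query
# is processed in batches instead of as a single in-memory table.
df = lf.filter(pl.col("value") > 0).collect(streaming=True)
```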
6 changes: 4 additions & 2 deletions py-polars/polars/io/parquet/functions.py
@@ -80,7 +80,8 @@ def read_parquet(
row_count_offset
Offset to start the row_count column (only use if the name is set).
low_memory
Reduce memory pressure at the expense of performance.
Reduce memory usage at the expense of performance when rechunking into
a single array. To work with larger-than-memory datasets, use streaming mode.
pyarrow_options
Keyword arguments for `pyarrow.parquet.read_table
<https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html>`_.
@@ -215,7 +216,8 @@ def scan_parquet(
particular storage connection.
e.g. host, port, username, password, etc.
low_memory
Reduce memory pressure at the expense of performance.
Reduce memory usage at the expense of performance when rechunking into
a single array. To work with larger-than-memory datasets, use streaming mode.
use_statistics
Use statistics in the parquet to determine if pages
can be skipped from reading.
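The same pattern applies to Parquet sources; again a sketch with hypothetical paths:

```python
import polars as pl

# scan_parquet accepts glob patterns and also exposes low_memory.
lf = pl.scan_parquet("data/*.parquet", low_memory=True)

# Streaming collect processes the scan in batches.
df = lf.select(pl.col("amount").sum()).collect(streaming=True)
```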
49 changes: 43 additions & 6 deletions py-polars/polars/lazyframe/frame.py
@@ -1624,11 +1624,30 @@ def collect(
**kwargs: Any,
) -> DataFrame:
"""
Collect into a DataFrame.
Collect a LazyFrame into a DataFrame.

Note: use :func:`fetch` if you want to run your query on the first `n` rows
Use :func:`fetch` if you want to run your query on the first `n` rows
only. This can be a huge time saver when debugging queries.

By default, all query optimizations are applied. Use the arguments of
``collect`` to turn off particular optimizations.

If ``streaming`` is False, the entire query is processed in a single batch.
If ``streaming`` is True, Polars tries to process the query in batches so
that larger-than-memory datasets can be handled. Use :func:`explain` to
see if Polars can process the query in streaming mode.
Use :func:`polars.set_streaming_chunk_size` to set the size of the
batches.

See Also
--------
polars.collect_all : Collect multiple LazyFrames at the same time.
polars.collect_all_async : Collect multiple LazyFrames at the same time asynchronously.
polars.explain : Print the query plan that is evaluated with collect.
polars.set_streaming_chunk_size : Set the size of streaming batches.
profile : Collect the LazyFrame and time each node in the computation graph.


Parameters
----------
type_coercion
Expand Down Expand Up @@ -1677,6 +1696,24 @@ def collect(
│ c ┆ 6 ┆ 1 │
└─────┴─────┴─────┘

Collect in streaming mode

>>> (
... lf.group_by("a", maintain_order=True)
... .agg(pl.all().sum())
... .collect(streaming=True)
... )
shape: (3, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ a ┆ 4 ┆ 10 │
│ b ┆ 11 ┆ 10 │
│ c ┆ 6 ┆ 1 │
└─────┴─────┴─────┘

"""
eager = kwargs.get("eager", False)
if no_optimization or eager:
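A sketch of the workflow the new docstring describes: check the plan with explain, optionally tune the batch size, then collect. The data and chunk size here are illustrative only:

```python
import polars as pl

lf = pl.LazyFrame({"a": ["x", "x", "y"], "b": [1, 2, 3]})
query = lf.group_by("a").agg(pl.col("b").sum())

# Sections of the plan that can run in streaming mode are marked as such.
print(query.explain(streaming=True))

# Optionally set the size of the streaming batches before collecting.
pl.set_streaming_chunk_size(50_000)
df = query.collect(streaming=True)
```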
@@ -1830,7 +1867,7 @@ def sink_parquet(
slice_pushdown: bool = True,
) -> DataFrame:
"""
Persists a LazyFrame at the provided path.
Collect and write a LazyFrame in streaming mode to a Parquet file at the given path.

This allows streaming results that are larger than RAM to be written to disk.
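A sketch of what the reworded summary means in practice: the query runs on the streaming engine and the result is written to disk without materializing a full DataFrame. Paths and the column name are hypothetical:

```python
import polars as pl

# sink_parquet executes the lazy query with the streaming engine and
# writes batches straight to the Parquet file.
pl.scan_csv("big_input.csv").filter(pl.col("amount") > 0).sink_parquet(
    "output.parquet"
)
```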

Expand Down Expand Up @@ -1926,7 +1963,7 @@ def sink_ipc(
slice_pushdown: bool = True,
) -> DataFrame:
"""
Persists a LazyFrame at the provided path.
Collect and write a LazyFrame in streaming mode to an IPC file at the given path.

This allows streaming results that are larger than RAM to be written to disk.

@@ -2009,7 +2046,7 @@ def sink_csv(
slice_pushdown: bool = True,
) -> DataFrame:
"""
Persists a LazyFrame at the provided path.
Collect and write a LazyFrame in streaming mode to a CSV file at the given path.

This allows streaming results that are larger than RAM to be written to disk.
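And the CSV counterpart, again with hypothetical paths and column names:

```python
import polars as pl

# Stream a larger-than-RAM result straight to a CSV file on disk.
pl.scan_parquet("events/*.parquet").select(["user_id", "ts"]).sink_csv(
    "events.csv"
)
```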

@@ -2629,7 +2666,7 @@ def group_by(
maintain_order
Ensure that the order of the groups is consistent with the input data.
This is slower than a default group by.
Settings this to ``True`` blocks the possibility
Setting this to ``True`` blocks the possibility
to run on the streaming engine.

Examples
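To illustrate the corrected note: with maintain_order=True the group-by cannot run on the streaming engine, which explain(streaming=True) makes visible in the plan. Toy data, illustrative only:

```python
import polars as pl

lf = pl.LazyFrame({"a": ["x", "y", "x"], "b": [1, 2, 3]})

# maintain_order=True keeps group order stable but blocks streaming,
# so the plan below should not contain a streaming section.
q = lf.group_by("a", maintain_order=True).agg(pl.col("b").sum())
print(q.explain(streaming=True))
```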