apacheGH-18487: [R] Read Text (CSV/JSON) from character vector (apache#33968)

### Rationale for this change

Allows literal strings to be read directly through the `I()` function in the same way as the `readr::read_csv()` function. This is useful for checking behavior without the need to create temporary files.

```r
> read_csv_arrow(I("x,y\n1,2\n3,4"))
# A tibble: 2 × 2
      x     y
  <int> <int>
1     1     2
2     3     4
```

```r
> read_csv_arrow(I(c(
  "x,y
1,2
3,4"
)))
# A tibble: 2 × 2
      x     y
  <int> <int>
1     1     2
2     3     4
```

```r
> read_csv_arrow(I(c("x,y", "1,2", "3,4")))
# A tibble: 2 × 2
      x     y
  <int> <int>
1     1     2
2     3     4
```

### What changes are included in this PR?

In `read_csv_arrow` and `read_json_arrow`, if the first argument `file` inherits from the `AsIs` class, `file` is now interpreted as literal data. This is consistent with the behavior of `readr::read_csv()`, which is widely used to read text files as data frames.

This is a breaking change; the behavior of wrapping a path as a string with `I()` is changed. For example:

#### readr::read_csv

```r
> readr::read_csv(I(readr::readr_example("mtcars.csv")))
Rows: 0 Columns: 1
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): /usr/local/lib/R/site-library/readr/extdata/mtcars.csv

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# A tibble: 0 × 1
# … with 1 variable:
#   /usr/local/lib/R/site-library/readr/extdata/mtcars.csv <chr>
# ℹ Use `colnames()` to see all variable names
```

#### arrow 10.0.1's arrow::read_csv_arrow

```r
> arrow::read_csv_arrow(I(readr::readr_example("mtcars.csv")))
# A tibble: 32 × 11
     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
   <dbl> <int> <dbl> <int> <dbl> <dbl> <dbl> <int> <int> <int> <int>
 1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
 2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
 3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
 4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
 5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
 6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
 7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
 8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
 9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
# … with 22 more rows
# ℹ Use `print(n = ...)` to see more rows
```

#### This PR's arrow::read_csv_arrow

```r
> arrow::read_csv_arrow(I(readr::readr_example("mtcars.csv")))
Error:
! Invalid: CSV parse error: Empty CSV file or block: cannot infer number of columns
Run `rlang::last_error()` to see where the error occurred.
```

* Closes: apache#18487

Lead-authored-by: SHIMA Tatsuya <[email protected]>
Co-authored-by: eitsupi <[email protected]>
Co-authored-by: Nic Crane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
raulcd · Feb 6, 2023 · 0074a66 · 0074a66
1 parent 32c7130
commit 0074a66
Show file tree

Hide file tree

Showing 6 changed files with 76 additions and 13 deletions.
diff --git a/r/R/csv.R b/r/R/csv.R
@@ -77,12 +77,15 @@
 #' `col_names`, and the CSV file has a header row that would otherwise be used
 #' to identify column names, you'll need to add `skip = 1` to skip that row.
 #'
-#' @param file A character file name or URI, `raw` vector, an Arrow input stream,
-#' or a `FileSystem` with path (`SubTreeFileSystem`).
+#' @param file A character file name or URI, literal data (either a character vector, whose elements are joined with newlines, or a [raw] vector),
+#' an Arrow input stream, or a `FileSystem` with path (`SubTreeFileSystem`).
+#'
 #' If a file name, a memory-mapped Arrow [InputStream] will be opened and
 #' closed when finished; compression will be detected from the file extension
 #' and handled automatically. If an input stream is provided, it will be left
 #' open.
+#'
+#' To be recognised as literal data, character input must be wrapped with `I()`.
 #' @param delim Single character used to separate fields within a record.
 #' @param quote Single character used to quote strings.
 #' @param escape_double Does the file escape quotes by doubling them?
@@ -154,6 +157,10 @@
 #'   tf,
 #'   col_types = schema(x = timestamp(unit = "us", timezone = "UTC"))
 #' )
+#'
+#' # Read directly from strings with `I()`
+#' read_csv_arrow(I("x,y\n1,2\n3,4"))
+#' read_delim_arrow(I(c("x y", "1 2", "3 4")), delim = " ")
 read_delim_arrow <- function(file,
                              delim = ",",
                              quote = '"',
@@ -198,6 +205,15 @@ read_delim_arrow <- function(file,
     )
   }
 
+  if (inherits(file, "AsIs")) {
+    if (is.raw(file)) {
+      # If a raw vector is wrapped by `I()`, we need to unclass the `AsIs` class to read the raw vector.
+      file <- unclass(file)
+    } else {
+      file <- charToRaw(paste(file, collapse = "\n"))
+    }
+  }
+
   if (!inherits(file, "InputStream")) {
     compression <- detect_compression(file)
     file <- make_readable_file(file)

diff --git a/r/R/json.R b/r/R/json.R
@@ -48,12 +48,24 @@
 #'     { "hello": 3.25, "world": null }
 #'     { "hello": 0.0, "world": true, "yo": null }
 #'   ', tf, useBytes = TRUE)
+#'
 #' read_json_arrow(tf)
+#'
+#' # Read directly from strings with `I()`
+#' read_json_arrow(I(c('{"x": 1, "y": 2}', '{"x": 3, "y": 4}')))
 read_json_arrow <- function(file,
                             col_select = NULL,
                             as_data_frame = TRUE,
                             schema = NULL,
                             ...) {
+  if (inherits(file, "AsIs")) {
+    if (is.raw(file)) {
+      file <- unclass(file)
+    } else {
+      file <- charToRaw(paste(file, collapse = "\n"))
+    }
+  }
+
   if (!inherits(file, "InputStream")) {
     compression <- detect_compression(file)
     file <- make_readable_file(file)

diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd
diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd
diff --git a/r/tests/testthat/test-csv.R b/r/tests/testthat/test-csv.R
@@ -687,3 +687,14 @@ test_that("CSV reading/parsing/convert options can be passed in as lists", {
 
   expect_equal(tab1, tab2)
 })
+
+test_that("Read literal data directly", {
+  expected <- tibble::tibble(x = c(1L, 3L), y = c(2L, 4L))
+
+  expect_identical(read_csv_arrow(I("x,y\n1,2\n3,4")), expected)
+  expect_identical(read_csv_arrow(I("x,y\r1,2\r3,4")), expected)
+  expect_identical(read_csv_arrow(I("x,y\n\r1,2\n\r3,4")), expected)
+  expect_identical(read_csv_arrow(charToRaw("x,y\n1,2\n3,4")), expected)
+  expect_identical(read_csv_arrow(I(charToRaw("x,y\n1,2\n3,4"))), expected)
+  expect_identical(read_csv_arrow(I(c("x,y", "1,2", "3,4"))), expected)
+})
diff --git a/r/tests/testthat/test-json.R b/r/tests/testthat/test-json.R
@@ -253,3 +253,12 @@ test_that("Can read json file with list<struct<T...>> nested columns (ARROW-7740
   expected <- tibble::tibble(a = c(list(one), list(one)))
   expect_equal(read_json_arrow(tf), expected, ignore_attr = TRUE)
 })
+
+test_that("Read literal data directly", {
+  expected <- tibble::tibble(x = c(1L, 3L), y = c(2L, 4L))
+
+  expect_identical(read_json_arrow(I('{"x": 1, "y": 2}\n{"x": 3, "y": 4}')), expected)
+  expect_identical(read_json_arrow(charToRaw('{"x": 1, "y": 2}\n{"x": 3, "y": 4}')), expected)
+  expect_identical(read_json_arrow(I(charToRaw('{"x": 1, "y": 2}\n{"x": 3, "y": 4}'))), expected)
+  expect_identical(read_json_arrow(I(c('{"x": 1, "y": 2}', '{"x": 3, "y": 4}'))), expected)
+})