GH-28994: [C++][JSON] Change the max rows to Unlimited(int_32) (#38582)
### Rationale for this change
Remove the cap on the number of rows parsed per JSON block. Raise a `Row count overflowed int32_t` error if the parse loop reaches `std::numeric_limits<int32_t>::max()` iterations.

See issue: #28994
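For context, a minimal repro sketch (not part of this PR) of the scenario being fixed, assuming a pyarrow build that includes this change; previously, a single block containing more than `kMaxParserNumRows` (100000) rows failed with `Exceeded maximum rows`:

```python
import io

import pyarrow.json as paj

# ~2 MB of newline-delimited JSON: 100100 rows, comfortably inside one block
data = b"".join(b'{"a": %d, "b": "x"}\n' % i for i in range(100100))

# Use a 10 MB block size so all rows land in a single parse block
opts = paj.ReadOptions(block_size=10 * 1024 * 1024)
table = paj.read_json(io.BytesIO(data), read_options=opts)
assert table.num_rows == 100100
```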

### What changes are included in this PR?
Delete the `kMaxParserNumRows` constant, with minor C++ code modifications and new Python test code.

### Are these changes tested? Yes
A new test parses a large JSON input (100100 rows).

### Are there any user-facing changes? No

* Closes: #28994

Lead-authored-by: zhipeng <[email protected]>
Co-authored-by: zhipeng <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
Ox0400 authored Nov 27, 2023
1 parent 62e1e9a commit ca46557
Showing 5 changed files with 12 additions and 8 deletions.
2 changes: 1 addition & 1 deletion cpp/src/arrow/dataset/file_json_test.cc
@@ -245,7 +245,7 @@ class JsonScanMixin {

// Use a reduced number of rows in valgrind to avoid timeouts.
#ifndef ARROW_VALGRIND
-constexpr static int64_t kTestMaxNumRows = json::kMaxParserNumRows;
+constexpr static int64_t kTestMaxNumRows = (1UL << 17);
#else
constexpr static int64_t kTestMaxNumRows = 1024;
#endif
6 changes: 3 additions & 3 deletions cpp/src/arrow/json/parser.cc
@@ -769,8 +769,8 @@ class HandlerBase : public BlockParser,
rj::kParseNumbersAsStringsFlag;

rj::Reader reader;

-for (; num_rows_ < kMaxParserNumRows; ++num_rows_) {
+// ensure that the loop can exit when the block is too large.
+for (; num_rows_ < std::numeric_limits<int32_t>::max(); ++num_rows_) {
auto ok = reader.Parse<parse_flags>(json, handler);
switch (ok.Code()) {
case rj::kParseErrorNone:
@@ -790,7 +790,7 @@
return ParseError(rj::GetParseError_En(ok.Code()), " in row ", num_rows_);
}
}
return Status::Invalid("Exceeded maximum rows");
return Status::Invalid("Row count overflowed int32_t");
}

template <typename Handler>
2 changes: 0 additions & 2 deletions cpp/src/arrow/json/parser.h
@@ -56,8 +56,6 @@ struct Kind {
static Status ForType(const DataType& type, Kind::type* kind);
};

-constexpr int32_t kMaxParserNumRows = 100000;

/// \class BlockParser
/// \brief A reusable block-based parser for JSON data
///
2 changes: 0 additions & 2 deletions cpp/src/arrow/json/parser_benchmark.cc
@@ -200,8 +200,6 @@ static void ParseJSONFields(benchmark::State& state) { // NOLINT non-const reference
int32_t num_rows = static_cast<int32_t>(2e4 / (1.0 - sparsity) / num_fields);
// ... however, we want enough rows to make setup/finish overhead negligible
num_rows = std::max<int32_t>(num_rows, 200);
-// ... and also we want to avoid an "Exceeded maximum rows" error.
-num_rows = std::min<int32_t>(num_rows, kMaxParserNumRows);
// In the end, we will empirically generate between 400 kB and 4 MB of JSON data.

auto fields = GenerateTestFields(num_fields, 10);
8 changes: 8 additions & 0 deletions python/pyarrow/tests/test_json.py
@@ -304,6 +304,14 @@ def test_small_random_json(self):
assert table.equals(expected)
assert table.to_pydict() == expected.to_pydict()

+def test_load_large_json(self):
+    data, expected = make_random_json(num_cols=2, num_rows=100100)
+    # set the block size to 10 MB
+    read_options = ReadOptions(block_size=1024*1024*10)
+    table = self.read_bytes(data, read_options=read_options)
+    assert table.num_rows == 100100
+    assert expected.num_rows == 100100

def test_stress_block_sizes(self):
    # Test a number of small block sizes to stress block stitching
    data_base, expected = make_random_json(num_cols=2, num_rows=100)
