Fix NPE when reading a dataset with a nested array (#151)
* Fix NPE when reading a dataset with a nested array

* Resolve comments

* Fix test

* Add tests
770120041 authored Dec 5, 2023
1 parent 563e7d9 commit 59b5423
Showing 4 changed files with 352 additions and 2 deletions.
2 changes: 1 addition & 1 deletion core/src/main/java/org/apache/iceberg/SchemaParser.java
@@ -303,7 +303,7 @@ private static String defaultValueToJsonString(byte[] bytes) {

  private static String defaultValueToJsonString(Object value) {
    try {
-     return JsonUtil.mapper().writeValueAsString(value);
+     return JsonUtil.mapper().writeValueAsString(AvroSchemaUtil.convertToJavaDefaultValue(value));
    } catch (JsonProcessingException e) {
      throw new RuntimeException(e);
    }
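For context on the one-line change above, here is a minimal, hypothetical sketch of the serialization problem being fixed. It assumes, as the name suggests, that AvroSchemaUtil.convertToJavaDefaultValue maps Avro's JsonProperties.NULL_VALUE marker back to a plain Java null; the class and conversion shown are illustrative, not taken from the patch.

import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.avro.JsonProperties;

public class DefaultValueJsonSketch {
  public static void main(String[] args) throws Exception {
    // A default value as the schema conversion may store it,
    // with Avro's null marker in place of a plain null.
    Map<String, Object> rawDefault = new LinkedHashMap<>();
    rawDefault.put("InnerField1Param", JsonProperties.NULL_VALUE);

    // Jackson does not know how to render Avro's JsonProperties.Null marker
    // as a JSON null, so serializing rawDefault directly would fail;
    // converting the marker back to null (as convertToJavaDefaultValue is
    // assumed to do) fixes that.
    Map<String, Object> converted = new LinkedHashMap<>();
    rawDefault.forEach((k, v) ->
        converted.put(k, v == JsonProperties.NULL_VALUE ? null : v));

    // Prints {"InnerField1Param":null}
    System.out.println(new ObjectMapper().writeValueAsString(converted));
  }
}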
37 changes: 36 additions & 1 deletion core/src/main/java/org/apache/iceberg/avro/TypeToSchema.java
@@ -19,6 +19,8 @@

package org.apache.iceberg.avro;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Deque;
import java.util.List;
import java.util.Map;
@@ -103,7 +105,8 @@ public Schema struct(Types.StructType struct, List<Schema> fieldSchemas) {
      String fieldName = isValidFieldName ? origFieldName : AvroSchemaUtil.sanitize(origFieldName);
      Object defaultValue = structField.hasDefaultValue() ? structField.getDefaultValue() :
          (structField.isOptional() ? JsonProperties.NULL_VALUE : null);
-     Schema.Field field = new Schema.Field(fieldName, fieldSchemas.get(i), structField.doc(), defaultValue);
+     Schema.Field field = new Schema.Field(fieldName, fieldSchemas.get(i), structField.doc(),
+         convertComplexNullToJsonNull(defaultValue));
      if (!isValidFieldName) {
        field.addProp(AvroSchemaUtil.ICEBERG_FIELD_NAME_PROP, origFieldName);
      }
@@ -232,4 +235,36 @@ public Schema primitive(Type.PrimitiveType primitive) {

    return primitiveSchema;
  }

  // Ensures that all nested nulls are converted to JsonProperties.NULL_VALUE
  // so that JacksonUtils.toJsonNode() can serialize the default value properly.
  private Object convertComplexNullToJsonNull(Object defaultValue) {
    if (defaultValue instanceof Map) {
      // Maps are fixed up in place, recursing into nested maps and collections.
      for (Map.Entry<Object, Object> entry : ((Map<Object, Object>) defaultValue).entrySet()) {
        if (entry.getValue() instanceof Map || entry.getValue() instanceof Collection) {
          entry.setValue(convertComplexNullToJsonNull(entry.getValue()));
        } else if (entry.getValue() == null) {
          entry.setValue(JsonProperties.NULL_VALUE);
        }
      }
      return defaultValue;
    } else if (defaultValue instanceof List) {
      // Lists are rebuilt into a new ArrayList rather than mutated in place.
      List<Object> originalList = (List<Object>) defaultValue;
      List<Object> copiedList = new ArrayList<>();
      for (Object element : originalList) {
        if (element instanceof Map || element instanceof Collection) {
          copiedList.add(convertComplexNullToJsonNull(element));
        } else if (element == null) {
          copiedList.add(JsonProperties.NULL_VALUE);
        } else {
          copiedList.add(element);
        }
      }
      return copiedList;
    }
    return defaultValue;
  }
}
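To make the helper's intended behavior concrete, here is a standalone sketch of the same null-to-marker conversion (illustrative only; it mirrors, rather than calls, the private method above).

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.avro.JsonProperties;

public class ConvertNullSketch {
  public static void main(String[] args) {
    // A nested default value containing plain Java nulls.
    Map<String, Object> defaultValue = new HashMap<>();
    defaultValue.put("InnerField1Param", null);
    List<Object> inner = new ArrayList<>();
    inner.add(null);
    defaultValue.put("nestedList", inner);

    Object converted = convert(defaultValue);

    // Every nested null is now the NULL_VALUE marker, which Avro's
    // JacksonUtils.toJsonNode() knows how to serialize. Prints: true true
    Map<?, ?> result = (Map<?, ?>) converted;
    System.out.println(result.get("InnerField1Param") == JsonProperties.NULL_VALUE);
    System.out.println(((List<?>) result.get("nestedList")).get(0) == JsonProperties.NULL_VALUE);
  }

  @SuppressWarnings("unchecked")
  private static Object convert(Object value) {
    if (value instanceof Map) {
      // Replace null map values in place; recurse into nested containers.
      ((Map<Object, Object>) value).replaceAll(
          (k, v) -> v == null ? JsonProperties.NULL_VALUE : convert(v));
      return value;
    } else if (value instanceof List) {
      // Rebuild lists so fixed-size lists (e.g. Arrays.asList) are handled.
      List<Object> copy = new ArrayList<>();
      for (Object element : (List<Object>) value) {
        copy.add(element == null ? JsonProperties.NULL_VALUE : convert(element));
      }
      return copy;
    }
    return value;
  }
}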
101 changes: 101 additions & 0 deletions core/src/test/java/org/apache/iceberg/avro/TestSchemaConversions.java
@@ -516,4 +516,105 @@ public void testVariousTypesDefaultValues() {
    Assert.assertTrue(IntStream.range(0, roundTripiSchema.columns().size())
        .allMatch(i -> roundTripiSchema.columns().get(i).equals(iSchema.columns().get(i))));
  }

  @Test
  public void testConversionOfRecordWithNestedSubElement() {
    String schemaString = "{\n" +
        "  \"type\": \"record\",\n" +
        "  \"name\": \"Root\",\n" +
        "  \"fields\": [\n" +
        "    {\n" +
        "      \"name\": \"OuterRecord1\",\n" +
        "      \"type\": [\n" +
        "        \"null\",\n" +
        "        {\n" +
        "          \"type\": \"record\",\n" +
        "          \"name\": \"InnerElement\",\n" +
        "          \"fields\": [\n" +
        "            {\n" +
        "              \"name\": \"InnerField\",\n" +
        "              \"type\": {\n" +
        "                \"type\": \"record\",\n" +
        "                \"name\": \"InnerField1\",\n" +
        "                \"fields\": [\n" +
        "                  {\n" +
        "                    \"name\": \"InnerField1Param\",\n" +
        "                    \"type\": [\n" +
        "                      \"null\",\n" +
        "                      \"string\"\n" +
        "                    ],\n" +
        "                    \"default\": null\n" +
        "                  }\n" +
        "                ]\n" +
        "              },\n" +
        "              \"default\": {\n" +
        "                \"InnerField1Param\": null\n" +
        "              }\n" +
        "            }\n" +
        "          ]\n" +
        "        }\n" +
        "      ],\n" +
        "      \"default\": null\n" +
        "    },\n" +
        "    {\n" +
        "      \"name\": \"InnerElementV2\",\n" +
        "      \"type\": [\n" +
        "        \"null\",\n" +
        "        {\n" +
        "          \"type\": \"record\",\n" +
        "          \"name\": \"InnerElementV2\",\n" +
        "          \"fields\": [\n" +
        "            {\n" +
        "              \"name\": \"InnerField2\",\n" +
        "              \"type\": {\n" +
        "                \"type\": \"array\",\n" +
        "                \"items\": \"InnerElement\"\n" +
        "              },\n" +
        "              \"default\": []\n" +
        "            }\n" +
        "          ]\n" +
        "        }\n" +
        "      ],\n" +
        "      \"default\": null\n" +
        "    }\n" +
        "  ]\n" +
        "}";
    Schema schema = new Schema.Parser().parse(schemaString);
    org.apache.iceberg.Schema iSchema = AvroSchemaUtil.toIceberg(schema);
    String jSchema = SchemaParser.toJson(iSchema);
    // The round trip should complete without throwing.
    org.apache.iceberg.Schema roundTripiSchema = SchemaParser.fromJson(jSchema);
  }

  @Test
  public void testConversionOfRecordWithNestedSubElementWithNotNullDefaultValue() {
    String schemaString = "{\n" +
        "  \"type\": \"record\",\n" +
        "  \"name\": \"OuterRecord\",\n" +
        "  \"fields\": [\n" +
        "    {\n" +
        "      \"name\": \"nestedRecord\",\n" +
        "      \"type\": {\n" +
        "        \"type\": \"record\",\n" +
        "        \"name\": \"InnerRecord\",\n" +
        "        \"fields\": [\n" +
        "          {\n" +
        "            \"name\": \"myArray\",\n" +
        "            \"type\": {\n" +
        "              \"type\": \"array\",\n" +
        "              \"items\": \"int\"\n" +
        "            },\n" +
        "            \"default\": [1, 2, 3]\n" +
        "          }\n" +
        "        ],\n" +
        "        \"default\": {\"myArray\": [1, 2, 3]}\n" +
        "      },\n" +
        "      \"default\": {\"myArray\": [1, 2, 3]}\n" +
        "    }\n" +
        "  ],\n" +
        "  \"default\": {\"nestedRecord\": {\"myArray\": [1, 2, 3]}}\n" +
        "}";
    Schema schema = new Schema.Parser().parse(schemaString);
    org.apache.iceberg.Schema iSchema = AvroSchemaUtil.toIceberg(schema);
    String jSchema = SchemaParser.toJson(iSchema);
    // The round trip should complete without throwing.
    org.apache.iceberg.Schema roundTripiSchema = SchemaParser.fromJson(jSchema);
  }
}
@@ -198,5 +198,219 @@ public void testAvroDefaultValues() throws IOException {

    }
  }

  /*
   * Test nested array with default null on complex types.
   * If the table contains a non-primitive Avro type (InnerElement in the test
   * below) as the first field and an array of InnerElement as the second field,
   * operating on the table used to throw a NullPointerException.
   */
  @Test
  public void testNestedArrayWithDefaultNullOnComplexTypes() throws IOException {
    File testFile = temp.newFile();
    String writeSchemaString = "{\n" +
        "  \"type\": \"record\",\n" +
        "  \"name\": \"Root\",\n" +
        "  \"fields\": [\n" +
        "    {\n" +
        "      \"name\": \"OuterRecord1\",\n" +
        "      \"type\": [\n" +
        "        \"null\",\n" +
        "        {\n" +
        "          \"type\": \"record\",\n" +
        "          \"name\": \"InnerElement\",\n" +
        "          \"fields\": [\n" +
        "            {\n" +
        "              \"name\": \"InnerField\",\n" +
        "              \"type\": {\n" +
        "                \"type\": \"record\",\n" +
        "                \"name\": \"InnerField1\",\n" +
        "                \"fields\": [\n" +
        "                  {\n" +
        "                    \"name\": \"InnerField1Param\",\n" +
        "                    \"type\": [\n" +
        "                      \"null\",\n" +
        "                      \"string\"\n" +
        "                    ],\n" +
        "                    \"default\": null\n" +
        "                  }\n" +
        "                ]\n" +
        "              },\n" +
        "              \"default\": {\n" +
        "                \"InnerField1Param\": null\n" +
        "              }\n" +
        "            }\n" +
        "          ]\n" +
        "        }\n" +
        "      ],\n" +
        "      \"default\": null\n" +
        "    },\n" +
        "    {\n" +
        "      \"name\": \"InnerElementV2\",\n" +
        "      \"type\": [\n" +
        "        \"null\",\n" +
        "        {\n" +
        "          \"type\": \"record\",\n" +
        "          \"name\": \"InnerElementV2\",\n" +
        "          \"fields\": [\n" +
        "            {\n" +
        "              \"name\": \"InnerField2\",\n" +
        "              \"type\": {\n" +
        "                \"type\": \"array\",\n" +
        "                \"items\": \"InnerElement\"\n" +
        "              },\n" +
        "              \"default\": []\n" +
        "            }\n" +
        "          ]\n" +
        "        }\n" +
        "      ],\n" +
        "      \"default\": null\n" +
        "    }\n" +
        "  ]\n" +
        "}";
    org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString);
    org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema);
    List<GenericData.Record> expected = RandomData.generateList(icebergWriteSchema, 2, 0L);
    Assert.assertTrue("Delete should succeed", testFile.delete());

    // write records with initial writeSchema
    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
        .schema(icebergWriteSchema)
        .named("test")
        .build()) {
      for (GenericData.Record rec : expected) {
        writer.add(rec);
      }
    }
  }
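The test above exercises only the write path; the NPE described in its comment surfaced when operating on the table afterward. Below is a hedged sketch of reading the file back with Iceberg's Avro read builder, reusing testFile and icebergWriteSchema from the test; the read-back is an illustration and is not part of the commit.

// Hypothetical read-back at the end of the test:
try (CloseableIterable<GenericData.Record> reader =
         Avro.read(Files.localInput(testFile))
             .project(icebergWriteSchema)
             .build()) {
  for (GenericData.Record record : reader) {
    // Before this fix, iterating records whose schema contains nested
    // arrays of complex types with null defaults could throw an NPE.
    System.out.println(record);
  }
}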


  /*
   * Test nested array with default null on primitive types.
   * This test differs from testNestedArrayWithDefaultNullOnComplexTypes in the
   * type of InnerField1Param: when it is a primitive type, no NPE is thrown
   * when operating on the table.
   */
  @Test
  public void testNestedArrayWithDefaultNullOnPrimitiveTypes() throws IOException {
    File testFile = temp.newFile();
    String writeSchemaString = "{\n" +
        "  \"type\": \"record\",\n" +
        "  \"name\": \"Root\",\n" +
        "  \"fields\": [\n" +
        "    {\n" +
        "      \"name\": \"OuterRecord1\",\n" +
        "      \"type\": [\n" +
        "        \"null\",\n" +
        "        {\n" +
        "          \"type\": \"record\",\n" +
        "          \"name\": \"InnerElement\",\n" +
        "          \"fields\": [\n" +
        "            {\n" +
        "              \"name\": \"InnerField\",\n" +
        "              \"type\": {\n" +
        "                \"type\": \"record\",\n" +
        "                \"name\": \"InnerField1\",\n" +
        "                \"fields\": [\n" +
        "                  {\n" +
        "                    \"name\": \"InnerField1Param\",\n" +
        "                    \"type\": \"int\",\n" +
        "                    \"default\": 1\n" +
        "                  }\n" +
        "                ]\n" +
        "              }\n" +
        "            }\n" +
        "          ]\n" +
        "        }\n" +
        "      ],\n" +
        "      \"default\": null\n" +
        "    },\n" +
        "    {\n" +
        "      \"name\": \"InnerElementV2\",\n" +
        "      \"type\": [\n" +
        "        \"null\",\n" +
        "        {\n" +
        "          \"type\": \"record\",\n" +
        "          \"name\": \"InnerElementV2\",\n" +
        "          \"fields\": [\n" +
        "            {\n" +
        "              \"name\": \"InnerField2\",\n" +
        "              \"type\": {\n" +
        "                \"type\": \"array\",\n" +
        "                \"items\": \"InnerElement\"\n" +
        "              },\n" +
        "              \"default\": []\n" +
        "            }\n" +
        "          ]\n" +
        "        }\n" +
        "      ],\n" +
        "      \"default\": null\n" +
        "    }\n" +
        "  ]\n" +
        "}";
    org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString);
    org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema);

    List<GenericData.Record> expected = RandomData.generateList(icebergWriteSchema, 2, 0L);

    Assert.assertTrue("Delete should succeed", testFile.delete());

    // write records with initial writeSchema
    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
        .schema(icebergWriteSchema)
        .named("test")
        .build()) {
      for (GenericData.Record rec : expected) {
        writer.add(rec);
      }
    }
  }

  @Test
  public void testNestedArrayWithDefaultNullOnArrayTypes() throws IOException {
    String writeSchemaString = "{\n" +
        "  \"type\": \"record\",\n" +
        "  \"name\": \"OuterRecord\",\n" +
        "  \"fields\": [\n" +
        "    {\n" +
        "      \"name\": \"nestedRecord\",\n" +
        "      \"type\": {\n" +
        "        \"type\": \"record\",\n" +
        "        \"name\": \"InnerRecord\",\n" +
        "        \"fields\": [\n" +
        "          {\n" +
        "            \"name\": \"myArray\",\n" +
        "            \"type\": {\n" +
        "              \"type\": \"array\",\n" +
        "              \"items\": \"int\"\n" +
        "            },\n" +
        "            \"default\": [1, 2, 3]\n" +
        "          }\n" +
        "        ],\n" +
        "        \"default\": {\"myArray\": [1, 2, 3]}\n" +
        "      },\n" +
        "      \"default\": {\"myArray\": [1, 2, 3]}\n" +
        "    }\n" +
        "  ],\n" +
        "  \"default\": {\"nestedRecord\": {\"myArray\": [1, 2, 3]}}\n" +
        "}";
    org.apache.avro.Schema writeSchema = new org.apache.avro.Schema.Parser().parse(writeSchemaString);
    org.apache.iceberg.Schema icebergWriteSchema = AvroSchemaUtil.toIceberg(writeSchema);

    List<GenericData.Record> expected = RandomData.generateList(icebergWriteSchema, 2, 0L);

    File testFile = temp.newFile();
    Assert.assertTrue("Delete should succeed", testFile.delete());

    // write records with initial writeSchema
    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(testFile))
        .schema(icebergWriteSchema)
        .named("test")
        .build()) {
      for (GenericData.Record rec : expected) {
        writer.add(rec);
      }
    }
  }
}
