spotify · clairemcginty · Jul 9, 2024 · Jul 9, 2024 · Sep 12, 2024 · Sep 12, 2024
diff --git a/build.sbt b/build.sbt
@@ -693,6 +693,7 @@ lazy val tools = project
       "com.google.apis" % "google-api-services-bigquery" % bigqueryVersion,
       "org.apache.avro" % "avro" % avroVersion % Provided,
       "org.apache.parquet" % "parquet-hadoop" % parquetVersion,
+      "org.apache.hadoop" % "hadoop-common" % hadoopVersion,
       "org.typelevel" %% "paiges-core" % paigesVersion
     )
   )
@@ -707,6 +708,7 @@ lazy val jmh: Project = project
     cats % Test,
     datastore % Test,
     guava % Test,
+    parquet % "test->test",
     protobuf % "test->test",
     scalacheck % Test,
     tensorflow % Test,
@@ -726,7 +728,12 @@ lazy val jmh: Project = project
       "com.google.apis" % "google-api-services-bigquery" % bigqueryVersion % Test,
       "com.google.cloud.datastore" % "datastore-v1-proto-client" % datastoreVersion % Test,
       "org.apache.avro" % "avro" % avroVersion % Test,
-      "org.tensorflow" % "tensorflow-core-api" % tensorflowVersion % Test
+      "org.tensorflow" % "tensorflow-core-api" % tensorflowVersion % Test,
+      "org.apache.parquet" % "parquet-avro" % parquetVersion % Test,
+      "org.apache.parquet" % "parquet-column" % parquetVersion % Test,
+      "org.apache.parquet" % "parquet-hadoop" % parquetVersion % Test,
+      "org.apache.hadoop" % "hadoop-common" % hadoopVersion % Test,
+      "org.apache.hadoop" % "hadoop-mapreduce-client-core" % hadoopVersion % Test
     )
   )
 

diff --git a/jmh/src/test/scala/magnolify/jmh/MagnolifyBench.scala b/jmh/src/test/scala/magnolify/jmh/MagnolifyBench.scala
@@ -16,13 +16,18 @@
 
 package magnolify.jmh
 
-import java.util.concurrent.TimeUnit
+import magnolify.parquet.{MagnolifyParquetProperties, ParquetType, TestInputFile, TestOutputFile}
 
+import java.util.concurrent.TimeUnit
 import magnolify.scalacheck.auto._
 import magnolify.test.Simple._
+import org.apache.hadoop.conf.Configuration
+import org.apache.parquet.hadoop.{ParquetReader, ParquetWriter}
 import org.scalacheck._
 import org.openjdk.jmh.annotations._
 
+import scala.jdk.CollectionConverters._
+
 object MagnolifyBench {
   val seed: rng.Seed = rng.Seed(0)
   val prms: Gen.Parameters = Gen.Parameters.default
@@ -87,6 +92,77 @@ class AvroBench {
   @Benchmark def avroSchema: Schema = AvroType[Nested].schema
 }
 
+/** JMH state that, before each invocation, writes one record and opens a new reader over it. */
+@State(Scope.Benchmark)
+class ParquetReadState(pt: ParquetType[Nested]) {
+  var out: TestOutputFile = null
+  var reader: ParquetReader[Nested] = null
+
+  @Setup(Level.Invocation)
+  def setup(): Unit = {
+    // Write a single record into a new output file.
+    out = new TestOutputFile
+    val seedWriter = pt.writeBuilder(out).build()
+    seedWriter.write(MagnolifyBench.nested)
+    seedWriter.close()
+
+    // Build the reader over an input file created from out.getBytes.
+    reader = pt.readBuilder(new TestInputFile(out.getBytes)).build()
+  }
+
+  @TearDown(Level.Invocation)
+  def tearDown(): Unit = reader.close()
+}
+
+/**
+ * JMH state that builds a new writer over a new [[TestOutputFile]] before every invocation
+ * and closes that writer after the invocation.
+ */
+@State(Scope.Benchmark)
+class ParquetWriteState(pt: ParquetType[Nested]) {
+  var writer: ParquetWriter[Nested] = null
+
+  @Setup(Level.Invocation)
+  def setup(): Unit = { writer = pt.writeBuilder(new TestOutputFile).build() }
+
+  @TearDown(Level.Invocation)
+  def tearDown(): Unit = writer.close()
+
+}
+
+/** No-arg state classes for the Parquet benchmarks: one read/write pair per WriteGroupedArrays value. */
+object ParquetStates {
+  def confWithGroupedArraysProp(propValue: Boolean): Configuration = {
+    val hadoopConf = new Configuration()
+    hadoopConf.setBoolean(MagnolifyParquetProperties.WriteGroupedArrays, propValue)
+    hadoopConf
+  }
+  class DefaultParquetReadState extends ParquetReadState(ParquetType[Nested](confWithGroupedArraysProp(false)))
+  class DefaultParquetWriteState extends ParquetWriteState(ParquetType[Nested](confWithGroupedArraysProp(false)))
+  class ParquetAvroCompatReadState extends ParquetReadState(ParquetType[Nested](confWithGroupedArraysProp(true)))
+  class ParquetAvroCompatWriteState extends ParquetWriteState(ParquetType[Nested](confWithGroupedArraysProp(true)))
+}
+
+/** Parquet read/write benchmarks using the states configured with WriteGroupedArrays = false. */
+@BenchmarkMode(Array(Mode.AverageTime))
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+class ParquetBench {
+  @Benchmark def parquetWrite(state: ParquetStates.DefaultParquetWriteState): Unit =
+    state.writer.write(MagnolifyBench.nested)
+  @Benchmark def parquetRead(state: ParquetStates.DefaultParquetReadState): Nested = state.reader.read()
+}
+
+/** Parquet read/write benchmarks using the states configured with WriteGroupedArrays = true. */
+@BenchmarkMode(Array(Mode.AverageTime))
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+class ParquetAvroCompatBench {
+  @Benchmark def parquetWrite(state: ParquetStates.ParquetAvroCompatWriteState): Unit =
+    state.writer.write(MagnolifyBench.nested)
+  @Benchmark def parquetRead(state: ParquetStates.ParquetAvroCompatReadState): Nested = state.reader.read()
+}
+
 @BenchmarkMode(Array(Mode.AverageTime))
 @OutputTimeUnit(TimeUnit.NANOSECONDS)
 @State(Scope.Thread)
@@ -157,7 +233,7 @@ class ExampleBench {
   private val exampleNested = implicitly[Arbitrary[ExampleNested]].arbitrary(prms, seed).get
   private val example = exampleType.to(exampleNested).build()
   @Benchmark def exampleTo: Example.Builder = exampleType.to(exampleNested)
-  @Benchmark def exampleFrom: ExampleNested = exampleType.from(example)
+  @Benchmark def exampleFrom: ExampleNested = exampleType.from(example.getFeatures.getFeatureMap.asScala.toMap)
 }
 
 // Collections are not supported

diff --git a/parquet/src/main/scala/magnolify/parquet/MagnolifyParquetProperties.scala b/parquet/src/main/scala/magnolify/parquet/MagnolifyParquetProperties.scala
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2024 Spotify AB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package magnolify.parquet
+
+import org.apache.hadoop.conf.Configuration
+
+/**
+ * Properties for reading and writing Magnolify ParquetType classes, configurable via a Hadoop
+ * [[Configuration]] instance.
+ */
+object MagnolifyParquetProperties {
+  // Boolean property key and the value assumed when it is unset; the only setting hashValues reads.
+  val WriteGroupedArrays: String = "magnolify.parquet.write-grouped-arrays"
+  val WriteGroupedArraysDefault: Boolean = false
+
+  // Boolean property key and its default; not read anywhere inside this object.
+  val WriteAvroSchemaToMetadata: String = "magnolify.parquet.write-avro-schema"
+  val WriteAvroSchemaToMetadataDefault: Boolean = true
+
+  val ReadTypeKey: String = "parquet.type.read.type"
+  val WriteTypeKey: String = "parquet.type.write.type"
+
+  // Hash any Configuration values that might affect schema creation, for use in the Schema cache key.
+  // Configuration.getBoolean trims the value and returns the default when it is unset or malformed.
+  private[parquet] def hashValues(conf: Configuration): Int =
+    conf.getBoolean(WriteGroupedArrays, WriteGroupedArraysDefault).hashCode()
+}