
Commit 6b7db36

[3.3][Kernel][Defaults] Get rid of using package private classes from parquet-mr (#4494)
(Cherry-pick of #4429) Currently, we override `InternalParquetRecordReader`, which is package private in parquet-mr. This causes IllegalAccess errors when the parquet and kernel libraries are loaded in different classloaders (more details on the illegal access [here](https://stackoverflow.com/questions/14282726/urlclassloader-and-accessibility-of-package-private-methods/14283808#14283808)). Instead, use the `ParquetReader` builder, which avoids the package-private access issue entirely. Manually tested in an environment that loads parquet and kernel in separate classloaders. Existing tests verifying the functionality remain unchanged.
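For reference, a minimal sketch of the builder-based pattern this change adopts, using only public parquet-mr API (`ParquetReader.Builder`, `ReadSupport`, `FilterCompat`). The `openReader` helper and class name below are illustrative, not code from this commit:

```java
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.api.ReadSupport;

public final class ParquetReaderSketch {
  // Builds a ParquetReader without touching any package-private parquet-mr class:
  // an anonymous subclass of the public Builder supplies the ReadSupport directly.
  static <T> ParquetReader<T> openReader(Path filePath, ReadSupport<T> readSupport)
      throws IOException {
    return new ParquetReader.Builder<T>(filePath) {
      @Override
      protected ReadSupport<T> getReadSupport() {
        return readSupport;
      }
    }
        // Row-group level pruning only; record-level filtering stays off because
        // parquet-mr evaluates it only after materializing the full record.
        .withFilter(FilterCompat.NOOP) // or FilterCompat.get(filterPredicate)
        .useRecordFilter(false)
        .build();
  }
}
```

Because the anonymous class is a subclass of `ParquetReader.Builder`, it can call the protected `Builder(Path)` constructor and override `getReadSupport()` even from another package or classloader, whereas subclassing the package-private `InternalParquetRecordReader` breaks once the two libraries no longer share a runtime package.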
1 parent b497ab7 commit 6b7db36

File tree

4 files changed: +36 −64 lines changed

kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/parquet/ParquetColumnReaders.java

Lines changed: 3 additions & 0 deletions
@@ -38,12 +38,15 @@ class ParquetColumnReaders {
   public static Converter createConverter(
       int initialBatchSize, DataType typeFromClient, Type typeFromFile) {
     if (typeFromClient instanceof StructType) {
+      checkArgument(typeFromFile instanceof GroupType, "cannot be cast to GroupType");
       return new RowColumnReader(
           initialBatchSize, (StructType) typeFromClient, (GroupType) typeFromFile);
     } else if (typeFromClient instanceof ArrayType) {
+      checkArgument(typeFromFile instanceof GroupType, "cannot be cast to GroupType");
       return new ArrayColumnReader(
           initialBatchSize, (ArrayType) typeFromClient, (GroupType) typeFromFile);
     } else if (typeFromClient instanceof MapType) {
+      checkArgument(typeFromFile instanceof GroupType, "cannot be cast to GroupType");
       return new MapColumnReader(
           initialBatchSize, (MapType) typeFromClient, (GroupType) typeFromFile);
     } else if (typeFromClient instanceof StringType || typeFromClient instanceof BinaryType) {

kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/parquet/ParquetFileReader.java

Lines changed: 29 additions & 24 deletions
@@ -18,7 +18,6 @@
 import static io.delta.kernel.defaults.internal.parquet.ParquetFilterUtils.toParquetFilter;
 import static io.delta.kernel.internal.util.Preconditions.checkArgument;
 import static java.util.Objects.requireNonNull;
-import static org.apache.parquet.hadoop.ParquetInputFormat.*;
 
 import io.delta.kernel.data.ColumnarBatch;
 import io.delta.kernel.exceptions.KernelEngineException;
@@ -31,8 +30,9 @@
 import java.util.*;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.parquet.filter2.compat.FilterCompat;
 import org.apache.parquet.filter2.predicate.FilterPredicate;
-import org.apache.parquet.hadoop.ParquetRecordReaderWrapper;
+import org.apache.parquet.hadoop.ParquetReader;
 import org.apache.parquet.hadoop.api.InitContext;
 import org.apache.parquet.hadoop.api.ReadSupport;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
@@ -60,7 +60,7 @@ public CloseableIterator<ColumnarBatch> read(
 
     return new CloseableIterator<ColumnarBatch>() {
       private final BatchReadSupport readSupport = new BatchReadSupport(maxBatchSize, schema);
-      private ParquetRecordReaderWrapper<Object> reader;
+      private ParquetReader<Object> reader;
       private boolean hasNotConsumedNextElement;
 
       @Override
@@ -76,9 +76,10 @@ public boolean hasNext() {
           return true;
         }
 
-        hasNotConsumedNextElement = reader.nextKeyValue() && reader.getCurrentValue() != null;
+        Object next = reader.read();
+        hasNotConsumedNextElement = next != null;
         return hasNotConsumedNextElement;
-      } catch (IOException | InterruptedException ex) {
+      } catch (IOException ex) {
         throw new KernelEngineException("Error reading Parquet file: " + path, ex);
       }
     }
@@ -115,28 +116,32 @@ private void initParquetReaderIfRequired() {
             org.apache.parquet.hadoop.ParquetFileReader.readFooter(confCopy, filePath);
 
         MessageType parquetSchema = footer.getFileMetaData().getSchema();
+
         Optional<FilterPredicate> parquetPredicate =
             predicate.flatMap(predicate -> toParquetFilter(parquetSchema, predicate));
 
-        if (parquetPredicate.isPresent()) {
-          // clone the configuration to avoid modifying the original one
-          confCopy = new Configuration(confCopy);
-
-          setFilterPredicate(confCopy, parquetPredicate.get());
-          // Disable the record level filtering as the `parquet-mr` evaluates
-          // the filter once the entire record has been materialized. Instead,
-          // we use the predicate to prune the row groups which is more efficient.
-          // In the future, we can consider using the record level filtering if a
-          // native Parquet reader is implemented in Kernel default module.
-          confCopy.set(RECORD_FILTERING_ENABLED, "false");
-          confCopy.set(DICTIONARY_FILTERING_ENABLED, "false");
-          confCopy.set(COLUMN_INDEX_FILTERING_ENABLED, "false");
-        }
-
-        // Pass the already read footer to the reader to avoid reading it again.
-        fileReader = new ParquetFileReaderWithFooter(filePath, confCopy, footer);
-        reader = new ParquetRecordReaderWrapper<>(readSupport);
-        reader.initialize(fileReader, confCopy);
+        // TODO: We can avoid reading the footer again if we can pass the footer, but there is
+        // no API to do that in the current version of parquet-mr which takes InputFile
+        // as input.
+        reader =
+            new ParquetReader.Builder<Object>(filePath) {
+              @Override
+              protected ReadSupport<Object> getReadSupport() {
+                return readSupport;
+              }
+            }.withFilter(parquetPredicate.map(FilterCompat::get).orElse(FilterCompat.NOOP))
+                // Disable the record level filtering as the `parquet-mr` evaluates
+                // the filter once the entire record has been materialized. Instead,
+                // we use the predicate to prune the row groups which is more efficient.
+                // In the future, we can consider using the record level filtering if a
+                // native Parquet reader is implemented in Kernel default module.
+                .useRecordFilter(false)
+                .useStatsFilter(true) // only enable the row group level filtering
+                .useBloomFilter(false)
+                .useDictionaryFilter(false)
+                .useColumnIndexFilter(false)
+                .build();
 
       } catch (IOException e) {
         Utils.closeCloseablesSilently(fileReader, reader);
         throw new KernelEngineException("Error reading Parquet file: " + path, e);
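As a usage note on the iteration change above: `ParquetReader.read()` returns the next materialized record, or `null` once the file is exhausted, so the old `nextKeyValue()`/`getCurrentValue()` pair collapses into a single call. A small self-contained sketch of draining such a reader (the `countRecords` helper is illustrative, not part of this commit):

```java
import java.io.IOException;
import org.apache.parquet.hadoop.ParquetReader;

final class ReadLoopSketch {
  // Drains a reader built as in the change above. read() returns the next record or
  // null at end of input, replacing the nextKeyValue()/getCurrentValue() pair from
  // the package-private RecordReader-based path.
  static long countRecords(ParquetReader<Object> reader) throws IOException {
    long count = 0;
    try {
      Object record;
      while ((record = reader.read()) != null) {
        count++; // a real caller would hand `record` to the batch read support here
      }
    } finally {
      reader.close(); // ParquetReader is Closeable; always release the file handles
    }
    return count;
  }
}
```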

kernel/kernel-defaults/src/main/java/org/apache/parquet/hadoop/ParquetRecordReaderWrapper.java

Lines changed: 0 additions & 37 deletions
This file was deleted.

kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/parquet/ParquetFileReaderSuite.scala

Lines changed: 4 additions & 3 deletions
@@ -22,12 +22,12 @@ import io.delta.golden.GoldenTableUtils.{goldenTableFile, goldenTablePath}
 import io.delta.kernel.defaults.utils.{ExpressionTestUtils, TestRow}
 import io.delta.kernel.test.VectorTestUtils
 import io.delta.kernel.types._
+
 import org.apache.spark.sql.internal.SQLConf
 import org.scalatest.funsuite.AnyFunSuite
-import org.apache.parquet.io.ParquetDecodingException
 
 class ParquetFileReaderSuite extends AnyFunSuite
-  with ParquetSuiteBase with VectorTestUtils with ExpressionTestUtils {
+    with ParquetSuiteBase with VectorTestUtils with ExpressionTestUtils {
 
   test("decimals encoded using dictionary encoding ") {
     // Below golden tables contains three decimal columns
@@ -191,9 +191,10 @@ class ParquetFileReaderSuite extends AnyFunSuite
     val ex = intercept[Throwable] {
       readParquetFilesUsingKernel(inputLocation, readSchema)
     }
+
     // We don't properly reject conversions and the error we get vary a lot, this checks various
     // error message we may get as result.
-    // TODO: Uniformize rejecting unsupported conversions.
+    // TODO(delta-io/delta#4493): Uniformize rejecting unsupported conversions.
     assert(
       ex.getMessage.contains("Can not read value") ||
       ex.getMessage.contains("column with Parquet type") ||
