Skip to content

Commit e15132b

Browse files
authored
[Spark] Fall back to zordering when clustering on a single column (#3109)
## Description Fall back to Z-ordering when clustering on a single column, because Hilbert clustering does not support fewer than two columns. Resolves #3087. ## How was this patch tested? New unit test.
1 parent 8a8e757 commit e15132b

File tree

2 files changed

+25
-0
lines changed

2 files changed

+25
-0
lines changed

spark/src/main/scala/org/apache/spark/sql/delta/skipping/MultiDimClustering.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ object MultiDimClustering {
5050
curve: String): DataFrame = {
5151
assert(colNames.nonEmpty, "Cannot cluster by zero columns!")
5252
val clusteringImpl = curve match {
53+
case "hilbert" if colNames.size == 1 => ZOrderClustering
5354
case "hilbert" => HilbertClustering
5455
case "zorder" => ZOrderClustering
5556
case unknownCurve =>

spark/src/test/scala/org/apache/spark/sql/delta/clustering/ClusteredTableClusteringSuite.scala

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,4 +79,28 @@ class ClusteredTableClusteringSuite extends SparkFunSuite
7979
}
8080
}
8181
}
82+
83+
test("cluster by 1 column") {
  // Cap every output file at 2 records so OPTIMIZE is forced to split the
  // clustered data across multiple files.
  withSQLConf(SQLConf.MAX_RECORDS_PER_FILE.key -> "2") {
    withClusteredTable(
      table = table,
      schema = "col1 int, col2 int",
      clusterBy = "col1") {
      // Seed the table with 4 small, unclustered files.
      addFiles(table, numFiles = 4)
      val initialFiles = getFiles(table)
      assert(initialFiles.size === 4)
      assertNotClustered(initialFiles)

      // OPTIMIZE should rewrite the 4 input files into 2 clustered files,
      // since MAX_RECORDS_PER_FILE limits each output file to 2 records.
      runOptimize(table) { metrics =>
        assert(metrics.numFilesRemoved == 4)
        assert(metrics.numFilesAdded == 2)
      }

      val optimizedFiles = getFiles(table)
      assert(optimizedFiles.size == 2)
      assertClustered(optimizedFiles)
    }
  }
}
82106
}

0 commit comments

Comments
 (0)