
Commit 4456a12
Author: jintao shen
[Spark] Implement incremental clustering using ZCUBE approach
## Description

Implement incremental Liquid clustering according to the design [doc](https://docs.google.com/document/d/1FWR3odjOw4v4-hjFy_hVaNdxHVs4WuK1asfB6M6XEMw/edit?usp=sharing). This implementation uses a ZCube-based approach to achieve incremental clustering: once a ZCube grows big enough it is sealed, and subsequent clustering runs do not re-cluster its files, which reduces write amplification.

Key changes:
- Each clustered file is tagged with ZCUBE_ID to track which ZCube it belongs to; the id is generated using a UUID.
- Another tag, ZCUBE_ZORDER_BY, tracks the clustering columns.
- Each clustered file has its clusteringProvider populated with `liquid`.

## How was this patch tested?

New unit tests.

## Does this PR introduce _any_ user-facing changes?

No
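As a rough illustration of the key changes above, the sketch below shows how a clustered output file could be tagged. The tag names ZCUBE_ID / ZCUBE_ZORDER_BY, the UUID-based id, and the `liquid` provider come from this description; the helper name, the comma-separated column encoding, and the exact tag handling are assumptions, not the code in this commit.

```scala
import java.util.UUID

import org.apache.spark.sql.delta.actions.AddFile

// Hypothetical helper, not the commit's implementation.
object ZCubeTaggingSketch {
  val ZCubeIdTag = "ZCUBE_ID"              // which ZCube the file belongs to
  val ZCubeZOrderByTag = "ZCUBE_ZORDER_BY" // clustering columns used for the cube

  /** One fresh id per ZCube; every file written for that cube carries it. */
  def newZCubeId(): String = UUID.randomUUID().toString

  /** Tags one clustered output file (the column encoding here is an assumption). */
  def tag(file: AddFile, zCubeId: String, clusteringColumns: Seq[String]): AddFile =
    file.copy(
      tags = Option(file.tags).getOrElse(Map.empty[String, String]) +
        (ZCubeIdTag -> zCubeId) +
        (ZCubeZOrderByTag -> clusteringColumns.mkString(",")),
      clusteringProvider = Some("liquid"))
}
```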
1 parent c046547 commit 4456a12

File tree

12 files changed: +1131 −39 lines changed

spark/src/main/scala/org/apache/spark/sql/delta/commands/OptimizeTableCommand.scala

Lines changed: 27 additions & 38 deletions
@@ -225,6 +225,15 @@ class OptimizeExecutor(
     optimizeContext: DeltaOptimizeContext)
   extends DeltaCommand with SQLMetricsReporting with Serializable {

+  /**
+   * In which mode the Optimize command is running. There are three valid modes:
+   * 1. Compaction
+   * 2. ZOrder
+   * 3. Clustering
+   */
+  private val optimizeStrategy =
+    OptimizeTableStrategy(sparkSession, txn.snapshot, optimizeContext, zOrderByColumns)
+
   /** Timestamp to use in [[FileAction]] */
   private val operationTimestamp = new SystemClock().getTimeMillis()

@@ -242,15 +251,6 @@ class OptimizeExecutor(
     }
   }

-  private lazy val curve: String = {
-    if (zOrderByColumns.nonEmpty) {
-      "zorder"
-    } else {
-      assert(isClusteredTable)
-      "hilbert"
-    }
-  }
-
   def optimize(): Seq[Row] = {
     recordDeltaOperation(txn.deltaLog, "delta.optimize") {
       val minFileSize = optimizeContext.minFileSize.getOrElse(
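The new `optimizeStrategy` field introduced above replaces per-mode branching such as the removed `curve` value. A minimal, self-contained sketch of that dispatch (simplified stand-in types, not the commit's actual `OptimizeTableStrategy` trait) might look like:

```scala
// Simplified stand-ins; only the "zorder"/"hilbert" values and the three modes are from the diff.
object OptimizeModeSketch {
  sealed trait Mode { def curve: Option[String] }
  case object Compaction extends Mode { val curve: Option[String] = None }
  case object ZOrder extends Mode { val curve: Option[String] = Some("zorder") }
  case object Clustering extends Mode { val curve: Option[String] = Some("hilbert") }

  /** Mirrors the removed `curve` logic: explicit ZORDER BY columns take precedence,
    * otherwise a clustered (Liquid) table uses the Hilbert curve, otherwise it is
    * plain compaction with no space-filling curve. */
  def select(zOrderByColumns: Seq[String], isClusteredTable: Boolean): Mode =
    if (zOrderByColumns.nonEmpty) ZOrder
    else if (isClusteredTable) Clustering
    else Compaction
}
```

Pushing the choice of curve, bin size, stats, and file tagging behind one strategy object keeps `OptimizeExecutor` free of `isMultiDimClustering` / `isClusteredTable` special cases, as the rest of this diff shows.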
@@ -269,7 +269,7 @@ class OptimizeExecutor(
       }
       val partitionsToCompact = filesToProcess.groupBy(_.partitionValues).toSeq

-      val jobs = groupFilesIntoBins(partitionsToCompact, maxFileSize)
+      val jobs = groupFilesIntoBins(partitionsToCompact)

       val maxThreads =
         sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_THREADS)
@@ -320,18 +320,7 @@ class OptimizeExecutor(
           numDeletionVectorRowsRemoved = removedDVs.map(_.cardinality).sum))
       }

-      if (isMultiDimClustering) {
-        val inputFileStats =
-          ZOrderFileStats(removedFiles.size, removedFiles.map(_.size.getOrElse(0L)).sum)
-        optimizeStats.zOrderStats = Some(ZOrderStats(
-          strategyName = "all", // means process all files in a partition
-          inputCubeFiles = ZOrderFileStats(0, 0),
-          inputOtherFiles = inputFileStats,
-          inputNumCubes = 0,
-          mergedFiles = inputFileStats,
-          // There will one z-cube for each partition
-          numOutputCubes = optimizeStats.numPartitionsOptimized))
-      }
+      optimizeStrategy.updateOptimizeStats(optimizeStats, removedFiles, jobs)

       return Seq(Row(txn.deltaLog.dataPath.toString, optimizeStats.toOptimizeMetrics))
     }
@@ -365,27 +354,31 @@ class OptimizeExecutor(
    *
    * @param partitionsToCompact List of files to compact group by partition.
    *                            Partition is defined by the partition values (partCol -> partValue)
-   * @param maxTargetFileSize Max size (in bytes) of the compaction output file.
    * @return Sequence of bins. Each bin contains one or more files from the same
    *         partition and targeted for one output file.
    */
   private def groupFilesIntoBins(
-      partitionsToCompact: Seq[(Map[String, String], Seq[AddFile])],
-      maxTargetFileSize: Long): Seq[(Map[String, String], Seq[AddFile])] = {
+      partitionsToCompact: Seq[(Map[String, String], Seq[AddFile])])
+    : Seq[(Map[String, String], Seq[AddFile])] = {
+    val maxBinSize = optimizeStrategy.maxBinSize
     partitionsToCompact.flatMap {
       case (partition, files) =>
         val bins = new ArrayBuffer[Seq[AddFile]]()

         val currentBin = new ArrayBuffer[AddFile]()
         var currentBinSize = 0L

-        files.sortBy(_.size).foreach { file =>
+        val preparedFiles = optimizeStrategy.prepareFilesPerPartition(files)
+        preparedFiles.foreach { file =>
           // Generally, a bin is a group of existing files, whose total size does not exceed the
-          // desired maxFileSize. They will be coalesced into a single output file.
-          // However, if isMultiDimClustering = true, all files in a partition will be read by the
-          // same job, the data will be range-partitioned and numFiles = totalFileSize / maxFileSize
-          // will be produced. See below.
-          if (file.size + currentBinSize > maxTargetFileSize && !isMultiDimClustering) {
+          // desired maxBinSize. The output file size depends on the mode:
+          // 1. Compaction: Files in a bin will be coalesced into a single output file.
+          // 2. ZOrder: all files in a partition will be read by the
+          //    same job, the data will be range-partitioned and
+          //    numFiles = totalFileSize / maxFileSize will be produced.
+          // 3. Clustering: Files in a bin belongs to one ZCUBE, the data will be
+          //    range-partitioned and numFiles = totalFileSize / maxFileSize.
+          if (file.size + currentBinSize > maxBinSize) {
             bins += currentBin.toVector
             currentBin.clear()
             currentBin += file
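`prepareFilesPerPartition` is where the incremental behaviour plausibly lives: per the commit description, files in a ZCube that is already big enough are sealed and should not be picked up again. The filter below is a hypothetical sketch of that idea (helper name, threshold parameter, and grouping logic are assumptions), not the strategy's actual implementation.

```scala
import org.apache.spark.sql.delta.actions.AddFile

// Hypothetical sketch of skipping sealed ZCubes during candidate selection.
object SealedZCubeFilterSketch {
  def candidateFiles(files: Seq[AddFile], targetCubeSizeBytes: Long): Seq[AddFile] = {
    def cubeId(f: AddFile): Option[String] = Option(f.tags).flatMap(_.get("ZCUBE_ID"))

    // Total size per cube id (None groups the files that were never clustered).
    val cubeSizes: Map[Option[String], Long] =
      files.groupBy(cubeId).map { case (id, fs) => id -> fs.map(_.size).sum }

    files.filter { f =>
      cubeId(f) match {
        case None => true                                          // never clustered yet
        case Some(id) => cubeSizes(Some(id)) < targetCubeSizeBytes // cube still open
      }
    }
  }
}
```

The old `files.sortBy(_.size)` also moved behind this hook, so each mode can presumably order its candidates however it needs before bin packing.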
@@ -431,7 +424,7 @@ class OptimizeExecutor(
             input,
             approxNumFiles,
             clusteringColumns,
-            curve)
+            optimizeStrategy.curve)
         } else {
           val useRepartition = sparkSession.sessionState.conf.getConf(
             DeltaSQLConf.DELTA_OPTIMIZE_REPARTITION_ENABLED)
@@ -450,13 +443,9 @@ class OptimizeExecutor(
         sparkSession.sparkContext.getLocalProperty(SPARK_JOB_GROUP_ID),
         description)

+      val binInfo = optimizeStrategy.initNewBin
       val addFiles = txn.writeFiles(repartitionDF, None, isOptimize = true, Nil).collect {
-        case a: AddFile =>
-          (if (isClusteredTable) {
-            a.copy(clusteringProvider = Some(ClusteredTableUtils.clusteringProvider))
-          } else {
-            a
-          }).copy(dataChange = false)
+        case a: AddFile => optimizeStrategy.tagAddFile(a, binInfo)
         case other =>
           throw new IllegalStateException(
             s"Unexpected action $other with type ${other.getClass}. File compaction job output" +

0 commit comments
