@@ -225,6 +225,15 @@ class OptimizeExecutor(
     optimizeContext: DeltaOptimizeContext)
   extends DeltaCommand with SQLMetricsReporting with Serializable {

+  /**
+   * In which mode the Optimize command is running. There are three valid modes:
+   * 1. Compaction
+   * 2. ZOrder
+   * 3. Clustering
+   */
+  private val optimizeStrategy =
+    OptimizeTableStrategy(sparkSession, txn.snapshot, optimizeContext, zOrderByColumns)
+
   /** Timestamp to use in [[FileAction]] */
   private val operationTimestamp = new SystemClock().getTimeMillis()

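The scaladoc above names three modes. As a rough illustration of how such a strategy could be selected (the OptimizeMode type, the select helper, and the isClusteredTable flag are assumptions for this sketch, not the actual OptimizeTableStrategy factory):

sealed trait OptimizeMode
object OptimizeMode {
  case object Compaction extends OptimizeMode
  case object ZOrder extends OptimizeMode
  case object Clustering extends OptimizeMode

  // zOrderByColumns comes from the ZORDER BY clause; isClusteredTable would be
  // derived from the snapshot's table metadata (assumed helper, not the real API).
  def select(zOrderByColumns: Seq[String], isClusteredTable: Boolean): OptimizeMode =
    if (zOrderByColumns.nonEmpty) ZOrder
    else if (isClusteredTable) Clustering
    else Compaction
}

This mirrors the split implied by the deleted curve val further down: non-empty Z-Order columns select Z-Order, clustered tables select Clustering, and plain OPTIMIZE falls back to Compaction.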
@@ -242,15 +251,6 @@ class OptimizeExecutor(
     }
   }

-  private lazy val curve: String = {
-    if (zOrderByColumns.nonEmpty) {
-      "zorder"
-    } else {
-      assert(isClusteredTable)
-      "hilbert"
-    }
-  }
-
   def optimize(): Seq[Row] = {
     recordDeltaOperation(txn.deltaLog, "delta.optimize") {
       val minFileSize = optimizeContext.minFileSize.getOrElse(
@@ -269,7 +269,7 @@ class OptimizeExecutor(
       }
       val partitionsToCompact = filesToProcess.groupBy(_.partitionValues).toSeq

-      val jobs = groupFilesIntoBins(partitionsToCompact, maxFileSize)
+      val jobs = groupFilesIntoBins(partitionsToCompact)

       val maxThreads =
         sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_THREADS)
@@ -320,18 +320,7 @@ class OptimizeExecutor(
           numDeletionVectorRowsRemoved = removedDVs.map(_.cardinality).sum))
       }

-      if (isMultiDimClustering) {
-        val inputFileStats =
-          ZOrderFileStats(removedFiles.size, removedFiles.map(_.size.getOrElse(0L)).sum)
-        optimizeStats.zOrderStats = Some(ZOrderStats(
-          strategyName = "all", // means process all files in a partition
-          inputCubeFiles = ZOrderFileStats(0, 0),
-          inputOtherFiles = inputFileStats,
-          inputNumCubes = 0,
-          mergedFiles = inputFileStats,
-          // There will be one z-cube for each partition
-          numOutputCubes = optimizeStats.numPartitionsOptimized))
-      }
+      optimizeStrategy.updateOptimizeStats(optimizeStats, removedFiles, jobs)

       return Seq(Row(txn.deltaLog.dataPath.toString, optimizeStats.toOptimizeMetrics))
     }
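The deleted Z-Order branch is now hidden behind updateOptimizeStats. A minimal sketch of what the Z-Order flavour of that method could look like, reusing the stats classes from the deleted block (the method name, its placement, and the unused bins argument are assumptions):

def updateOptimizeStatsForZOrder(
    optimizeStats: OptimizeStats,
    removedFiles: Seq[RemoveFile],
    bins: Seq[(Map[String, String], Seq[AddFile])]): Unit = {
  // Same bookkeeping as the removed inline block; `bins` would only be needed by the
  // clustering mode, which tracks ZCUBEs per bin (assumption).
  val inputFileStats =
    ZOrderFileStats(removedFiles.size, removedFiles.map(_.size.getOrElse(0L)).sum)
  optimizeStats.zOrderStats = Some(ZOrderStats(
    strategyName = "all", // means process all files in a partition
    inputCubeFiles = ZOrderFileStats(0, 0),
    inputOtherFiles = inputFileStats,
    inputNumCubes = 0,
    mergedFiles = inputFileStats,
    // There will be one z-cube for each partition
    numOutputCubes = optimizeStats.numPartitionsOptimized))
}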
@@ -365,27 +354,31 @@ class OptimizeExecutor(
    *
    * @param partitionsToCompact List of files to compact group by partition.
    *                            Partition is defined by the partition values (partCol -> partValue)
-   * @param maxTargetFileSize Max size (in bytes) of the compaction output file.
    * @return Sequence of bins. Each bin contains one or more files from the same
    *         partition and targeted for one output file.
    */
   private def groupFilesIntoBins(
-      partitionsToCompact: Seq[(Map[String, String], Seq[AddFile])],
-      maxTargetFileSize: Long): Seq[(Map[String, String], Seq[AddFile])] = {
+      partitionsToCompact: Seq[(Map[String, String], Seq[AddFile])])
+    : Seq[(Map[String, String], Seq[AddFile])] = {
+    val maxBinSize = optimizeStrategy.maxBinSize
     partitionsToCompact.flatMap {
       case (partition, files) =>
         val bins = new ArrayBuffer[Seq[AddFile]]()

         val currentBin = new ArrayBuffer[AddFile]()
         var currentBinSize = 0L

-        files.sortBy(_.size).foreach { file =>
+        val preparedFiles = optimizeStrategy.prepareFilesPerPartition(files)
+        preparedFiles.foreach { file =>
           // Generally, a bin is a group of existing files, whose total size does not exceed the
-          // desired maxFileSize. They will be coalesced into a single output file.
-          // However, if isMultiDimClustering = true, all files in a partition will be read by the
-          // same job, the data will be range-partitioned and numFiles = totalFileSize / maxFileSize
-          // will be produced. See below.
-          if (file.size + currentBinSize > maxTargetFileSize && !isMultiDimClustering) {
+          // desired maxBinSize. The output file size depends on the mode:
+          // 1. Compaction: files in a bin will be coalesced into a single output file.
+          // 2. ZOrder: all files in a partition will be read by the
+          //    same job, the data will be range-partitioned and
+          //    numFiles = totalFileSize / maxFileSize will be produced.
+          // 3. Clustering: files in a bin belong to one ZCUBE, the data will be
+          //    range-partitioned and numFiles = totalFileSize / maxFileSize will be produced.
+          if (file.size + currentBinSize > maxBinSize) {
             bins += currentBin.toVector
             currentBin.clear()
             currentBin += file
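The per-mode behaviour that was previously switched on isMultiDimClustering now comes from the two strategy hooks used above, maxBinSize and prepareFilesPerPartition. A sketch of plausible implementations for two of the modes, inferred from the deleted inline logic (the trait and class names are illustrative, and the unbounded Z-Order bin size is an assumption, not the actual value):

trait BinningHooks {
  // Upper bound on the total input size grouped into one bin.
  def maxBinSize: Long
  // Per-partition preprocessing before bin packing.
  def prepareFilesPerPartition(files: Seq[AddFile]): Seq[AddFile]
}

// Compaction: mirror the deleted code path, i.e. sort files by size and cap each
// bin at the target output file size.
class CompactionHooks(maxFileSize: Long) extends BinningHooks {
  override def maxBinSize: Long = maxFileSize
  override def prepareFilesPerPartition(files: Seq[AddFile]): Seq[AddFile] =
    files.sortBy(_.size)
}

// ZOrder: the deleted code ignored the size cap when multi-dimensional clustering
// was enabled, so an effectively unbounded bin keeps all files of a partition in
// one job (assumption based on that behaviour).
class ZOrderHooks extends BinningHooks {
  override def maxBinSize: Long = Long.MaxValue
  override def prepareFilesPerPartition(files: Seq[AddFile]): Seq[AddFile] = files
}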
@@ -431,7 +424,7 @@ class OptimizeExecutor(
           input,
           approxNumFiles,
           clusteringColumns,
-          curve)
+          optimizeStrategy.curve)
       } else {
         val useRepartition = sparkSession.sessionState.conf.getConf(
           DeltaSQLConf.DELTA_OPTIMIZE_REPARTITION_ENABLED)
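optimizeStrategy.curve takes over from the deleted curve val, which returned "zorder" when Z-Order columns were given and "hilbert" for clustered tables. A tiny sketch of how the strategies could expose it (the member placement and type names here are assumptions):

trait SpaceFillingCurve {
  def curve: String
}
object ZOrderCurve extends SpaceFillingCurve { override val curve: String = "zorder" }
object ClusteringCurve extends SpaceFillingCurve { override val curve: String = "hilbert" }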
@@ -450,13 +443,9 @@ class OptimizeExecutor(
       sparkSession.sparkContext.getLocalProperty(SPARK_JOB_GROUP_ID),
       description)

+    val binInfo = optimizeStrategy.initNewBin
     val addFiles = txn.writeFiles(repartitionDF, None, isOptimize = true, Nil).collect {
-      case a: AddFile =>
-        (if (isClusteredTable) {
-          a.copy(clusteringProvider = Some(ClusteredTableUtils.clusteringProvider))
-        } else {
-          a
-        }).copy(dataChange = false)
+      case a: AddFile => optimizeStrategy.tagAddFile(a, binInfo)
       case other =>
         throw new IllegalStateException(
           s"Unexpected action $other with type ${other.getClass}. File compaction job output " +
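initNewBin and tagAddFile absorb the inline copy calls that were deleted here. A sketch of how the clustering flavour of tagAddFile could reproduce them (the BinInfo case class and the per-bin ZCUBE id are assumptions; only the two copy calls come from the deleted code):

// Hypothetical per-bin info returned by initNewBin; the real type is not shown in this diff.
case class BinInfo(zCubeId: String)

def tagAddFileForClustering(file: AddFile, binInfo: BinInfo): AddFile = {
  // Mirrors the deleted branch for clustered tables: record the clustering provider
  // and mark the rewrite as a non-data-change operation. A real clustering strategy
  // would presumably also attach binInfo.zCubeId as a file tag (assumption).
  file
    .copy(clusteringProvider = Some(ClusteredTableUtils.clusteringProvider))
    .copy(dataChange = false)
}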