@@ -54,17 +54,28 @@ import org.apache.spark.util.Utils
5454/**
5555 * A class to help with comparing checkpoints with each other, where we may have had concurrent
5656 * writers that checkpoint with different number of parts.
57+ * The `numParts` field will be present only for multipart checkpoints (represented by
58+ * Format.WITH_PARTS).
59+ * The `fileName` field is present only for V2 Checkpoints (represented by Format.V2).
60+ * These additional fields are used as a tie breaker when comparing multiple checkpoint
61+ * instances of the same Format for the same `version`.
5762 */
5863case class CheckpointInstance (
5964 version : Long ,
6065 format : CheckpointInstance .Format ,
66+ fileName : Option [String ] = None ,
6167 numParts : Option [Int ] = None ) extends Ordered [CheckpointInstance ] {
6268
6369 // Assert that numParts are present when checkpoint format is Format.WITH_PARTS.
6470 // For other formats, numParts must be None.
6571 require((format == CheckpointInstance .Format .WITH_PARTS ) == numParts.isDefined,
6672 s " numParts ( $numParts) must be present for checkpoint format " +
6773 s " ${CheckpointInstance .Format .WITH_PARTS .name}" )
74+ // Assert that fileName is present only when checkpoint format is Format.V2.
75+ // For other formats, fileName must be None.
76+ require((format == CheckpointInstance .Format .V2 ) == fileName.isDefined,
77+ s " fileName ( $fileName) must be present for checkpoint format " +
78+ s " ${CheckpointInstance .Format .V2 .name}" )
6879
6980 /**
7081 * Returns a [[CheckpointProvider ]] which can tell the files corresponding to this
@@ -81,7 +92,26 @@ case class CheckpointInstance(
8192 val lastCheckpointInfo = lastCheckpointInfoHint.filter(cm => CheckpointInstance (cm) == this )
8293 val cpFiles = filterFiles(deltaLog, filesForCheckpointConstruction)
8394 format match {
84- case CheckpointInstance .Format .WITH_PARTS | CheckpointInstance .Format .SINGLE =>
95+ // Treat single file checkpoints also as V2 Checkpoints because we don't know if it is
96+ // actually a V2 checkpoint until we read it.
97+ case CheckpointInstance .Format .V2 | CheckpointInstance .Format .SINGLE =>
98+ assert(cpFiles.size == 1 )
99+ val fileStatus = cpFiles.head
100+ if (format == CheckpointInstance .Format .V2 ) {
101+ val hadoopConf = deltaLog.newDeltaHadoopConf()
102+ UninitializedV2CheckpointProvider (
103+ version,
104+ fileStatus,
105+ logPath,
106+ hadoopConf,
107+ deltaLog.options,
108+ deltaLog.store,
109+ lastCheckpointInfo)
110+ } else {
111+ UninitializedV1OrV2ParquetCheckpointProvider (
112+ version, fileStatus, logPath, lastCheckpointInfo)
113+ }
114+ case CheckpointInstance .Format .WITH_PARTS =>
85115 PreloadedCheckpointProvider (cpFiles, lastCheckpointInfo)
86116 case CheckpointInstance .Format .SENTINEL =>
87117 throw DeltaErrors .assertionFailedError(
@@ -93,6 +123,23 @@ case class CheckpointInstance(
93123 filesForCheckpointConstruction : Seq [FileStatus ]) : Seq [FileStatus ] = {
94124 val logPath = deltaLog.logPath
95125 format match {
// Single-file checkpoints are handled together with V2 checkpoints because we don't know
// whether a single file is actually a V2 checkpoint until we read it.
case format if format.usesSidecars =>
  // Name of the one top-level checkpoint file this instance refers to.
  val checkpointFileName = format match {
    // `fileName` is always defined for Format.V2 (enforced by the constructor `require`).
    case CheckpointInstance.Format.V2 => fileName.get
    case CheckpointInstance.Format.SINGLE => checkpointFileSingular(logPath, version).getName
    case other =>
      throw new IllegalStateException(s" Unknown checkpoint format $other supporting sidecars ")
  }
  val fileStatus = filesForCheckpointConstruction
    .find(_.getPath.getName == checkpointFileName)
    .getOrElse {
      // Report the resolved name rather than `fileName.get`: `fileName` is None for
      // Format.SINGLE, so `.get` would replace this error with a NoSuchElementException.
      throw new IllegalStateException(" Failed in getting the file information for:\n " +
        checkpointFileName + " \n among\n " +
        filesForCheckpointConstruction.map(_.getPath.getName).mkString(" -", " \n -", " "))
    }
  Seq(fileStatus)
96143 case CheckpointInstance .Format .WITH_PARTS | CheckpointInstance .Format .SINGLE =>
97144 val filePaths = if (format == CheckpointInstance .Format .WITH_PARTS ) {
98145 checkpointFileWithParts(logPath, version, numParts.get).toSet
@@ -119,28 +166,35 @@ case class CheckpointInstance(
119166 * Single part checkpoint.
120167 * 3. For Multi-part [[CheckpointInstance ]]s corresponding to same version, the one with more
121168 * parts is greater than the one with less parts.
169+ * 4. For V2 Checkpoints corresponding to same version, we use the fileName as tie breaker.
122170 */
override def compare(other: CheckpointInstance): Int = {
  // Keys are compared position by position, so `version` is decided first and
  // `fileName` is consulted last.
  val thisKey = (version, format, numParts, fileName)
  val otherKey = (other.version, other.format, other.numParts, other.fileName)
  thisKey compare otherKey
}
126175}
127176
128177object CheckpointInstance {
/**
 * Base class of the checkpoint formats. Formats are ordered by their `ordinal`.
 *
 * @param ordinal rank used by `compare`
 * @param name    label that `Format.unapply` matches against
 */
sealed abstract class Format(val ordinal: Int, val name: String) extends Ordered[Format] {
  override def compare(other: Format): Int = java.lang.Integer.compare(ordinal, other.ordinal)

  /** True exactly when this format mixes in [[FormatUsesSidecars]]. */
  def usesSidecars: Boolean = this match {
    case _: FormatUsesSidecars => true
    case _ => false
  }
}

/** Marker trait; `Format.usesSidecars` reports whether a format mixes it in. */
trait FormatUsesSidecars

object Format {
  /** single-file checkpoint format */
  object SINGLE extends Format(0, " SINGLE") with FormatUsesSidecars
  /** multi-file checkpoint format */
  object WITH_PARTS extends Format(1, " WITH_PARTS")
  /** V2 Checkpoint format */
  object V2 extends Format(2, " V2") with FormatUsesSidecars
  /** Sentinel, for internal use only */
  object SENTINEL extends Format(Int.MaxValue, " SENTINEL")

  /**
   * Looks a format up by its `name`. Only SINGLE, WITH_PARTS and V2 can be resolved;
   * any other string (including SENTINEL's name) yields None.
   */
  def unapply(name: String): Option[Format] =
    Seq[Format](SINGLE, WITH_PARTS, V2).find(_.name == name)
}
@@ -149,7 +203,14 @@ object CheckpointInstance {
149203 // Three formats to worry about:
150204 // * <version>.checkpoint.parquet
151205 // * <version>.checkpoint.<i>.<n>.parquet
206+ // * <version>.checkpoint.<u>.<json|parquet> where u is a unique string
152207 path.getName.split(" \\ ." ) match {
208+ case Array (v, " checkpoint" , uniqueStr, format) if Seq (" json" , " parquet" ).contains(format) =>
209+ CheckpointInstance (
210+ version = v.toLong,
211+ format = Format .V2 ,
212+ numParts = None ,
213+ fileName = Some (path.getName))
153214 case Array (v, " checkpoint" , " parquet" ) =>
154215 CheckpointInstance (v.toLong, Format .SINGLE , numParts = None )
155216 case Array (v, " checkpoint" , _, n, " parquet" ) =>
@@ -384,6 +445,8 @@ trait Checkpoints extends DeltaLogging {
384445 case CheckpointInstance .Format .WITH_PARTS =>
385446 assert(ci.numParts.nonEmpty, " Multi-Part Checkpoint must have non empty numParts" )
386447 matchingCheckpointInstances.length == ci.numParts.get
448+ case CheckpointInstance .Format .V2 =>
449+ matchingCheckpointInstances.length == 1
387450 case CheckpointInstance .Format .SENTINEL =>
388451 false
389452 }
0 commit comments