Add possibility to restore a Delta table to a given version or timestamp.

Examples:
    io.delta.tables.DeltaTable.forPath("/some_delta_path").restore(1)
    io.delta.tables.DeltaTable.forPath("/some_delta_path").restore(java.sql.Timestamp.valueOf("2021-01-01 00:00:00.000"))

Fixes #632

Signed-off-by: Maksym Dovhal <[email protected]>
Maksym Dovhal committed on Dec 12, 2021
1 parent 2ddff5e · commit fca9b7c
Showing 7 changed files with 391 additions and 5 deletions.
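For orientation, here is a minimal end-to-end sketch of the feature this commit adds. It assumes the restore(...) overloads shown in the commit message; the table path, the sample data, and the RestoreExample object name are invented for illustration.

import io.delta.tables.DeltaTable
import org.apache.spark.sql.SparkSession

object RestoreExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("restore-example")
      .master("local[*]")
      // Standard Delta Lake session configuration.
      .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog")
      .getOrCreate()
    import spark.implicits._

    val path = "/tmp/some_delta_path" // hypothetical location

    // Version 0: initial data; version 1: an overwrite that we then undo.
    Seq(1, 2, 3).toDF("id").write.format("delta").save(path)
    Seq(4, 5, 6).toDF("id").write.format("delta").mode("overwrite").save(path)

    // Roll the table back to version 0.
    DeltaTable.forPath(spark, path).restore(0)

    // The timestamp-based variant from the commit message:
    // DeltaTable.forPath(spark, path)
    //   .restore(java.sql.Timestamp.valueOf("2021-01-01 00:00:00.000"))

    spark.read.format("delta").load(path).show() // prints 1, 2, 3 again
    spark.stop()
  }
}

Because the restored files are re-committed with dataChange = true, the restore shows up as a new version on top of the log, so it can itself be undone by another restore.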
core/src/main/scala/org/apache/spark/sql/delta/commands/RestoreTableCommand.scala
184 changes: 184 additions & 0 deletions

@@ -0,0 +1,184 @@
/*
 * Copyright (2021) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.commands

import java.sql.Timestamp

import org.apache.spark.sql.delta._
import org.apache.spark.sql.delta.actions.{AddFile, RemoveFile}
import org.apache.spark.sql.delta.sources.DeltaSQLConf._
import org.apache.spark.sql.delta.util.DeltaFileOperations.absolutePath
import org.apache.spark.sql.execution.command.LeafRunnableCommand
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.internal.SQLConf.IGNORE_MISSING_FILES
import org.apache.spark.util.SerializableConfiguration

/**
 * Performs a restore of a Delta table to a specified version or timestamp.
 *
 * Algorithm:
 * 1) Read the latest snapshot of the table.
 * 2) Read the snapshot at the version or timestamp to restore.
 * 3) Compute the files present in the snapshot to restore (removed by some later commit)
 *    but missing from the latest snapshot. Add them to the commit as AddFile actions.
 * 4) Compute the files present in the latest snapshot (added after the version to restore)
 *    but missing from the snapshot to restore. Add them to the commit as RemoveFile actions.
 * 5) If the SQLConf.IGNORE_MISSING_FILES option is false (the default), check that every
 *    AddFile still exists in the file system.
 * 6) Commit the metadata, the Protocol, and all RemoveFile and AddFile actions
 *    to the delta log using `commitLarge`.
 * 7) If the table was modified in parallel, abandon the restore and raise an exception.
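 *
 * For example: if the snapshot at the restore version contains files {a, b, c}
 * and the latest snapshot contains {b, c, d}, the commit carries AddFile(a)
 * and RemoveFile(d); files b and c are left untouched.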
 */
case class RestoreTableCommand(
    deltaLog: DeltaLog,
    version: Option[Long],
    timestamp: Option[Timestamp]
) extends LeafRunnableCommand with DeltaCommand {

  override def run(spark: SparkSession): Seq[Row] = {
    recordDeltaOperation(deltaLog, "delta.restore") {

      require(version.isEmpty ^ timestamp.isEmpty,
        "Either the version or timestamp should be provided for restore")

      val parallelism = restoreParallelism(spark)
      val latestSnapshot = deltaLog.update()
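      // A timestamp is resolved to the version of the commit that was active
      // at that time; canReturnLastCommit = true lets a timestamp later than
      // the last commit resolve to the last commit instead of failing.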
      val versionToRestore = version.getOrElse(
        deltaLog
          .history
          .getActiveCommitAtTime(timestamp.get, canReturnLastCommit = true)
          .version)

      require(versionToRestore < latestSnapshot.version,
        s"Version to restore ($versionToRestore) should be less than " +
          s"the last available version (${latestSnapshot.version})")

      val snapshotToRestore = deltaLog.getSnapshotAt(versionToRestore)
      val latestSnapshotFiles = latestSnapshot.allFiles
      val snapshotToRestoreFiles = snapshotToRestore.allFiles

      import spark.implicits._
      import collection.JavaConverters._

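      // The anti-join on `path` selects files present at the restore version
      // but absent from the latest snapshot; they are re-added below. The
      // mirror-image join for filesToRemove appears further down.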
      val filesToAdd = snapshotToRestoreFiles
        .join(
          latestSnapshotFiles,
          snapshotToRestoreFiles("path") === latestSnapshotFiles("path"),
          "left_anti")
        .as[AddFile]
        .map(_.copy(dataChange = true))
        .repartition(parallelism)
        .cache() // To avoid Dataset recompute for each partition of toLocalIterator()

      checkSnapshotFilesAvailability(deltaLog, filesToAdd, versionToRestore)

      val filesToRemove = latestSnapshotFiles
        .join(
          snapshotToRestoreFiles,
          latestSnapshotFiles("path") === snapshotToRestoreFiles("path"),
          "left_anti")
        .as[AddFile]
        .map(_.removeWithTimestamp())
        .repartition(parallelism)
        .cache() // To avoid Dataset recompute for each partition of toLocalIterator()

      // Commit files, metrics, protocol and metadata to the delta log
      deltaLog.withNewTransaction { txn =>
        val metrics = computeMetrics(filesToAdd, filesToRemove, snapshotToRestore)
        val addActions = filesToAdd.toLocalIterator().asScala
        val removeActions = filesToRemove.toLocalIterator().asScala

        txn.updateMetadata(snapshotToRestore.metadata)

        commitLarge(
          spark,
          txn,
          addActions ++ removeActions,
          DeltaOperations.Restore(version, timestamp.map(_.getTime)),
          Map.empty,
          metrics)
      }
      filesToAdd.unpersist()
      filesToRemove.unpersist()

      Seq.empty[Row]
    }
  }

  private def restoreParallelism(spark: SparkSession): Int = spark
    .sessionState
    .conf
    .getConf(DELTA_RESTORE_PARALLELISM)

  private def computeMetrics(
      toAdd: Dataset[AddFile],
      toRemove: Dataset[RemoveFile],
      snapshot: Snapshot): Map[String, String] = {
    import toAdd.sparkSession.implicits._
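
    // count("size") yields the number of files and sum("size") the total bytes;
    // the sum is None when the Dataset is empty, hence the Option[Long] and
    // getOrElse(0) below.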
    val (numRestoredFiles, restoredFilesSize) = toAdd
      .agg("size" -> "count", "size" -> "sum").as[(Long, Option[Long])].head()

    val (numRemovedFiles, removedFilesSize) = toRemove
      .agg("size" -> "count", "size" -> "sum").as[(Long, Option[Long])].head()

    Map(
      "numRestoredFiles" -> numRestoredFiles,
      "restoredFilesSize" -> restoredFilesSize.getOrElse(0),
      "numRemovedFiles" -> numRemovedFiles,
      "removedFilesSize" -> removedFilesSize.getOrElse(0),
      "numOfFilesAfterRestore" -> snapshot.numOfFiles,
      "tableSizeAfterRestore" -> snapshot.sizeInBytes
    ).mapValues(_.toString).toMap
  }

  /* Prevents users from restoring to a table version with missing
   * data files (manually deleted or vacuumed). Partially restoring to such a
   * version is still possible if spark.sql.files.ignoreMissingFiles is set to true.
   */
  private def checkSnapshotFilesAvailability(
      deltaLog: DeltaLog, files: Dataset[AddFile], version: Long): Unit = {

    implicit val spark: SparkSession = files.sparkSession
    val ignore = spark
      .sessionState
      .conf
      .getConf(IGNORE_MISSING_FILES)

    if (!ignore) {
      val path = deltaLog.dataPath
      val hadoopConf = spark.sparkContext.broadcast(
        new SerializableConfiguration(deltaLog.newDeltaHadoopConf()))

      import spark.implicits._
      val missedFiles = files
        .repartition(restoreParallelism(spark))
        .mapPartitions { files =>
          val fs = path.getFileSystem(hadoopConf.value.value)
          val pathStr = path.toUri.getPath
          files.filterNot(f => fs.exists(absolutePath(pathStr, f.path)))
        }
        .map(_.path)
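        // Collect at most the first 100 missing paths for the error message.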
        .head(100)

      if (missedFiles.nonEmpty) {
        throw DeltaErrors.restoreMissedDataFilesError(missedFiles, version)
      }
    }
  }
}
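Grounded in the signature above, a sketch of invoking the command directly (normally it would be reached through the DeltaTable API from the commit message; the table path here is again hypothetical):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.delta.DeltaLog
import org.apache.spark.sql.delta.commands.RestoreTableCommand

val spark: SparkSession = SparkSession.active
val deltaLog = DeltaLog.forTable(spark, "/tmp/some_delta_path")

// Exactly one of version/timestamp may be set, per the require() in run().
RestoreTableCommand(deltaLog, version = Some(1L), timestamp = None).run(spark)

Running inside deltaLog.withNewTransaction is what backs step 7 of the algorithm: if another writer commits between deltaLog.update() and commitLarge, the commit fails with an exception rather than silently overwriting the concurrent change.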