public class RemoveOrphanFilesAction
extends java.lang.Object
An action that removes orphan metadata and data files by listing a given location and comparing the actual files in that location with the data and metadata files referenced by all valid snapshots. The location must be accessible for listing via the Hadoop FileSystem.
By default, this action cleans up the table location returned by Table.location() and
removes unreachable files that are older than 3 days using Table.io(). The behavior can be modified
by passing a custom location to location(String) and a custom timestamp to olderThan(long).
For example, someone might point this action to the data folder to clean up only orphan data files.
In addition, there is a way to configure an alternative delete method via deleteWith(Consumer).
Note: It is dangerous to call this action with a short retention interval as it might corrupt the state of the table if another operation is writing at the same time.
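
For example, a typical invocation might look like the following. This is only a minimal sketch: it assumes the action is obtained through the older Spark Actions entry point (Actions.forTable), which is not documented on this page, and loadTable() is a hypothetical helper standing in for however the Iceberg Table is loaded from a catalog.

```java
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.Actions;
import org.apache.spark.sql.SparkSession;

public class RemoveOrphanFilesExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("remove-orphan-files")
        .getOrCreate();

    Table table = loadTable();  // hypothetical helper: load the table from your catalog

    // Remove files in the table location that are not reachable from any valid
    // snapshot and are older than 7 days (instead of the 3-day default).
    List<String> removed = Actions.forTable(spark, table)
        .removeOrphanFiles()
        .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7))
        .execute();

    removed.forEach(path -> System.out.println("Removed: " + path));
  }

  private static Table loadTable() {
    throw new UnsupportedOperationException("load an Iceberg Table from a catalog here");
  }
}
```
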
| Modifier and Type | Method and Description |
|---|---|
| protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> | buildManifestFileDF(org.apache.spark.sql.SparkSession spark, java.lang.String tableName) |
| protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> | buildManifestListDF(org.apache.spark.sql.SparkSession spark, java.lang.String metadataFileLocation) |
| protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> | buildManifestListDF(org.apache.spark.sql.SparkSession spark, Table table) |
| protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> | buildOtherMetadataFileDF(org.apache.spark.sql.SparkSession spark, TableOperations ops) |
| protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> | buildValidDataFileDF(org.apache.spark.sql.SparkSession spark) |
| protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> | buildValidDataFileDF(org.apache.spark.sql.SparkSession spark, java.lang.String tableName) |
| protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> | buildValidMetadataFileDF(org.apache.spark.sql.SparkSession spark, Table table, TableOperations ops) |
| RemoveOrphanFilesAction | deleteWith(java.util.function.Consumer<java.lang.String> newDeleteFunc): Passes an alternative delete implementation that will be used to delete orphan files. |
| java.util.List<java.lang.String> | execute(): Executes this action. |
| protected static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> | loadMetadataTable(org.apache.spark.sql.SparkSession spark, java.lang.String tableName, java.lang.String tableLocation, MetadataTableType type) |
| RemoveOrphanFilesAction | location(java.lang.String newLocation): Removes orphan files in the given location. |
| RemoveOrphanFilesAction | olderThan(long newOlderThanTimestamp): Removes orphan files that are older than the given timestamp. |
| protected Table | table() |
protected Table table()

public RemoveOrphanFilesAction location(java.lang.String newLocation)
Removes orphan files in the given location.
Parameters:
newLocation - a location

public RemoveOrphanFilesAction olderThan(long newOlderThanTimestamp)
Removes orphan files that are older than the given timestamp.
Parameters:
newOlderThanTimestamp - a timestamp in milliseconds

public RemoveOrphanFilesAction deleteWith(java.util.function.Consumer<java.lang.String> newDeleteFunc)
Passes an alternative delete implementation that will be used to delete orphan files.
Parameters:
newDeleteFunc - a delete function

public java.util.List<java.lang.String> execute()
Executes this action.
Specified by:
execute in interface Action

protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> buildValidDataFileDF(org.apache.spark.sql.SparkSession spark)
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> buildValidDataFileDF(org.apache.spark.sql.SparkSession spark,
java.lang.String tableName)
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> buildManifestFileDF(org.apache.spark.sql.SparkSession spark,
java.lang.String tableName)
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> buildManifestListDF(org.apache.spark.sql.SparkSession spark,
Table table)
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> buildManifestListDF(org.apache.spark.sql.SparkSession spark,
java.lang.String metadataFileLocation)
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> buildOtherMetadataFileDF(org.apache.spark.sql.SparkSession spark,
TableOperations ops)
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> buildValidMetadataFileDF(org.apache.spark.sql.SparkSession spark,
Table table,
TableOperations ops)
protected static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> loadMetadataTable(org.apache.spark.sql.SparkSession spark,
java.lang.String tableName,
java.lang.String tableLocation,
MetadataTableType type)
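
The deleteWith hook also makes it easy to preview what would be removed. The sketch below substitutes a collecting consumer for the delete implementation so that execute() only reports candidate files. It is a sketch under assumptions: the import package for RemoveOrphanFilesAction, the "/data" sub-folder layout, and the possibility that the delete function is invoked from multiple threads are all assumptions on my part, not statements from this page.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.TimeUnit;

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.RemoveOrphanFilesAction;

public class OrphanFileDryRun {
  // Reports the orphan files under the table's data folder without deleting anything
  // by substituting a collecting consumer for the real delete method.
  public static List<String> listCandidates(RemoveOrphanFilesAction action, Table table) {
    // Thread-safe in case the action calls the delete function from multiple threads (assumption).
    ConcurrentLinkedQueue<String> candidates = new ConcurrentLinkedQueue<>();

    action
        .location(table.location() + "/data")                               // scan only the data folder (assumed layout)
        .olderThan(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(7))  // keep anything newer than 7 days
        .deleteWith(candidates::add)                                        // record the path instead of deleting it
        .execute();

    return new ArrayList<>(candidates);
  }
}
```
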