public class DeleteOrphanFilesSparkAction extends java.lang.Object implements DeleteOrphanFiles

An action that removes orphan files by listing a given location and comparing the actual files in that location with the files referenced by table metadata. The location must be accessible for listing via the Hadoop FileSystem.
By default, this action cleans up the table location returned by Table.location()
and
removes unreachable files that are older than 3 days using Table.io()
. The behavior can
be modified by passing a custom location to location(String)
and a custom timestamp to olderThan(long)
. For example, someone might point this action to the data folder to clean up
only orphan data files.
Configure an alternative delete method using deleteWith(Consumer)
.
For full control of the set of files being evaluated, use the compareToFileList(Dataset)
argument. This skips the directory listing - any files in the
dataset provided which are not found in table metadata will be deleted, using the same Table.location()
and olderThan(long)
filtering as above.
Note: It is dangerous to call this action with a short retention interval as it might corrupt the state of the table if another operation is writing at the same time.
Modifier and Type | Class and Description |
---|---|
static class |
DeleteOrphanFilesSparkAction.FileURI |
DeleteOrphanFiles.PrefixMismatchMode, DeleteOrphanFiles.Result
Modifier and Type | Field and Description |
---|---|
protected static org.apache.iceberg.relocated.com.google.common.base.Joiner |
COMMA_JOINER |
protected static org.apache.iceberg.relocated.com.google.common.base.Splitter |
COMMA_SPLITTER |
protected static java.lang.String |
FILE_PATH |
protected static java.lang.String |
LAST_MODIFIED |
protected static java.lang.String |
MANIFEST |
protected static java.lang.String |
MANIFEST_LIST |
protected static java.lang.String |
OTHERS |
Modifier and Type | Method and Description |
---|---|
protected org.apache.spark.sql.Dataset<FileInfo> |
allReachableOtherMetadataFileDS(Table table) |
DeleteOrphanFilesSparkAction |
compareToFileList(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> files) |
protected org.apache.spark.sql.Dataset<FileInfo> |
contentFileDS(Table table) |
protected org.apache.spark.sql.Dataset<FileInfo> |
contentFileDS(Table table,
java.util.Set<java.lang.Long> snapshotIds) |
protected org.apache.iceberg.spark.actions.BaseSparkAction.DeleteSummary |
deleteFiles(java.util.concurrent.ExecutorService executorService,
java.util.function.Consumer<java.lang.String> deleteFunc,
java.util.Iterator<FileInfo> files)
Deletes files and keeps track of how many files were removed for each file type.
|
DeleteOrphanFilesSparkAction |
deleteWith(java.util.function.Consumer<java.lang.String> newDeleteFunc)
Passes an alternative delete implementation that will be used for orphan files.
|
DeleteOrphanFilesSparkAction |
equalAuthorities(java.util.Map<java.lang.String,java.lang.String> newEqualAuthorities)
Passes authorities that should be considered equal.
|
DeleteOrphanFilesSparkAction |
equalSchemes(java.util.Map<java.lang.String,java.lang.String> newEqualSchemes)
Passes schemes that should be considered equal.
|
DeleteOrphanFiles.Result |
execute()
Executes this action.
|
DeleteOrphanFilesSparkAction |
executeDeleteWith(java.util.concurrent.ExecutorService executorService)
Passes an alternative executor service that will be used for removing orphaned files.
|
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
loadMetadataTable(Table table,
MetadataTableType type) |
DeleteOrphanFilesSparkAction |
location(java.lang.String newLocation)
Passes a location which should be scanned for orphan files.
|
protected org.apache.spark.sql.Dataset<FileInfo> |
manifestDS(Table table) |
protected org.apache.spark.sql.Dataset<FileInfo> |
manifestDS(Table table,
java.util.Set<java.lang.Long> snapshotIds) |
protected org.apache.spark.sql.Dataset<FileInfo> |
manifestListDS(Table table) |
protected org.apache.spark.sql.Dataset<FileInfo> |
manifestListDS(Table table,
java.util.Set<java.lang.Long> snapshotIds) |
protected JobGroupInfo |
newJobGroupInfo(java.lang.String groupId,
java.lang.String desc) |
protected Table |
newStaticTable(TableMetadata metadata,
FileIO io) |
DeleteOrphanFilesSparkAction |
olderThan(long newOlderThanTimestamp)
Removes orphan files only if they are older than the given timestamp.
|
ThisT |
option(java.lang.String name,
java.lang.String value) |
protected java.util.Map<java.lang.String,java.lang.String> |
options() |
ThisT |
options(java.util.Map<java.lang.String,java.lang.String> newOptions) |
protected org.apache.spark.sql.Dataset<FileInfo> |
otherMetadataFileDS(Table table) |
DeleteOrphanFilesSparkAction |
prefixMismatchMode(DeleteOrphanFiles.PrefixMismatchMode newPrefixMismatchMode)
Passes a prefix mismatch mode that determines how this action should handle situations when the
metadata references files that match listed/provided files except for authority/scheme.
|
protected DeleteOrphanFilesSparkAction |
self() |
protected org.apache.spark.sql.SparkSession |
spark() |
protected org.apache.spark.api.java.JavaSparkContext |
sparkContext() |
protected <T> T |
withJobGroupInfo(JobGroupInfo info,
java.util.function.Supplier<T> supplier) |
protected static final java.lang.String MANIFEST
protected static final java.lang.String MANIFEST_LIST
protected static final java.lang.String OTHERS
protected static final java.lang.String FILE_PATH
protected static final java.lang.String LAST_MODIFIED
protected static final org.apache.iceberg.relocated.com.google.common.base.Splitter COMMA_SPLITTER
protected static final org.apache.iceberg.relocated.com.google.common.base.Joiner COMMA_JOINER
protected DeleteOrphanFilesSparkAction self()
public DeleteOrphanFilesSparkAction executeDeleteWith(java.util.concurrent.ExecutorService executorService)
DeleteOrphanFiles
If this method is not called, orphaned manifests and data files will still be deleted in the current thread.
executeDeleteWith
in interface DeleteOrphanFiles
executorService
- the service to use

public DeleteOrphanFilesSparkAction prefixMismatchMode(DeleteOrphanFiles.PrefixMismatchMode newPrefixMismatchMode)
DeleteOrphanFiles
Possible values are "ERROR", "IGNORE", "DELETE". The default mismatch mode is "ERROR", which
means an exception is thrown whenever there is a mismatch in authority/scheme. It's the
recommended mismatch mode and should be changed only in some rare circumstances. If there is a
mismatch, use DeleteOrphanFiles.equalSchemes(Map)
and DeleteOrphanFiles.equalAuthorities(Map)
to resolve
conflicts by providing equivalent schemes and authorities. If it is impossible to determine
whether the conflicting authorities/schemes are equal, set the prefix mismatch mode to "IGNORE"
to skip files with mismatches. If you have manually inspected all conflicting
authorities/schemes, provided equivalent schemes/authorities and are absolutely confident the
remaining ones are different, set the prefix mismatch mode to "DELETE" to consider files with
mismatches as orphan. It will be impossible to recover files after deletion, so the "DELETE"
prefix mismatch mode must be used with extreme caution.
prefixMismatchMode
in interface DeleteOrphanFiles
newPrefixMismatchMode
- mode for handling prefix mismatches

public DeleteOrphanFilesSparkAction equalSchemes(java.util.Map<java.lang.String,java.lang.String> newEqualSchemes)
DeleteOrphanFiles
The key may include a comma-separated list of schemes. For instance, Map("s3a,s3,s3n", "s3").
equalSchemes
in interface DeleteOrphanFiles
newEqualSchemes
- list of equal schemes

public DeleteOrphanFilesSparkAction equalAuthorities(java.util.Map<java.lang.String,java.lang.String> newEqualAuthorities)
DeleteOrphanFiles
The key may include a comma-separated list of authorities. For instance, Map("s1name,s2name", "servicename").
equalAuthorities
in interface DeleteOrphanFiles
newEqualAuthorities
- list of equal authorities

public DeleteOrphanFilesSparkAction location(java.lang.String newLocation)
DeleteOrphanFiles
If not set, the root table location will be scanned potentially removing both orphan data and metadata files.
location
in interface DeleteOrphanFiles
newLocation
- the location in which to look for orphan files

public DeleteOrphanFilesSparkAction olderThan(long newOlderThanTimestamp)
DeleteOrphanFiles
This is a safety measure to avoid removing files that are being added to the table. For example, there may be a concurrent operation adding new files while this action searches for orphan files. New files may not be referenced by the metadata yet but they are not orphan.
If not set, defaults to a timestamp 3 days ago.
olderThan
in interface DeleteOrphanFiles
newOlderThanTimestamp
- a long timestamp, as returned by System.currentTimeMillis()
public DeleteOrphanFilesSparkAction deleteWith(java.util.function.Consumer<java.lang.String> newDeleteFunc)
DeleteOrphanFiles
This method allows users to customize the delete func. For example, one may set a custom delete func and collect all orphan files into a set instead of physically removing them.
If not set, defaults to using the table's io
implementation.
deleteWith
in interface DeleteOrphanFiles
newDeleteFunc
- a function that will be called to delete files

public DeleteOrphanFilesSparkAction compareToFileList(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> files)
public DeleteOrphanFiles.Result execute()
Action
execute
in interface Action<DeleteOrphanFiles,DeleteOrphanFiles.Result>
protected org.apache.spark.sql.SparkSession spark()
protected org.apache.spark.api.java.JavaSparkContext sparkContext()
public ThisT option(java.lang.String name, java.lang.String value)
public ThisT options(java.util.Map<java.lang.String,java.lang.String> newOptions)
protected java.util.Map<java.lang.String,java.lang.String> options()
protected <T> T withJobGroupInfo(JobGroupInfo info, java.util.function.Supplier<T> supplier)
protected JobGroupInfo newJobGroupInfo(java.lang.String groupId, java.lang.String desc)
protected Table newStaticTable(TableMetadata metadata, FileIO io)
protected org.apache.spark.sql.Dataset<FileInfo> contentFileDS(Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> manifestDS(Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> manifestListDS(Table table, java.util.Set<java.lang.Long> snapshotIds)
protected org.apache.spark.sql.Dataset<FileInfo> otherMetadataFileDS(Table table)
protected org.apache.spark.sql.Dataset<FileInfo> allReachableOtherMetadataFileDS(Table table)
protected org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> loadMetadataTable(Table table, MetadataTableType type)
protected org.apache.iceberg.spark.actions.BaseSparkAction.DeleteSummary deleteFiles(java.util.concurrent.ExecutorService executorService, java.util.function.Consumer<java.lang.String> deleteFunc, java.util.Iterator<FileInfo> files)
executorService
- an executor service to use for parallel deletes

deleteFunc
- a delete func

files
- an iterator of Spark rows of the structure (path: String, type: String)