-
Notifications
You must be signed in to change notification settings - Fork 242
Support bulk deletion in batch file cleanup task #1179
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -77,17 +77,12 @@ public boolean handleTask(TaskEntity task, CallContext callContext) { | |
missingFiles.size()); | ||
} | ||
|
||
// Schedule the deletion for each file asynchronously | ||
List<CompletableFuture<Void>> deleteFutures = | ||
validFiles.stream() | ||
.map(file -> super.tryDelete(tableId, authorizedFileIO, null, file, null, 1)) | ||
.toList(); | ||
CompletableFuture<Void> deleteFutures = | ||
tryDelete( | ||
tableId, authorizedFileIO, validFiles, cleanupTask.type().getValue(), true, null, 1); | ||
|
||
try { | ||
// Wait for all delete operations to finish | ||
CompletableFuture<Void> allDeletes = | ||
CompletableFuture.allOf(deleteFutures.toArray(new CompletableFuture[0])); | ||
allDeletes.join(); | ||
deleteFutures.join(); | ||
} catch (Exception e) { | ||
LOGGER.error("Exception detected during batch files deletion", e); | ||
return false; | ||
|
@@ -97,5 +92,25 @@ public boolean handleTask(TaskEntity task, CallContext callContext) { | |
} | ||
} | ||
|
||
public record BatchFileCleanupTask(TableIdentifier tableId, List<String> batchFiles) {} | ||
public enum BatchFileType { | ||
TABLE_METADATA("table_metadata"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not completely sure here, so am asking for context: why did we introduce this enum type here instead of keeping it as a record? Is this going to be extensible for something in the immediate future? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The record in line 114 is still there — we added this enum to specify what kind of file the BatchFileCleanupTask is cleaning up. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That makes a lot more sense :) Thanks! |
||
|
||
private final String value; | ||
|
||
BatchFileType(String value) { | ||
this.value = value; | ||
} | ||
|
||
public String getValue() { | ||
return value; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return value; | ||
} | ||
} | ||
|
||
public record BatchFileCleanupTask( | ||
TableIdentifier tableId, List<String> batchFiles, BatchFileType type) {} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ | |
import java.util.concurrent.CompletableFuture; | ||
import java.util.concurrent.ExecutorService; | ||
import java.util.concurrent.TimeUnit; | ||
import org.apache.iceberg.CatalogUtil; | ||
import org.apache.iceberg.catalog.TableIdentifier; | ||
import org.apache.iceberg.io.FileIO; | ||
import org.apache.polaris.core.context.CallContext; | ||
|
@@ -29,9 +30,12 @@ | |
import org.slf4j.LoggerFactory; | ||
|
||
/** | ||
* {@link FileCleanupTaskHandler} responsible for cleaning up files in table tasks. Handles retries | ||
* for file deletions and skips files that are already missing. Subclasses must implement | ||
* task-specific logic. | ||
* Abstract base class for handling file cleanup tasks within Apache Polaris. | ||
* | ||
* <p>This class is for performing asynchronous file deletions with retry logic. | ||
* | ||
* <p>Subclasses must implement {@link #canHandleTask(TaskEntity)} and {@link | ||
* #handleTask(TaskEntity, CallContext)} to define task-specific handling logic. | ||
*/ | ||
public abstract class FileCleanupTaskHandler implements TaskHandler { | ||
|
||
|
@@ -53,6 +57,19 @@ public FileCleanupTaskHandler( | |
@Override | ||
public abstract boolean handleTask(TaskEntity task, CallContext callContext); | ||
|
||
/** | ||
* Attempts to delete a single file with retry logic. If the file does not exist, it logs a | ||
* message and does not retry. If an error occurs, it retries up to {@link #MAX_ATTEMPTS} times | ||
* before failing. | ||
* | ||
* @param tableId The identifier of the table associated with the file. | ||
* @param fileIO The {@link FileIO} instance used for file operations. | ||
* @param baseFile An optional base file associated with the file being deleted (can be null). | ||
* @param file The path of the file to be deleted. | ||
* @param e The exception from the previous attempt, if any. | ||
* @param attempt The current retry attempt count. | ||
* @return A {@link CompletableFuture} representing the asynchronous deletion operation. | ||
*/ | ||
public CompletableFuture<Void> tryDelete( | ||
TableIdentifier tableId, | ||
FileIO fileIO, | ||
|
@@ -103,4 +120,53 @@ public CompletableFuture<Void> tryDelete( | |
CompletableFuture.delayedExecutor( | ||
FILE_DELETION_RETRY_MILLIS, TimeUnit.MILLISECONDS, executorService)); | ||
} | ||
|
||
/** | ||
* Attempts to delete multiple files in a batch operation with retry logic. If an error occurs, it | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This LGTM, but it's important to note that we may not retry in the event that the service dies. Eventually, we should have Polaris try to drain the task queue for any tasks that failed the first time they were run There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that's a good catch, do you mind if I follow up with a new PR to do this, it might involve refactoring on existing delete task There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @eric-maynard - I was looking at the currently existent There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
No @adnanhemani , current PR mainly focusing on provide more efficient bulk deletion, but can not guarantee task got eventually executed. For Eric's comment, we have a plan to fix that, please refer to #774 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @danielhumanmod - my question was that does the current |
||
* retries up to {@link #MAX_ATTEMPTS} times before failing. | ||
* | ||
* @param tableId The identifier of the table associated with the files. | ||
* @param fileIO The {@link FileIO} instance used for file operations. | ||
* @param files The list of file paths to be deleted. | ||
* @param type The type of files being deleted (e.g., data files, metadata files). | ||
* @param isConcurrent Whether the deletion should be performed concurrently. | ||
* @param e The exception from the previous attempt, if any. | ||
* @param attempt The current retry attempt count. | ||
* @return A {@link CompletableFuture} representing the asynchronous batch deletion operation. | ||
*/ | ||
public CompletableFuture<Void> tryDelete( | ||
TableIdentifier tableId, | ||
FileIO fileIO, | ||
Iterable<String> files, | ||
String type, | ||
Boolean isConcurrent, | ||
Throwable e, | ||
int attempt) { | ||
if (e != null && attempt <= MAX_ATTEMPTS) { | ||
LOGGER | ||
.atWarn() | ||
.addKeyValue("files", files) | ||
.addKeyValue("attempt", attempt) | ||
.addKeyValue("error", e.getMessage()) | ||
.addKeyValue("type", type) | ||
.log("Error encountered attempting to delete files"); | ||
} | ||
if (attempt > MAX_ATTEMPTS && e != null) { | ||
return CompletableFuture.failedFuture(e); | ||
} | ||
return CompletableFuture.runAsync( | ||
() -> CatalogUtil.deleteFiles(fileIO, files, type, isConcurrent), executorService) | ||
.exceptionallyComposeAsync( | ||
newEx -> { | ||
LOGGER | ||
.atWarn() | ||
.addKeyValue("files", files) | ||
.addKeyValue("tableIdentifier", tableId) | ||
.addKeyValue("type", type) | ||
.log("Exception caught deleting data files", newEx); | ||
return tryDelete(tableId, fileIO, files, type, isConcurrent, newEx, attempt + 1); | ||
}, | ||
CompletableFuture.delayedExecutor( | ||
FILE_DELETION_RETRY_MILLIS, TimeUnit.MILLISECONDS, executorService)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: this is now a single future, so maybe we should name this
deleteFuture
instead