Skip to content

Commit 4769feb

Browse files
HADOOP-19554. LocalDirAllocator still doesn't always recover from directory deletion (#7651)
The LocalDIrManager class used to allocate disk space for HDFS block storage and cloud connector buffers should now recover from directory deletion on all codepaths. This situation can occur for the cloud buffer uses where cron jobs can clean up temp directories on a schedule. There is also significantly better logging of the allocation process in the case of failure: all steps attempted before concluding there was no disk space will be logged at error on the failing host. Contributed by Steve Loughran
1 parent 47ad1c0 commit 4769feb

File tree

2 files changed

+129
-33
lines changed

2 files changed

+129
-33
lines changed

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/LocalDirAllocator.java

+94-24
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,10 @@
6565
@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
6666
@InterfaceStability.Unstable
6767
public class LocalDirAllocator {
68-
68+
69+
static final String E_NO_SPACE_AVAILABLE =
70+
"No space available in any of the local directories";
71+
6972
//A Map from the config item names like "mapred.local.dir"
7073
//to the instance of the AllocatorPerContext. This
7174
//is a static object to make sure there exists exactly one instance per JVM
@@ -384,6 +387,24 @@ int getCurrentDirectoryIndex() {
384387
return currentContext.get().dirNumLastAccessed.get();
385388
}
386389

390+
/**
391+
* Format a string, log at debug and append it to the history as a new line.
392+
*
393+
* @param history history to fill in
394+
* @param fmt format string
395+
* @param args varags
396+
*/
397+
private void note(StringBuilder history, String fmt, Object... args) {
398+
try {
399+
final String s = String.format(fmt, args);
400+
history.append(s).append("\n");
401+
LOG.debug(s);
402+
} catch (Exception e) {
403+
// some resilience in case the format string is wrong
404+
LOG.debug(fmt, e);
405+
}
406+
}
407+
387408
/** Get a path from the local FS. If size is known, we go
388409
* round-robin over the set of disks (via the configured dirs) and return
389410
* the first complete path which has enough space.
@@ -393,6 +414,12 @@ int getCurrentDirectoryIndex() {
393414
*/
394415
public Path getLocalPathForWrite(String pathStr, long size,
395416
Configuration conf, boolean checkWrite) throws IOException {
417+
418+
// history is built up and logged at error if the alloc
419+
StringBuilder history = new StringBuilder();
420+
421+
note(history, "Searchng for a directory for file at %s, size = %,d; checkWrite=%s",
422+
pathStr, size, checkWrite);
396423
Context ctx = confChanged(conf);
397424
int numDirs = ctx.localDirs.length;
398425
int numDirsSearched = 0;
@@ -406,27 +433,56 @@ public Path getLocalPathForWrite(String pathStr, long size,
406433
pathStr = pathStr.substring(1);
407434
}
408435
Path returnPath = null;
409-
410-
if(size == SIZE_UNKNOWN) { //do roulette selection: pick dir with probability
411-
//proportional to available size
412-
long[] availableOnDisk = new long[ctx.dirDF.length];
413-
long totalAvailable = 0;
414-
415-
//build the "roulette wheel"
416-
for(int i =0; i < ctx.dirDF.length; ++i) {
417-
final DF target = ctx.dirDF[i];
418-
// attempt to recreate the dir so that getAvailable() is valid
419-
// if it fails, getAvailable() will return 0, so the dir will
420-
// be declared unavailable.
421-
// return value is logged at debug to keep spotbugs quiet.
422-
final boolean b = new File(target.getDirPath()).mkdirs();
423-
LOG.debug("mkdirs of {}={}", target, b);
424-
availableOnDisk[i] = target.getAvailable();
436+
437+
final int dirCount = ctx.dirDF.length;
438+
long[] availableOnDisk = new long[dirCount];
439+
long totalAvailable = 0;
440+
441+
StringBuilder pathNames = new StringBuilder();
442+
443+
//build the "roulette wheel"
444+
for (int i =0; i < dirCount; ++i) {
445+
final DF target = ctx.dirDF[i];
446+
// attempt to recreate the dir so that getAvailable() is valid
447+
// if it fails, getAvailable() will return 0, so the dir will
448+
// be declared unavailable.
449+
// return value is logged at debug to keep spotbugs quiet.
450+
final String name = target.getDirPath();
451+
pathNames.append(" ").append(name);
452+
final File dirPath = new File(name);
453+
454+
// existence probe
455+
if (!dirPath.exists()) {
456+
LOG.debug("creating buffer dir {}", name);
457+
if (dirPath.mkdirs()) {
458+
note(history, "Created buffer dir %s", name);
459+
} else {
460+
note(history, "Failed to create buffer dir %s", name);
461+
}
462+
}
463+
464+
// path already existed or the mkdir call had an outcome
465+
// make sure the path is present and a dir, and if so add its availability
466+
if (dirPath.isDirectory()) {
467+
final long available = target.getAvailable();
468+
availableOnDisk[i] = available;
469+
note(history, "%s available under path %s", pathStr, available);
425470
totalAvailable += availableOnDisk[i];
471+
} else {
472+
note(history, "%s does not exist/is not a directory", pathStr);
426473
}
474+
}
475+
476+
note(history, "Directory count is %d; total available capacity is %,d{}",
477+
dirCount, totalAvailable);
427478

428-
if (totalAvailable == 0){
429-
throw new DiskErrorException("No space available in any of the local directories.");
479+
if (size == SIZE_UNKNOWN) {
480+
//do roulette selection: pick dir with probability
481+
// proportional to available size
482+
note(history, "Size not specified, so picking at random.");
483+
484+
if (totalAvailable == 0) {
485+
throw new DiskErrorException(E_NO_SPACE_AVAILABLE + pathNames + "; history=" + history);
430486
}
431487

432488
// Keep rolling the wheel till we get a valid path
@@ -439,14 +495,19 @@ public Path getLocalPathForWrite(String pathStr, long size,
439495
dir++;
440496
}
441497
ctx.dirNumLastAccessed.set(dir);
442-
returnPath = createPath(ctx.localDirs[dir], pathStr, checkWrite);
498+
final Path localDir = ctx.localDirs[dir];
499+
returnPath = createPath(localDir, pathStr, checkWrite);
443500
if (returnPath == null) {
444501
totalAvailable -= availableOnDisk[dir];
445502
availableOnDisk[dir] = 0; // skip this disk
446503
numDirsSearched++;
504+
note(history, "No capacity in %s", localDir);
505+
} else {
506+
note(history, "Allocated file %s in %s", returnPath, localDir);
447507
}
448508
}
449509
} else {
510+
note(history, "Size is %,d; searching", size);
450511
// Start linear search with random increment if possible
451512
int randomInc = 1;
452513
if (numDirs > 2) {
@@ -459,17 +520,22 @@ public Path getLocalPathForWrite(String pathStr, long size,
459520
maxCapacity = capacity;
460521
}
461522
if (capacity > size) {
523+
final Path localDir = ctx.localDirs[dirNum];
462524
try {
463-
returnPath = createPath(ctx.localDirs[dirNum], pathStr,
464-
checkWrite);
525+
returnPath = createPath(localDir, pathStr, checkWrite);
465526
} catch (IOException e) {
466527
errorText = e.getMessage();
467528
diskException = e;
468-
LOG.debug("DiskException caught for dir {}", ctx.localDirs[dirNum], e);
529+
note(history, "Exception while creating path %s: %s", localDir, errorText);
530+
LOG.debug("DiskException caught for dir {}", localDir, e);
469531
}
470532
if (returnPath != null) {
533+
// success
471534
ctx.getAndIncrDirNumLastAccessed(numDirsSearched);
535+
note(history, "Allocated file %s in %s", returnPath, localDir);
472536
break;
537+
} else {
538+
note(history, "No capacity in %s", localDir);
473539
}
474540
}
475541
dirNum++;
@@ -484,10 +550,14 @@ public Path getLocalPathForWrite(String pathStr, long size,
484550
//no path found
485551
String newErrorText = "Could not find any valid local directory for " +
486552
pathStr + " with requested size " + size +
487-
" as the max capacity in any directory is " + maxCapacity;
553+
" as the max capacity in any directory"
554+
+ " (" + pathNames + " )"
555+
+ " is " + maxCapacity;
488556
if (errorText != null) {
489557
newErrorText = newErrorText + " due to " + errorText;
490558
}
559+
LOG.error(newErrorText);
560+
LOG.error(history.toString());
491561
throw new DiskErrorException(newErrorText, diskException);
492562
}
493563

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestLocalDirAllocator.java

+35-9
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,16 @@
2626
import java.util.NoSuchElementException;
2727

2828
import org.apache.hadoop.conf.Configuration;
29-
import org.apache.hadoop.test.LambdaTestUtils;
3029
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
3130
import org.apache.hadoop.util.Shell;
3231

32+
import org.assertj.core.api.Assertions;
3333
import org.junit.jupiter.api.Timeout;
3434
import org.junit.jupiter.params.ParameterizedTest;
3535
import org.junit.jupiter.params.provider.MethodSource;
3636

37+
import static org.apache.hadoop.fs.LocalDirAllocator.E_NO_SPACE_AVAILABLE;
38+
import static org.apache.hadoop.test.LambdaTestUtils.intercept;
3739
import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
3840
import static org.junit.jupiter.api.Assertions.assertEquals;
3941
import static org.junit.jupiter.api.Assertions.assertFalse;
@@ -564,13 +566,8 @@ public void testGetLocalPathForWriteForInvalidPaths(String paramRoot, String par
564566
throws Exception {
565567
initTestLocalDirAllocator(paramRoot, paramPrefix);
566568
conf.set(CONTEXT, " ");
567-
try {
568-
dirAllocator.getLocalPathForWrite("/test", conf);
569-
fail("not throwing the exception");
570-
} catch (IOException e) {
571-
assertEquals("No space available in any of the local directories.",
572-
e.getMessage(), "Incorrect exception message");
573-
}
569+
intercept(IOException.class, E_NO_SPACE_AVAILABLE, () ->
570+
dirAllocator.getLocalPathForWrite("/test", conf));
574571
}
575572

576573
/**
@@ -587,10 +584,13 @@ public void testGetLocalPathForWriteForLessSpace(String paramRoot, String paramP
587584
String dir0 = buildBufferDir(root, 0);
588585
String dir1 = buildBufferDir(root, 1);
589586
conf.set(CONTEXT, dir0 + "," + dir1);
590-
LambdaTestUtils.intercept(DiskErrorException.class,
587+
final DiskErrorException ex = intercept(DiskErrorException.class,
591588
String.format("Could not find any valid local directory for %s with requested size %s",
592589
"p1/x", Long.MAX_VALUE - 1), "Expect a DiskErrorException.",
593590
() -> dirAllocator.getLocalPathForWrite("p1/x", Long.MAX_VALUE - 1, conf));
591+
Assertions.assertThat(ex.getMessage())
592+
.contains(new File(dir0).getName())
593+
.contains(new File(dir1).getName());
594594
}
595595

596596
/**
@@ -614,5 +614,31 @@ public void testDirectoryRecovery(String paramRoot, String paramPrefix) throws T
614614
// and expect to get a new file back
615615
dirAllocator.getLocalPathForWrite("file2", -1, conf);
616616
}
617+
618+
619+
/**
620+
* Test for HADOOP-19554. LocalDirAllocator still doesn't always recover
621+
* from directory tree deletion.
622+
*/
623+
@Timeout(value = 30)
624+
@MethodSource("params")
625+
@ParameterizedTest
626+
public void testDirectoryRecoveryKnownSize(String paramRoot, String paramPrefix) throws Throwable {
627+
initTestLocalDirAllocator(paramRoot, paramPrefix);
628+
String dir0 = buildBufferDir(root, 0);
629+
String subdir = dir0 + "/subdir1/subdir2";
630+
631+
conf.set(CONTEXT, subdir);
632+
// get local path and an ancestor
633+
final Path pathForWrite = dirAllocator.getLocalPathForWrite("file", 512, conf);
634+
final Path ancestor = pathForWrite.getParent().getParent();
635+
636+
// delete that ancestor
637+
localFs.delete(ancestor, true);
638+
// and expect to get a new file back
639+
dirAllocator.getLocalPathForWrite("file2", -1, conf);
640+
}
641+
642+
617643
}
618644

0 commit comments

Comments
 (0)