Skip to content

Commit 81e4848

Browse files
HADOOP-19554. LocalDirAllocator still doesn't always recover from directory deletion (#7683)
The LocalDIrManager class used to allocate disk space for HDFS block storage and cloud connector buffers should now recover from directory deletion on all codepaths. This situation can occur in cloud connectors uses when cron jobs clean up temp directories on a schedule. There is also significantly better logging of the allocation process failures. Contributed by Steve Loughran
1 parent cd68d13 commit 81e4848

File tree

2 files changed

+141
-35
lines changed

2 files changed

+141
-35
lines changed

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/LocalDirAllocator.java

Lines changed: 107 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
import org.slf4j.Logger;
3232
import org.slf4j.LoggerFactory;
3333

34+
import static org.apache.hadoop.net.NetUtils.getHostname;
35+
3436
/** An implementation of a round-robin scheme for disk allocation for creating
3537
* files. The way it works is that it is kept track what disk was last
3638
* allocated for a file write. For the current request, the next disk from
@@ -65,7 +67,10 @@
6567
@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
6668
@InterfaceStability.Unstable
6769
public class LocalDirAllocator {
68-
70+
71+
static final String E_NO_SPACE_AVAILABLE =
72+
"No space available in any of the local directories";
73+
6974
//A Map from the config item names like "mapred.local.dir"
7075
//to the instance of the AllocatorPerContext. This
7176
//is a static object to make sure there exists exactly one instance per JVM
@@ -384,6 +389,24 @@ int getCurrentDirectoryIndex() {
384389
return currentContext.get().dirNumLastAccessed.get();
385390
}
386391

392+
/**
393+
* Format a string, log at debug and append it to the history as a new line.
394+
*
395+
* @param history history to fill in
396+
* @param fmt format string
397+
* @param args varags
398+
*/
399+
private void note(StringBuilder history, String fmt, Object... args) {
400+
try {
401+
final String s = String.format(fmt, args);
402+
history.append(s).append("\n");
403+
LOG.debug(s);
404+
} catch (Exception e) {
405+
// some resilience in case the format string is wrong
406+
LOG.debug(fmt, e);
407+
}
408+
}
409+
387410
/** Get a path from the local FS. If size is known, we go
388411
* round-robin over the set of disks (via the configured dirs) and return
389412
* the first complete path which has enough space.
@@ -393,6 +416,12 @@ int getCurrentDirectoryIndex() {
393416
*/
394417
public Path getLocalPathForWrite(String pathStr, long size,
395418
Configuration conf, boolean checkWrite) throws IOException {
419+
420+
// history is built up and logged at error if the alloc
421+
StringBuilder history = new StringBuilder();
422+
423+
note(history, "Searching for a directory for file \"%s\", size = %,d; checkWrite=%s",
424+
pathStr, size, checkWrite);
396425
Context ctx = confChanged(conf);
397426
int numDirs = ctx.localDirs.length;
398427
int numDirsSearched = 0;
@@ -406,27 +435,62 @@ public Path getLocalPathForWrite(String pathStr, long size,
406435
pathStr = pathStr.substring(1);
407436
}
408437
Path returnPath = null;
409-
410-
if(size == SIZE_UNKNOWN) { //do roulette selection: pick dir with probability
411-
//proportional to available size
412-
long[] availableOnDisk = new long[ctx.dirDF.length];
413-
long totalAvailable = 0;
414-
415-
//build the "roulette wheel"
416-
for(int i =0; i < ctx.dirDF.length; ++i) {
417-
final DF target = ctx.dirDF[i];
418-
// attempt to recreate the dir so that getAvailable() is valid
419-
// if it fails, getAvailable() will return 0, so the dir will
420-
// be declared unavailable.
421-
// return value is logged at debug to keep spotbugs quiet.
422-
final boolean b = new File(target.getDirPath()).mkdirs();
423-
LOG.debug("mkdirs of {}={}", target, b);
424-
availableOnDisk[i] = target.getAvailable();
438+
439+
final int dirCount = ctx.dirDF.length;
440+
long[] availableOnDisk = new long[dirCount];
441+
long totalAvailable = 0;
442+
443+
StringBuilder pathNames = new StringBuilder();
444+
445+
//build the "roulette wheel"
446+
for (int i =0; i < dirCount; ++i) {
447+
final DF target = ctx.dirDF[i];
448+
// attempt to recreate the dir so that getAvailable() is valid
449+
// if it fails, getAvailable() will return 0, so the dir will
450+
// be declared unavailable.
451+
// return value is logged at debug to keep spotbugs quiet.
452+
final String name = target.getDirPath();
453+
pathNames.append(" ").append(name);
454+
final File dirPath = new File(name);
455+
456+
// existence probe with directory recreation
457+
if (!dirPath.exists()) {
458+
LOG.debug("Creating buffer dir {}", name);
459+
if (dirPath.mkdirs()) {
460+
note(history, "Created buffer dir %s", name);
461+
} else {
462+
note(history, "Failed to create buffer dir %s", name);
463+
}
464+
}
465+
466+
// path already existed or the mkdir call had an outcome
467+
// make sure the path is present and a dir, and if so add its availability
468+
if (dirPath.isDirectory()) {
469+
final long available = target.getAvailable();
470+
availableOnDisk[i] = available;
471+
note(history, "%,d bytes available under path %s", available, name);
425472
totalAvailable += availableOnDisk[i];
473+
} else {
474+
note(history, "%s does not exist/is not a directory", name);
426475
}
476+
}
427477

428-
if (totalAvailable == 0){
429-
throw new DiskErrorException("No space available in any of the local directories.");
478+
note(history, "Directory count is %d; total available capacity is %,d",
479+
dirCount, totalAvailable);
480+
481+
if (size == SIZE_UNKNOWN) {
482+
//do roulette selection: pick dir with probability
483+
// proportional to available size
484+
note(history, "Size not specified, so picking directories at random.");
485+
486+
if (totalAvailable == 0) {
487+
// log error and history
488+
String newErrorText = E_NO_SPACE_AVAILABLE + pathNames
489+
+ " on host" + getHostname();
490+
LOG.error(newErrorText);
491+
LOG.error(history.toString());
492+
// then raise the exception
493+
throw new DiskErrorException(newErrorText);
430494
}
431495

432496
// Keep rolling the wheel till we get a valid path
@@ -439,14 +503,20 @@ public Path getLocalPathForWrite(String pathStr, long size,
439503
dir++;
440504
}
441505
ctx.dirNumLastAccessed.set(dir);
442-
returnPath = createPath(ctx.localDirs[dir], pathStr, checkWrite);
506+
final Path localDir = ctx.localDirs[dir];
507+
returnPath = createPath(localDir, pathStr, checkWrite);
443508
if (returnPath == null) {
444509
totalAvailable -= availableOnDisk[dir];
445510
availableOnDisk[dir] = 0; // skip this disk
446511
numDirsSearched++;
512+
note(history, "No capacity in %s", localDir);
513+
} else {
514+
note(history, "Allocated file %s in %s", returnPath, localDir);
447515
}
448516
}
449517
} else {
518+
note(history, "Requested file size is %,d; searching for a suitable directory",
519+
size);
450520
// Start linear search with random increment if possible
451521
int randomInc = 1;
452522
if (numDirs > 2) {
@@ -459,17 +529,22 @@ public Path getLocalPathForWrite(String pathStr, long size,
459529
maxCapacity = capacity;
460530
}
461531
if (capacity > size) {
532+
final Path localDir = ctx.localDirs[dirNum];
462533
try {
463-
returnPath = createPath(ctx.localDirs[dirNum], pathStr,
464-
checkWrite);
534+
returnPath = createPath(localDir, pathStr, checkWrite);
465535
} catch (IOException e) {
466536
errorText = e.getMessage();
467537
diskException = e;
468-
LOG.debug("DiskException caught for dir {}", ctx.localDirs[dirNum], e);
538+
note(history, "Exception while creating path %s: %s", localDir, errorText);
539+
LOG.debug("DiskException caught for dir {}", localDir, e);
469540
}
470541
if (returnPath != null) {
542+
// success
471543
ctx.getAndIncrDirNumLastAccessed(numDirsSearched);
544+
note(history, "Allocated file %s in %s", returnPath, localDir);
472545
break;
546+
} else {
547+
note(history, "No capacity in %s", localDir);
473548
}
474549
}
475550
dirNum++;
@@ -482,12 +557,18 @@ public Path getLocalPathForWrite(String pathStr, long size,
482557
}
483558

484559
//no path found
485-
String newErrorText = "Could not find any valid local directory for " +
486-
pathStr + " with requested size " + size +
487-
" as the max capacity in any directory is " + maxCapacity;
560+
String hostname = getHostname();
561+
String newErrorText = "Could not find any valid local directory for "
562+
+ pathStr + " with requested size " + size
563+
+ " on host " + hostname
564+
+ " as the max capacity in any directory"
565+
+ " (" + pathNames + " )"
566+
+ " is " + maxCapacity;
488567
if (errorText != null) {
489568
newErrorText = newErrorText + " due to " + errorText;
490569
}
570+
LOG.error(newErrorText);
571+
LOG.error(history.toString());
491572
throw new DiskErrorException(newErrorText, diskException);
492573
}
493574

hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestLocalDirAllocator.java

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,16 @@
2626
import java.util.NoSuchElementException;
2727

2828
import org.apache.hadoop.conf.Configuration;
29-
import org.apache.hadoop.test.LambdaTestUtils;
3029
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
3130
import org.apache.hadoop.util.Shell;
3231

32+
import org.assertj.core.api.Assertions;
3333
import org.junit.jupiter.api.Timeout;
3434
import org.junit.jupiter.params.ParameterizedTest;
3535
import org.junit.jupiter.params.provider.MethodSource;
3636

37+
import static org.apache.hadoop.fs.LocalDirAllocator.E_NO_SPACE_AVAILABLE;
38+
import static org.apache.hadoop.test.LambdaTestUtils.intercept;
3739
import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
3840
import static org.junit.jupiter.api.Assertions.assertEquals;
3941
import static org.junit.jupiter.api.Assertions.assertFalse;
@@ -564,13 +566,8 @@ public void testGetLocalPathForWriteForInvalidPaths(String paramRoot, String par
564566
throws Exception {
565567
initTestLocalDirAllocator(paramRoot, paramPrefix);
566568
conf.set(CONTEXT, " ");
567-
try {
568-
dirAllocator.getLocalPathForWrite("/test", conf);
569-
fail("not throwing the exception");
570-
} catch (IOException e) {
571-
assertEquals("No space available in any of the local directories.",
572-
e.getMessage(), "Incorrect exception message");
573-
}
569+
intercept(IOException.class, E_NO_SPACE_AVAILABLE, () ->
570+
dirAllocator.getLocalPathForWrite("/test", conf));
574571
}
575572

576573
/**
@@ -587,10 +584,13 @@ public void testGetLocalPathForWriteForLessSpace(String paramRoot, String paramP
587584
String dir0 = buildBufferDir(root, 0);
588585
String dir1 = buildBufferDir(root, 1);
589586
conf.set(CONTEXT, dir0 + "," + dir1);
590-
LambdaTestUtils.intercept(DiskErrorException.class,
587+
final DiskErrorException ex = intercept(DiskErrorException.class,
591588
String.format("Could not find any valid local directory for %s with requested size %s",
592589
"p1/x", Long.MAX_VALUE - 1), "Expect a DiskErrorException.",
593590
() -> dirAllocator.getLocalPathForWrite("p1/x", Long.MAX_VALUE - 1, conf));
591+
Assertions.assertThat(ex.getMessage())
592+
.contains(new File(dir0).getName())
593+
.contains(new File(dir1).getName());
594594
}
595595

596596
/**
@@ -614,5 +614,30 @@ public void testDirectoryRecovery(String paramRoot, String paramPrefix) throws T
614614
// and expect to get a new file back
615615
dirAllocator.getLocalPathForWrite("file2", -1, conf);
616616
}
617+
618+
/**
619+
* Test for HADOOP-19554. LocalDirAllocator still doesn't always recover
620+
* from directory tree deletion.
621+
*/
622+
@Timeout(value = 30)
623+
@MethodSource("params")
624+
@ParameterizedTest
625+
public void testDirectoryRecoveryKnownSize(String paramRoot, String paramPrefix) throws Throwable {
626+
initTestLocalDirAllocator(paramRoot, paramPrefix);
627+
String dir0 = buildBufferDir(root, 0);
628+
String subdir = dir0 + "/subdir1/subdir2";
629+
630+
conf.set(CONTEXT, subdir);
631+
// get local path and an ancestor
632+
final Path pathForWrite = dirAllocator.getLocalPathForWrite("file", 512, conf);
633+
final Path ancestor = pathForWrite.getParent().getParent();
634+
635+
// delete that ancestor
636+
localFs.delete(ancestor, true);
637+
// and expect to get a new file back
638+
dirAllocator.getLocalPathForWrite("file2", -1, conf);
639+
}
640+
641+
617642
}
618643

0 commit comments

Comments
 (0)