Skip to content

Commit 76b511b

Browse files
committed
HBASE-29216 Recovered replication stuck when hbase.separate.oldlogdir.by.regionserver is enabled
1 parent a8ea489 commit 76b511b

File tree

2 files changed

+78
-5
lines changed

2 files changed

+78
-5
lines changed

hbase-server/src/main/java/org/apache/hadoop/hbase/wal/AbstractFSWALProvider.java

+55-4
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,16 @@
1818
package org.apache.hadoop.hbase.wal;
1919

2020
import java.io.IOException;
21+
import java.io.UnsupportedEncodingException;
22+
import java.net.URLDecoder;
2123
import java.util.ArrayList;
2224
import java.util.Collections;
2325
import java.util.Comparator;
2426
import java.util.List;
2527
import java.util.concurrent.atomic.AtomicBoolean;
2628
import java.util.concurrent.locks.ReadWriteLock;
2729
import java.util.concurrent.locks.ReentrantReadWriteLock;
30+
import java.util.regex.Matcher;
2831
import java.util.regex.Pattern;
2932
import org.apache.hadoop.conf.Configuration;
3033
import org.apache.hadoop.fs.FileSystem;
@@ -36,6 +39,7 @@
3639
import org.apache.hadoop.hbase.client.RegionInfo;
3740
import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
3841
import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
42+
import org.apache.hadoop.hbase.util.Addressing;
3943
import org.apache.hadoop.hbase.util.CancelableProgressable;
4044
import org.apache.hadoop.hbase.util.CommonFSUtils;
4145
import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils;
@@ -45,6 +49,7 @@
4549
import org.slf4j.LoggerFactory;
4650

4751
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
52+
import static org.apache.hadoop.hbase.ServerName.SERVERNAME_SEPARATOR;
4853

4954
/**
5055
* Base class of a WAL Provider that returns a single thread safe WAL that writes to Hadoop FS. By
@@ -243,6 +248,17 @@ static void requestLogRoll(final WAL wal) {
243248

244249
// should be package private; more visible for use in AbstractFSWAL
245250
public static final String WAL_FILE_NAME_DELIMITER = ".";
251+
252+
/**
253+
* Pattern used to parse server name from WAL file name
254+
* see {@link #getServerNameFromWALFileName(Path)}
255+
*/
256+
public static final Pattern SERVERNAME_IN_FILE_PATTERN =
257+
Pattern.compile("[^" + SERVERNAME_SEPARATOR + "]+" +
258+
SERVERNAME_SEPARATOR + Addressing.VALID_PORT_REGEX +
259+
SERVERNAME_SEPARATOR + Addressing.VALID_PORT_REGEX +
260+
"[^" + WAL_FILE_NAME_DELIMITER + "]+");
261+
246262
/** The hbase:meta region's WAL filename extension */
247263
public static final String META_WAL_PROVIDER_ID = ".meta";
248264
static final String DEFAULT_PROVIDER_ID = "default";
@@ -404,6 +420,39 @@ public static ServerName getServerNameFromWALDirectoryName(Path logFile) {
404420
return serverName;
405421
}
406422

423+
/**
424+
* This function returns region server name from a log file name which is in one of the following
425+
* formats:
426+
* <ul>
427+
* <li>hdfs://&lt;name node&gt;/hbase/oldWALs/hostname%2C22101%2C1487767381290.providerId.regiongroup-0.1487785392316</li>
428+
* <li>hdfs://&lt;name node&gt;/hbase/oldWALs/hostname%2C22101%2C1487767381290.1487785392316</li>
429+
* </ul>
430+
* @return null if the passed in logFile isn't a valid WAL file path
431+
*/
432+
public static ServerName getServerNameFromWALFileName(Path logFile) {
433+
String fileName = logFile.getName();
434+
// We were passed the directory and not a file in it.
435+
ServerName serverName = null;
436+
try {
437+
fileName = URLDecoder.decode(fileName, HConstants.UTF8_ENCODING);
438+
Matcher matcher = SERVERNAME_IN_FILE_PATTERN.matcher(fileName);
439+
if (!matcher.find()) {
440+
throw new IllegalArgumentException("Cannot parse a server name form filename=" + fileName);
441+
}
442+
String strServerName = matcher.group(0);
443+
serverName = ServerName.parseServerName(strServerName);
444+
} catch (IllegalArgumentException | IllegalStateException | UnsupportedEncodingException ex) {
445+
serverName = null;
446+
LOG.warn("Cannot parse a server name from path={}", logFile, ex);
447+
}
448+
if (serverName != null && serverName.getStartCode() < 0) {
449+
LOG.warn("Invalid log file path={}, start code {} is less than 0", logFile,
450+
serverName.getStartCode());
451+
serverName = null;
452+
}
453+
return serverName;
454+
}
455+
407456
public static boolean isMetaFile(Path p) {
408457
return isMetaFile(p.getName());
409458
}
@@ -448,10 +497,6 @@ public static boolean isArchivedLogFile(Path p) {
448497
* @throws IOException exception
449498
*/
450499
public static Path findArchivedLog(Path path, Configuration conf) throws IOException {
451-
// If the path contains oldWALs keyword then exit early.
452-
if (path.toString().contains(HConstants.HREGION_OLDLOGDIR_NAME)) {
453-
return null;
454-
}
455500
Path walRootDir = CommonFSUtils.getWALRootDir(conf);
456501
FileSystem fs = path.getFileSystem(conf);
457502
// Try finding the log in old dir
@@ -463,6 +508,12 @@ public static Path findArchivedLog(Path path, Configuration conf) throws IOExcep
463508
}
464509

465510
ServerName serverName = getServerNameFromWALDirectoryName(path);
511+
if (serverName == null) {
512+
// try to parse server name from wal file name.
513+
LOG.warn("Parse server name from wal directory failed,"
514+
+ " try to parse from wal filename");
515+
serverName = getServerNameFromWALFileName(path);
516+
}
466517
if (serverName == null) {
467518
LOG.warn("Can not extract server name from path {}, "
468519
+ "give up searching the separated old log dir", path);

hbase-server/src/test/java/org/apache/hadoop/hbase/replication/regionserver/TestReplicationSource.java

+23-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.hadoop.hbase.replication.regionserver;
1919

2020
import static org.apache.hadoop.hbase.wal.AbstractFSWALProvider.META_WAL_PROVIDER_ID;
21+
import static org.apache.hadoop.hbase.wal.AbstractFSWALProvider.SEPARATE_OLDLOGDIR;
2122
import static org.junit.Assert.assertEquals;
2223
import static org.junit.Assert.assertFalse;
2324
import static org.junit.Assert.assertNotNull;
@@ -305,14 +306,32 @@ public void testTerminateClearsBuffer() throws Exception {
305306
}
306307

307308
/**
308-
* Tests that recovered queues are preserved on a regionserver shutdown. See HBASE-18192
309+
* Tests that recovered queues are preserved on a regionserver shutdown.
310+
* See HBASE-18192
309311
*/
310312
@Test
311313
public void testServerShutdownRecoveredQueue() throws Exception {
314+
// Test "hbase.separate.oldlogdir.by.regionserver" = disabled
315+
testServerShutdownRecovered(false);
316+
}
317+
318+
/**
319+
* Tests that recovered queues are preserved on a regionserver shutdown with
320+
* "hbase.separate.oldlogdir.by.regionserver" enabled.
321+
* See HBASE-29216
322+
*/
323+
@Test
324+
public void testServerShutdownRecoveredQueueWithSeparateOldDir() throws Exception {
325+
// Test "hbase.separate.oldlogdir.by.regionserver" = enabled
326+
testServerShutdownRecovered(true);
327+
}
328+
329+
private void testServerShutdownRecovered(boolean separateOldWalDir) throws Exception {
312330
try {
313331
// Ensure single-threaded WAL
314332
conf.set("hbase.wal.provider", "defaultProvider");
315333
conf.setInt("replication.sleep.before.failover", 2000);
334+
conf.setBoolean(SEPARATE_OLDLOGDIR, separateOldWalDir);
316335
// Introduces a delay in regionserver shutdown to give the race condition a chance to kick in.
317336
conf.set(HConstants.REGION_SERVER_IMPL, ShutdownDelayRegionServer.class.getName());
318337
MiniHBaseCluster cluster = TEST_UTIL.startMiniCluster(2);
@@ -376,6 +395,9 @@ public boolean evaluate() throws Exception {
376395
(Waiter.Predicate<Exception>) () -> managerC.getOldSources().size() == 0);
377396
} finally {
378397
conf.set(HConstants.REGION_SERVER_IMPL, HRegionServer.class.getName());
398+
conf.setBoolean(SEPARATE_OLDLOGDIR, false);
399+
TEST_UTIL.shutdownMiniCluster();
400+
TEST_UTIL_PEER.shutdownMiniCluster();
379401
}
380402
}
381403

0 commit comments

Comments
 (0)