Skip to content

Commit

Permalink
HBASE-27579 CatalogJanitor can cause data loss due to errors during c…
Browse files Browse the repository at this point in the history
…leanMergeRegion (#4986)

Signed-off-by: Andrew Purtell <[email protected]>
Signed-off-by: Viraj Jasani <[email protected]>
Signed-off-by: Duo Zhang <[email protected]>
  • Loading branch information
bbeaudreault committed Jan 20, 2023
1 parent 8fc3ef3 commit c6a1722
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.ScheduledChore;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
Expand Down Expand Up @@ -183,12 +182,12 @@ public int scan() throws IOException {
for (Map.Entry<RegionInfo, Result> e : mergedRegions.entrySet()) {
if (this.services.isInMaintenanceMode()) {
// Stop cleaning if the master is in maintenance mode
LOG.debug("In maintenence mode, not cleaning");
LOG.debug("In maintenance mode, not cleaning");
break;
}

List<RegionInfo> parents = MetaTableAccessor.getMergeRegions(e.getValue().rawCells());
if (parents != null && cleanMergeRegion(e.getKey(), parents)) {
if (parents != null && cleanMergeRegion(this.services, e.getKey(), parents)) {
gcs++;
}
}
Expand All @@ -201,7 +200,7 @@ public int scan() throws IOException {
if (this.services.isInMaintenanceMode()) {
// Stop cleaning if the master is in maintenance mode
if (LOG.isDebugEnabled()) {
LOG.debug("In maintenence mode, not cleaning");
LOG.debug("In maintenance mode, not cleaning");
}
break;
}
Expand Down Expand Up @@ -248,30 +247,24 @@ public CatalogJanitorReport getLastReport() {
* @return true if we delete references in merged region on hbase:meta and archive the files on
* the file system
*/
private boolean cleanMergeRegion(final RegionInfo mergedRegion, List<RegionInfo> parents)
throws IOException {
static boolean cleanMergeRegion(MasterServices services, final RegionInfo mergedRegion,
List<RegionInfo> parents) throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("Cleaning merged region {}", mergedRegion);
}
FileSystem fs = this.services.getMasterFileSystem().getFileSystem();
Path rootdir = this.services.getMasterFileSystem().getRootDir();
Path tabledir = CommonFSUtils.getTableDir(rootdir, mergedRegion.getTable());
TableDescriptor htd = getDescriptor(mergedRegion.getTable());
HRegionFileSystem regionFs = null;
try {
regionFs = HRegionFileSystem.openRegionFromFileSystem(this.services.getConfiguration(), fs,
tabledir, mergedRegion, true);
} catch (IOException e) {
LOG.warn("Merged region does not exist: " + mergedRegion.getEncodedName());
}
if (regionFs == null || !regionFs.hasReferences(htd)) {

Pair<Boolean, Boolean> result =
checkRegionReferences(services, mergedRegion.getTable(), mergedRegion);

if (hasNoReferences(result)) {
if (LOG.isDebugEnabled()) {
LOG.debug(
"Deleting parents ({}) from fs; merged child {} no longer holds references", parents
.stream().map(r -> RegionInfo.getShortNameToLog(r)).collect(Collectors.joining(", ")),
mergedRegion);
}
ProcedureExecutor<MasterProcedureEnv> pe = this.services.getMasterProcedureExecutor();

ProcedureExecutor<MasterProcedureEnv> pe = services.getMasterProcedureExecutor();
GCMultipleMergedRegionsProcedure mergeRegionProcedure =
new GCMultipleMergedRegionsProcedure(pe.getEnvironment(), mergedRegion, parents);
pe.submitProcedure(mergeRegionProcedure);
Expand All @@ -280,7 +273,15 @@ private boolean cleanMergeRegion(final RegionInfo mergedRegion, List<RegionInfo>
mergedRegion);
}
return true;
} else {
if (LOG.isDebugEnabled()) {
LOG.debug(
"Deferring cleanup up of {} parents of merged region {}, because references "
+ "still exist in merged region or we encountered an exception in checking",
parents.size(), mergedRegion.getEncodedName());
}
}

return false;
}

Expand Down Expand Up @@ -332,8 +333,10 @@ static boolean cleanParent(MasterServices services, RegionInfo parent, Result ro
}
// Run checks on each daughter split.
PairOfSameType<RegionInfo> daughters = MetaTableAccessor.getDaughterRegions(rowContent);
Pair<Boolean, Boolean> a = checkDaughterInFs(services, parent, daughters.getFirst());
Pair<Boolean, Boolean> b = checkDaughterInFs(services, parent, daughters.getSecond());
Pair<Boolean, Boolean> a =
checkRegionReferences(services, parent.getTable(), daughters.getFirst());
Pair<Boolean, Boolean> b =
checkRegionReferences(services, parent.getTable(), daughters.getSecond());
if (hasNoReferences(a) && hasNoReferences(b)) {
String daughterA =
daughters.getFirst() != null ? daughters.getFirst().getShortNameToLog() : "null";
Expand Down Expand Up @@ -386,59 +389,45 @@ private static boolean hasNoReferences(final Pair<Boolean, Boolean> p) {
}

/**
* Checks if a daughter region -- either splitA or splitB -- still holds references to parent.
* @param parent Parent region
* @param daughter Daughter region
* @return A pair where the first boolean says whether or not the daughter region directory exists
* in the filesystem and then the second boolean says whether the daughter has references
* to the parent.
* Checks if a region still holds references to parent.
* @param tableName The table for the region
* @param region The region to check
* @return A pair where the first boolean says whether the region directory exists in the
* filesystem and then the second boolean says whether the region has references to a
* parent.
*/
private static Pair<Boolean, Boolean> checkDaughterInFs(MasterServices services,
final RegionInfo parent, final RegionInfo daughter) throws IOException {
if (daughter == null) {
private static Pair<Boolean, Boolean> checkRegionReferences(MasterServices services,
TableName tableName, RegionInfo region) throws IOException {
if (region == null) {
return new Pair<>(Boolean.FALSE, Boolean.FALSE);
}

FileSystem fs = services.getMasterFileSystem().getFileSystem();
Path rootdir = services.getMasterFileSystem().getRootDir();
Path tabledir = CommonFSUtils.getTableDir(rootdir, daughter.getTable());

Path daughterRegionDir = new Path(tabledir, daughter.getEncodedName());

HRegionFileSystem regionFs;
Path tabledir = CommonFSUtils.getTableDir(rootdir, tableName);
Path regionDir = new Path(tabledir, region.getEncodedName());

try {
if (!CommonFSUtils.isExists(fs, daughterRegionDir)) {
if (!CommonFSUtils.isExists(fs, regionDir)) {
return new Pair<>(Boolean.FALSE, Boolean.FALSE);
}
} catch (IOException ioe) {
LOG.error("Error trying to determine if daughter region exists, "
+ "assuming exists and has references", ioe);
LOG.error("Error trying to determine if region exists, assuming exists and has references",
ioe);
return new Pair<>(Boolean.TRUE, Boolean.TRUE);
}

boolean references = false;
TableDescriptor parentDescriptor = services.getTableDescriptors().get(parent.getTable());
TableDescriptor tableDescriptor = services.getTableDescriptors().get(tableName);
try {
regionFs = HRegionFileSystem.openRegionFromFileSystem(services.getConfiguration(), fs,
tabledir, daughter, true);

for (ColumnFamilyDescriptor family : parentDescriptor.getColumnFamilies()) {
references = regionFs.hasReferences(family.getNameAsString());
if (references) {
break;
}
}
HRegionFileSystem regionFs = HRegionFileSystem
.openRegionFromFileSystem(services.getConfiguration(), fs, tabledir, region, true);
boolean references = regionFs.hasReferences(tableDescriptor);
return new Pair<>(Boolean.TRUE, references);
} catch (IOException e) {
LOG.error("Error trying to determine referenced files from : " + daughter.getEncodedName()
+ ", to: " + parent.getEncodedName() + " assuming has references", e);
LOG.error("Error trying to determine if region {} has references, assuming it does",
region.getEncodedName(), e);
return new Pair<>(Boolean.TRUE, Boolean.TRUE);
}
return new Pair<>(Boolean.TRUE, references);
}

private TableDescriptor getDescriptor(final TableName tableName) throws IOException {
return this.services.getTableDescriptors().get(tableName);
}

private void updateAssignmentManagerMetrics() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,12 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.any;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.when;

import java.io.IOException;
import java.util.ArrayList;
Expand All @@ -34,6 +38,7 @@
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
Expand All @@ -46,10 +51,12 @@
import org.apache.hadoop.hbase.MetaMockingUtil;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.io.Reference;
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.assignment.MockMasterServices;
import org.apache.hadoop.hbase.master.janitor.CatalogJanitor.SplitParentFirstComparator;
Expand Down Expand Up @@ -114,6 +121,100 @@ public void teardown() {
this.masterServices.stop("DONE");
}

@Test
public void testCleanMerge() throws IOException {
TableDescriptor td = createTableDescriptorForCurrentMethod();
// Create regions.
HRegionInfo merged =
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("eee"));
HRegionInfo parenta =
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("ccc"));
HRegionInfo parentb =
new HRegionInfo(td.getTableName(), Bytes.toBytes("ccc"), Bytes.toBytes("eee"));

List<RegionInfo> parents = new ArrayList<>();
parents.add(parenta);
parents.add(parentb);

Path rootdir = this.masterServices.getMasterFileSystem().getRootDir();
Path tabledir = CommonFSUtils.getTableDir(rootdir, td.getTableName());
Path storedir =
HRegionFileSystem.getStoreHomedir(tabledir, merged, td.getColumnFamilies()[0].getName());

Path parentaRef = createMergeReferenceFile(storedir, merged, parenta);
Path parentbRef = createMergeReferenceFile(storedir, merged, parentb);

// references exist, should not clean
assertFalse(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));

masterServices.getMasterFileSystem().getFileSystem().delete(parentaRef, false);

// one reference still exists, should not clean
assertFalse(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));

masterServices.getMasterFileSystem().getFileSystem().delete(parentbRef, false);

// all references removed, should clean
assertTrue(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));
}

@Test
public void testDontCleanMergeIfFileSystemException() throws IOException {
TableDescriptor td = createTableDescriptorForCurrentMethod();
// Create regions.
HRegionInfo merged =
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("eee"));
HRegionInfo parenta =
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("ccc"));
HRegionInfo parentb =
new HRegionInfo(td.getTableName(), Bytes.toBytes("ccc"), Bytes.toBytes("eee"));

List<RegionInfo> parents = new ArrayList<>();
parents.add(parenta);
parents.add(parentb);

Path rootdir = this.masterServices.getMasterFileSystem().getRootDir();
Path tabledir = CommonFSUtils.getTableDir(rootdir, td.getTableName());
Path storedir =
HRegionFileSystem.getStoreHomedir(tabledir, merged, td.getColumnFamilies()[0].getName());
createMergeReferenceFile(storedir, merged, parenta);

MasterServices mockedMasterServices = spy(masterServices);
MasterFileSystem mockedMasterFileSystem = spy(masterServices.getMasterFileSystem());
FileSystem mockedFileSystem = spy(masterServices.getMasterFileSystem().getFileSystem());

when(mockedMasterServices.getMasterFileSystem()).thenReturn(mockedMasterFileSystem);
when(mockedMasterFileSystem.getFileSystem()).thenReturn(mockedFileSystem);

// throw on the first exists check
doThrow(new IOException("Some exception")).when(mockedFileSystem).exists(any());

assertFalse(CatalogJanitor.cleanMergeRegion(mockedMasterServices, merged, parents));

// throw on the second exists check (within HRegionfileSystem)
AtomicBoolean returned = new AtomicBoolean(false);
doAnswer(invocationOnMock -> {
if (!returned.get()) {
returned.set(true);
return masterServices.getMasterFileSystem().getFileSystem()
.exists(invocationOnMock.getArgument(0));
}
throw new IOException("Some exception");
}).when(mockedFileSystem).exists(any());

assertFalse(CatalogJanitor.cleanMergeRegion(mockedMasterServices, merged, parents));
}

private Path createMergeReferenceFile(Path storeDir, HRegionInfo mergedRegion,
HRegionInfo parentRegion) throws IOException {
Reference ref = Reference.createTopReference(mergedRegion.getStartKey());
long now = EnvironmentEdgeManager.currentTime();
// Reference name has this format: StoreFile#REF_NAME_PARSER
Path p = new Path(storeDir, Long.toString(now) + "." + parentRegion.getEncodedName());
FileSystem fs = this.masterServices.getMasterFileSystem().getFileSystem();
return ref.write(fs, p);
}

/**
* Test clearing a split parent.
*/
Expand Down

0 comments on commit c6a1722

Please sign in to comment.