From 82a2ce10f24a828b2c4960ba85b714a0203c8441 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Sat, 9 Dec 2023 21:55:11 +0800 Subject: [PATCH] HBASE-28248 Race between RegionRemoteProcedureBase and rollback operation could lead to ROLLEDBACK state be persisent to procedure store (#5567) Signed-off-by: GeorryHuang Signed-off-by: Yi Mei --- .../assignment/RegionRemoteProcedureBase.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java index d27e0068b0ca..f6668d9c14b1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java @@ -47,6 +47,7 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseState; import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseStateData; import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState; import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; /** @@ -183,7 +184,20 @@ protected abstract void updateTransitionWithoutPersistingToMeta(MasterProcedureE // A bit strange but the procedure store will throw RuntimeException if we can not persist the // state, so upper layer should take care of this... private void persistAndWake(MasterProcedureEnv env, RegionStateNode regionNode) { - env.getMasterServices().getMasterProcedureExecutor().getStore().update(this); + // The synchronization here is to guard with ProcedureExecutor.executeRollback, as here we will + // not hold the procedure execution lock, but we should not persist a procedure in ROLLEDBACK + // state to the procedure store. + // The ProcedureStore.update must be inside the lock, so here the check for procedure state and + // update could be atomic. In ProcedureExecutor.cleanupAfterRollbackOneStep, we will set the + // state to ROLLEDBACK, which will hold the same lock too as the Procedure.setState method is + // synchronized. This is the key to keep us safe. + synchronized (this) { + if (getState() == ProcedureState.ROLLEDBACK) { + LOG.warn("Procedure {} has already been rolled back, skip persistent", this); + return; + } + env.getMasterServices().getMasterProcedureExecutor().getStore().update(this); + } regionNode.getProcedureEvent().wake(env.getProcedureScheduler()); }