-
Notifications
You must be signed in to change notification settings - Fork 751
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[GOBBLIN-1744] Improve handling of null value edge cases when querying Helix #3603
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.gobblin.cluster; | ||
|
||
/** | ||
* Exception to describe situations where Gobblin sees unexpected state from Helix. Historically, we've seen unexpected | ||
* null values, which bubble up as NPE. This exception is explicitly used to differentiate bad Gobblin code from | ||
* Helix failures (i.e. seeing a NPE implies Gobblin bug) | ||
*/ | ||
public class GobblinHelixUnexpectedStateException extends Exception { | ||
public GobblinHelixUnexpectedStateException(String message, Object... args) { | ||
super(String.format(message, args)); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,10 +28,13 @@ | |
import java.util.Set; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.concurrent.TimeoutException; | ||
|
||
import lombok.extern.slf4j.Slf4j; | ||
import org.apache.gobblin.configuration.ConfigurationKeys; | ||
import org.apache.gobblin.metrics.Tag; | ||
import org.apache.gobblin.metrics.event.TimingEvent; | ||
import org.apache.gobblin.runtime.JobException; | ||
import org.apache.gobblin.runtime.JobState; | ||
import org.apache.gobblin.runtime.listeners.JobListener; | ||
import org.apache.gobblin.util.Id; | ||
import org.apache.gobblin.util.JobLauncherUtils; | ||
import org.apache.helix.HelixAdmin; | ||
|
@@ -54,13 +57,7 @@ | |
import org.apache.helix.task.WorkflowContext; | ||
import org.apache.helix.tools.ClusterSetup; | ||
|
||
import lombok.extern.slf4j.Slf4j; | ||
|
||
import org.apache.gobblin.configuration.ConfigurationKeys; | ||
import org.apache.gobblin.runtime.JobException; | ||
import org.apache.gobblin.runtime.listeners.JobListener; | ||
|
||
import static org.apache.helix.task.TaskState.STOPPED; | ||
import static org.apache.helix.task.TaskState.*; | ||
|
||
|
||
/** | ||
|
@@ -393,13 +390,28 @@ private static void deleteStoppedHelixJob(HelixManager helixManager, String work | |
* | ||
* @param jobNames a list of Gobblin job names. | ||
* @return a map from jobNames to their Helix Workflow Ids. | ||
* @throws GobblinHelixUnexpectedStateException when there is inconsistent helix state. This implies that we should retry the call | ||
* to avoid acting on stale data | ||
*/ | ||
public static Map<String, String> getWorkflowIdsFromJobNames(HelixManager helixManager, Collection<String> jobNames) { | ||
Map<String, String> jobNameToWorkflowId = new HashMap<>(); | ||
public static Map<String, String> getWorkflowIdsFromJobNames(HelixManager helixManager, Collection<String> jobNames) | ||
throws GobblinHelixUnexpectedStateException { | ||
TaskDriver taskDriver = new TaskDriver(helixManager); | ||
return getWorkflowIdsFromJobNames(taskDriver, jobNames); | ||
} | ||
|
||
public static Map<String, String> getWorkflowIdsFromJobNames(TaskDriver taskDriver, Collection<String> jobNames) | ||
throws GobblinHelixUnexpectedStateException { | ||
Map<String, String> jobNameToWorkflowId = new HashMap<>(); | ||
Map<String, WorkflowConfig> workflowConfigMap = taskDriver.getWorkflows(); | ||
for (String workflow : workflowConfigMap.keySet()) { | ||
WorkflowConfig workflowConfig = taskDriver.getWorkflowConfig(workflow); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This previous version made 2 calls to to zookeeper when only 1 is necessary. Doing 2 calls makes it more likely to see inconsistent state. |
||
for (Map.Entry<String, WorkflowConfig> entry : workflowConfigMap.entrySet()) { | ||
String workflow = entry.getKey(); | ||
WorkflowConfig workflowConfig = entry.getValue(); | ||
if (workflowConfig == null) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here, if this is not expected, throw exception instead of just log it out There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Made change to throw exception. I made the choice to create a new custom exception because there are other places where make API calls to ZK / Helix, and I want to use this exception to explicitly say there is a Helix issue. The caller of this API (Job scheduler) is using a retryer that will automatically retry this API call again if there is an exception. |
||
// As of Helix 1.0.2 implementation, this in theory shouldn't happen. But this null check is here in case implementation changes | ||
// because the API doesn't technically prohibit null configs, maps allowing null values is implementation based, and we want to fail loudly with a clear root cause. | ||
// the caller of this API should retry this API call | ||
throw new GobblinHelixUnexpectedStateException("Received null workflow config from Helix. We should not see any null configs when reading all workflows. workflowId=%s", workflow); | ||
} | ||
//Filter out any stale Helix workflows which are not running. | ||
if (workflowConfig.getTargetState() != TargetState.START) { | ||
continue; | ||
|
@@ -450,4 +462,4 @@ public static void dropInstanceIfExists(HelixAdmin admin, String clusterName, St | |
log.error("Could not drop instance: {} due to: {}", helixInstanceName, e); | ||
} | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please read the PR description to understand why these null checks are needed in the PR and how I tested these changes. These pieces of code are important and critical pieces with a fair amount of nuance related to helix.