Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make zone load balancing logic depend on local_cluster host distribution. #174

Merged
merged 13 commits into from
Oct 31, 2016
4 changes: 3 additions & 1 deletion include/envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,9 @@ class HostSet {
COUNTER(zone_over_percentage) \
COUNTER(zone_routing_sampled) \
COUNTER(zone_routing_no_sampled) \
GAUGE (max_host_weight)
GAUGE (max_host_weight) \
COUNTER(local_cluster_not_ok) \
COUNTER(zone_number_differs)
// clang-format on

/**
Expand Down
115 changes: 86 additions & 29 deletions source/common/upstream/load_balancer_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,26 @@ bool LoadBalancerBase::earlyExitNonZoneRouting() {
return true;
}

// If local cluster is not set, or we are in panic mode for it.
if (local_host_set_ == nullptr || local_host_set_->hosts().empty() ||
isGlobalPanic(*local_host_set_)) {
stats_.local_cluster_not_ok_.inc();
return true;
}

// Same number of zones should be for local and upstream cluster.
if (host_set_.healthyHostsPerZone().size() != local_host_set_->healthyHostsPerZone().size()) {
stats_.zone_number_differs_.inc();
return true;
}

return false;
}

bool LoadBalancerBase::isGlobalPanic() {
bool LoadBalancerBase::isGlobalPanic(const HostSet& host_set) {
uint64_t global_panic_threshold =
std::min(100UL, runtime_.snapshot().getInteger("upstream.healthy_panic_threshold", 50));
double healthy_percent = 100.0 * host_set_.healthyHosts().size() / host_set_.hosts().size();
double healthy_percent = 100.0 * host_set.healthyHosts().size() / host_set.hosts().size();

// If the % of healthy hosts in the cluster is less than our panic threshold, we use all hosts.
if (healthy_percent < global_panic_threshold) {
Expand All @@ -46,48 +59,92 @@ bool LoadBalancerBase::isGlobalPanic() {
return false;
}

const std::vector<HostPtr>& LoadBalancerBase::hostsToUse() {
ASSERT(host_set_.healthyHosts().size() <= host_set_.hosts().size());
std::vector<uint64_t>
LoadBalancerBase::calculateZonePercentage(const std::vector<std::vector<HostPtr>>& hosts_per_zone) {
std::vector<uint64_t> percentage(hosts_per_zone.size());

if (host_set_.hosts().empty() || isGlobalPanic()) {
return host_set_.hosts();
uint64_t total_hosts = 0;
for (const auto& zone_hosts : hosts_per_zone) {
total_hosts += zone_hosts.size();
}

if (earlyExitNonZoneRouting()) {
return host_set_.healthyHosts();
if (total_hosts != 0) {
size_t pos = 0;
for (const auto& zone_hosts : hosts_per_zone) {
percentage[pos++] = 10000ULL * zone_hosts.size() / total_hosts;
}
}

return percentage;
}

const std::vector<HostPtr>& LoadBalancerBase::tryChooseLocalZoneHosts() {
// At this point it's guaranteed to be at least 2 zones.
uint32_t number_of_zones = host_set_.healthyHostsPerZone().size();
ASSERT(number_of_zones >= 2U);
const std::vector<HostPtr>& local_zone_healthy_hosts = host_set_.healthyHostsPerZone()[0];
ASSERT(host_set_.healthyHostsPerZone().size() >= 2U);

// If number of hosts in a local zone big enough then route all requests to the same zone.
if (local_zone_healthy_hosts.size() * number_of_zones >= host_set_.healthyHosts().size()) {
std::vector<uint64_t> local_percentage =
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use a stack allocated array with the right size, rework the functions above to take the array and populate it if you want to do that. You can't allocate things like vectors in this code path.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or just do the pre-calculation work now. I would actually prefer you just do that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made it a stack allocation for now; I want to make sure things work well in prod.

calculateZonePercentage(local_host_set_->healthyHostsPerZone());
std::vector<uint64_t> upstream_percentage =
calculateZonePercentage(host_set_.healthyHostsPerZone());

// Try to push all of the requests to the same zone first.
// If we have lower percent of hosts in the local cluster in the same zone,
// we can push all of the requests directly to upstream cluster in the same zone.
if (upstream_percentage[0] >= local_percentage[0]) {
stats_.zone_over_percentage_.inc();
return local_zone_healthy_hosts;
return host_set_.healthyHostsPerZone()[0];
}

// If we cannot route all requests to the same zone, calculate what percentage can be routed.
// For example, if local percentage is 20% and upstream is 10%
// we can route only 50% of requests directly.
uint64_t local_percent_route = upstream_percentage[0] * 10000 / local_percentage[0];
if (random_.random() % 10000 < local_percent_route) {
stats_.zone_routing_sampled_.inc();
return host_set_.healthyHostsPerZone()[0];
}

// If local zone ratio is lower than expected we should only partially route requests from the
// same zone.
double zone_host_ratio = 1.0 * local_zone_healthy_hosts.size() / host_set_.healthyHosts().size();
double ratio_to_route = zone_host_ratio * number_of_zones;
// At this point we should route cross zone as we cannot route to the local zone.
stats_.zone_routing_no_sampled_.inc();

std::vector<uint64_t> capacity_left;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

stack allocated array

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

// Local zone does not have additional capacity (we already routed what we could), but
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More comments. Don't understand what is happening here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

// put it to the capacity_left so that index in the array matches to the zone index.
capacity_left.push_back(0);
for (size_t i = 1; i < local_percentage.size(); ++i) {
// Only route to the zones that have additional capacity.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

still having trouble quickly understanding this logic. More comments, and potentially an example.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

more comments added

if (upstream_percentage[i] > local_percentage[i]) {
capacity_left.push_back(capacity_left[i - 1] + upstream_percentage[i] - local_percentage[i]);
} else {
capacity_left.push_back(capacity_left[i - 1]);
}
}

// Not zone routed requests will be distributed between all hosts and hence
// we need to route only fraction of req_percent_to_route to the local zone.
double actual_routing_ratio = (ratio_to_route - zone_host_ratio) / (1 - zone_host_ratio);
// Select specific zone for cross zone traffic based on the additional capacity in zones.
uint64_t threshold = random_.random() % capacity_left.back();

// Scale actual_routing_ratio to improve precision.
const uint64_t scale_factor = 10000;
uint64_t zone_routing_threshold = scale_factor * actual_routing_ratio;
// This potentially can be optimized to be O(log(N)) where N is the number of zones.
// Linear scan should be faster for smaller N, in most of the scenarios N will be small.
int pos = 0;
while (threshold > capacity_left[pos]) {
pos++;
}

if (random_.random() % 10000 < zone_routing_threshold) {
stats_.zone_routing_sampled_.inc();
return local_zone_healthy_hosts;
} else {
stats_.zone_routing_no_sampled_.inc();
return host_set_.healthyHostsPerZone()[pos];
}

/**
 * Select the list of hosts the load balancer should pick from.
 * @return every host when the host set is empty or in global panic, all healthy hosts when
 *         zone aware routing is skipped, otherwise the result of tryChooseLocalZoneHosts().
 */
const std::vector<HostPtr>& LoadBalancerBase::hostsToUse() {
  ASSERT(host_set_.healthyHosts().size() <= host_set_.hosts().size());

  // The emptiness check must come first: it short-circuits the panic check so that
  // isGlobalPanic() is never evaluated against a host set with zero hosts.
  const bool use_every_host = host_set_.hosts().empty() || isGlobalPanic(host_set_);
  if (use_every_host) {
    return host_set_.hosts();
  }

  // Zone aware selection is only attempted when the quick-exit check does not fire.
  const bool skip_zone_routing = earlyExitNonZoneRouting();
  if (skip_zone_routing) {
    return host_set_.healthyHosts();
  }

  return tryChooseLocalZoneHosts();
}

ConstHostPtr RoundRobinLoadBalancer::chooseHost() {
Expand Down
19 changes: 18 additions & 1 deletion source/common/upstream/load_balancer_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,25 @@ class LoadBalancerBase {
Runtime::RandomGenerator& random_;

private:
/*
* @return decision on quick exit from zone aware host selection.
*/
bool earlyExitNonZoneRouting();
bool isGlobalPanic();

/**
* For the given host_set it @return if we should be in a panic mode or not.
* For example, if majority of hosts are unhealthy we'll be likely in a panic mode.
* In this case we'll route requests to hosts no matter if they are healthy or not.
*/
bool isGlobalPanic(const HostSet& host_set);
const std::vector<HostPtr>& tryChooseLocalZoneHosts();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: add comment or newline before this line

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

put comments.


/**
* @return ratio of hosts in a given zone to total number of hosts. The result is scaled by 10000
* multiplier.
*/
std::vector<uint64_t>
calculateZonePercentage(const std::vector<std::vector<HostPtr>>& hosts_per_zone);

const HostSet& host_set_;
const HostSet* local_host_set_;
Expand Down
2 changes: 1 addition & 1 deletion source/common/upstream/upstream_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ typedef std::shared_ptr<std::vector<std::vector<HostPtr>>> HostListsPtr;
typedef std::shared_ptr<const std::vector<std::vector<HostPtr>>> ConstHostListsPtr;

/**
* Base clase for all clusters as well as thread local host sets.
* Base class for all clusters as well as thread local host sets.
*/
class HostSetImpl : public virtual HostSet {
public:
Expand Down
Loading