retry_policy: don't use wildcard '_' in QueryError match
During a recent error refactor I introduced a silent bug into the code
(scylladb#1075), and I'd like to prevent that from happening again. This
is why we replace the `_` match with explicit error variants in the
retry policy modules.

We also enable the `wildcard_enum_match_arm` clippy lint on the
QueryError and DbError matches in these modules.
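
For illustration, here is a minimal sketch of the pattern this commit applies; the `Fruit` enum and `describe` function below are hypothetical, not driver code. Denying `clippy::wildcard_enum_match_arm` on the match rejects any `_` arm, so the match has to stay exhaustive and a newly added variant surfaces as a compile-time error instead of silently falling into a wildcard:

    enum Fruit {
        Apple,
        Orange,
    }

    fn describe(fruit: Fruit) -> &'static str {
        // Do not remove this lint - it rejects `_` arms,
        // so every variant must be handled explicitly.
        #[deny(clippy::wildcard_enum_match_arm)]
        match fruit {
            Fruit::Apple => "an apple",
            Fruit::Orange => "an orange",
            // Adding `Fruit::Banana` to the enum would now fail to
            // compile here until an explicit arm is written; papering
            // over the gap with `_ => ...` is denied by clippy.
        }
    }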
muzarski committed Oct 14, 2024
1 parent 5785a57 commit 4ef58f5
Showing 2 changed files with 200 additions and 119 deletions.
167 changes: 104 additions & 63 deletions scylla/src/transport/downgrading_consistency_retry_policy.rs
@@ -91,87 +91,128 @@ impl RetrySession for DowngradingConsistencyRetrySession {
             decision
         }
 
+        // Do not remove this lint!
+        // It's there for a reason - we don't want new variants
+        // automatically fall under `_` pattern when they are introduced.
+        #[deny(clippy::wildcard_enum_match_arm)]
         match query_info.error {
             // Basic errors - there are some problems on this node
             // Retry on a different one if possible
-            QueryError::BrokenConnection(_)
-            | QueryError::ConnectionPoolError(_)
-            | QueryError::DbError(DbError::Overloaded, _)
-            | QueryError::DbError(DbError::ServerError, _)
-            | QueryError::DbError(DbError::TruncateError, _) => {
+            QueryError::BrokenConnection(_) | QueryError::ConnectionPoolError(_) => {
                 if query_info.is_idempotent {
                     RetryDecision::RetryNextNode(None)
                 } else {
                     RetryDecision::DontRetry
                 }
             }
-            // Unavailable - the current node believes that not enough nodes
-            // are alive to satisfy specified consistency requirements.
-            QueryError::DbError(DbError::Unavailable { alive, .. }, _) => {
-                if !self.was_retry {
-                    self.was_retry = true;
-                    max_likely_to_work_cl(*alive, cl)
-                } else {
-                    RetryDecision::DontRetry
-                }
-            }
-            // ReadTimeout - coordinator didn't receive enough replies in time.
-            QueryError::DbError(
-                DbError::ReadTimeout {
-                    received,
-                    required,
-                    data_present,
-                    ..
-                },
-                _,
-            ) => {
-                if self.was_retry {
-                    RetryDecision::DontRetry
-                } else if received < required {
-                    self.was_retry = true;
-                    max_likely_to_work_cl(*received, cl)
-                } else if !*data_present {
-                    self.was_retry = true;
-                    RetryDecision::RetrySameNode(None)
-                } else {
-                    RetryDecision::DontRetry
-                }
-            }
-            // Write timeout - coordinator didn't receive enough replies in time.
-            QueryError::DbError(
-                DbError::WriteTimeout {
-                    write_type,
-                    received,
-                    ..
-                },
-                _,
-            ) => {
-                if self.was_retry || !query_info.is_idempotent {
-                    RetryDecision::DontRetry
-                } else {
-                    self.was_retry = true;
-                    match write_type {
-                        WriteType::Batch | WriteType::Simple if *received > 0 => {
-                            RetryDecision::IgnoreWriteError
-                        }
-
-                        WriteType::UnloggedBatch => {
-                            // Since only part of the batch could have been persisted,
-                            // retry with whatever consistency should allow to persist all
-                            max_likely_to_work_cl(*received, cl)
-                        }
-                        WriteType::BatchLog => RetryDecision::RetrySameNode(None),
-
-                        _ => RetryDecision::DontRetry,
-                    }
-                }
-            }
-            // The node is still bootstrapping it can't execute the query, we should try another one
-            QueryError::DbError(DbError::IsBootstrapping, _) => RetryDecision::RetryNextNode(None),
+            // DbErrors
+            QueryError::DbError(db_error, _) => {
+                // Do not remove this lint!
+                // It's there for a reason - we don't want new variants
+                // automatically fall under `_` pattern when they are introduced.
+                #[deny(clippy::wildcard_enum_match_arm)]
+                match db_error {
+                    // Basic errors - there are some problems on this node
+                    // Retry on a different one if possible
+                    DbError::Overloaded | DbError::ServerError | DbError::TruncateError => {
+                        if query_info.is_idempotent {
+                            RetryDecision::RetryNextNode(None)
+                        } else {
+                            RetryDecision::DontRetry
+                        }
+                    }
+                    // Unavailable - the current node believes that not enough nodes
+                    // are alive to satisfy specified consistency requirements.
+                    DbError::Unavailable { alive, .. } => {
+                        if !self.was_retry {
+                            self.was_retry = true;
+                            max_likely_to_work_cl(*alive, cl)
+                        } else {
+                            RetryDecision::DontRetry
+                        }
+                    }
+                    // ReadTimeout - coordinator didn't receive enough replies in time.
+                    DbError::ReadTimeout {
+                        received,
+                        required,
+                        data_present,
+                        ..
+                    } => {
+                        if self.was_retry {
+                            RetryDecision::DontRetry
+                        } else if received < required {
+                            self.was_retry = true;
+                            max_likely_to_work_cl(*received, cl)
+                        } else if !*data_present {
+                            self.was_retry = true;
+                            RetryDecision::RetrySameNode(None)
+                        } else {
+                            RetryDecision::DontRetry
+                        }
+                    }
+                    // Write timeout - coordinator didn't receive enough replies in time.
+                    DbError::WriteTimeout {
+                        write_type,
+                        received,
+                        ..
+                    } => {
+                        if self.was_retry || !query_info.is_idempotent {
+                            RetryDecision::DontRetry
+                        } else {
+                            self.was_retry = true;
+                            match write_type {
+                                WriteType::Batch | WriteType::Simple if *received > 0 => {
+                                    RetryDecision::IgnoreWriteError
+                                }
+
+                                WriteType::UnloggedBatch => {
+                                    // Since only part of the batch could have been persisted,
+                                    // retry with whatever consistency should allow to persist all
+                                    max_likely_to_work_cl(*received, cl)
+                                }
+                                WriteType::BatchLog => RetryDecision::RetrySameNode(None),
+
+                                WriteType::Counter
+                                | WriteType::Cas
+                                | WriteType::View
+                                | WriteType::Cdc
+                                | WriteType::Simple
+                                | WriteType::Batch
+                                | WriteType::Other(_) => RetryDecision::DontRetry,
+                            }
+                        }
+                    }
+                    // The node is still bootstrapping it can't execute the query, we should try another one
+                    DbError::IsBootstrapping => RetryDecision::RetryNextNode(None),
+                    // In all other cases propagate the error to the user
+                    DbError::SyntaxError
+                    | DbError::Invalid
+                    | DbError::AlreadyExists { .. }
+                    | DbError::FunctionFailure { .. }
+                    | DbError::AuthenticationError
+                    | DbError::Unauthorized
+                    | DbError::ConfigError
+                    | DbError::ReadFailure { .. }
+                    | DbError::WriteFailure { .. }
+                    | DbError::Unprepared { .. }
+                    | DbError::ProtocolError
+                    | DbError::RateLimitReached { .. }
+                    | DbError::Other(_) => RetryDecision::DontRetry,
+                }
+            }
             // Connection to the contacted node is overloaded, try another one
             QueryError::UnableToAllocStreamId => RetryDecision::RetryNextNode(None),
             // In all other cases propagate the error to the user
-            _ => RetryDecision::DontRetry,
+            QueryError::BadQuery(_)
+            | QueryError::MetadataError(_)
+            | QueryError::CqlRequestSerialization(_)
+            | QueryError::BodyExtensionsParseError(_)
+            | QueryError::EmptyPlan
+            | QueryError::CqlResultParseError(_)
+            | QueryError::CqlErrorParseError(_)
+            | QueryError::ProtocolError(_)
+            | QueryError::TimeoutError
+            | QueryError::RequestTimeout(_) => RetryDecision::DontRetry,
         }
     }
152 changes: 96 additions & 56 deletions scylla/src/transport/retry_policy.rs
@@ -139,76 +139,116 @@ impl RetrySession for DefaultRetrySession {
         if query_info.consistency.is_serial() {
             return RetryDecision::DontRetry;
         };
 
+        // Do not remove this lint!
+        // It's there for a reason - we don't want new variants
+        // automatically fall under `_` pattern when they are introduced.
+        #[deny(clippy::wildcard_enum_match_arm)]
         match query_info.error {
             // Basic errors - there are some problems on this node
             // Retry on a different one if possible
-            QueryError::BrokenConnection(_)
-            | QueryError::ConnectionPoolError(_)
-            | QueryError::DbError(DbError::Overloaded, _)
-            | QueryError::DbError(DbError::ServerError, _)
-            | QueryError::DbError(DbError::TruncateError, _) => {
+            QueryError::BrokenConnection(_) | QueryError::ConnectionPoolError(_) => {
                 if query_info.is_idempotent {
                     RetryDecision::RetryNextNode(None)
                 } else {
                     RetryDecision::DontRetry
                 }
             }
-            // Unavailable - the current node believes that not enough nodes
-            // are alive to satisfy specified consistency requirements.
-            // Maybe this node has network problems - try a different one.
-            // Perform at most one retry - it's unlikely that two nodes
-            // have network problems at the same time
-            QueryError::DbError(DbError::Unavailable { .. }, _) => {
-                if !self.was_unavailable_retry {
-                    self.was_unavailable_retry = true;
-                    RetryDecision::RetryNextNode(None)
-                } else {
-                    RetryDecision::DontRetry
-                }
-            }
-            // ReadTimeout - coordinator didn't receive enough replies in time.
-            // Retry at most once and only if there were actually enough replies
-            // to satisfy consistency but they were all just checksums (data_present == false).
-            // This happens when the coordinator picked replicas that were overloaded/dying.
-            // Retried request should have some useful response because the node will detect
-            // that these replicas are dead.
-            QueryError::DbError(
-                DbError::ReadTimeout {
-                    received,
-                    required,
-                    data_present,
-                    ..
-                },
-                _,
-            ) => {
-                if !self.was_read_timeout_retry && received >= required && !*data_present {
-                    self.was_read_timeout_retry = true;
-                    RetryDecision::RetrySameNode(None)
-                } else {
-                    RetryDecision::DontRetry
-                }
-            }
-            // Write timeout - coordinator didn't receive enough replies in time.
-            // Retry at most once and only for BatchLog write.
-            // Coordinator probably didn't detect the nodes as dead.
-            // By the time we retry they should be detected as dead.
-            QueryError::DbError(DbError::WriteTimeout { write_type, .. }, _) => {
-                if !self.was_write_timeout_retry
-                    && query_info.is_idempotent
-                    && *write_type == WriteType::BatchLog
-                {
-                    self.was_write_timeout_retry = true;
-                    RetryDecision::RetrySameNode(None)
-                } else {
-                    RetryDecision::DontRetry
-                }
-            }
-            // The node is still bootstrapping it can't execute the query, we should try another one
-            QueryError::DbError(DbError::IsBootstrapping, _) => RetryDecision::RetryNextNode(None),
+
+            // DbErrors
+            QueryError::DbError(db_error, _) => {
+                // Do not remove this lint!
+                // It's there for a reason - we don't want new variants
+                // automatically fall under `_` pattern when they are introduced.
+                #[deny(clippy::wildcard_enum_match_arm)]
+                match db_error {
+                    // Basic errors - there are some problems on this node
+                    // Retry on a different one if possible
+                    DbError::Overloaded | DbError::ServerError | DbError::TruncateError => {
+                        if query_info.is_idempotent {
+                            RetryDecision::RetryNextNode(None)
+                        } else {
+                            RetryDecision::DontRetry
+                        }
+                    }
+                    // Unavailable - the current node believes that not enough nodes
+                    // are alive to satisfy specified consistency requirements.
+                    // Maybe this node has network problems - try a different one.
+                    // Perform at most one retry - it's unlikely that two nodes
+                    // have network problems at the same time
+                    DbError::Unavailable { .. } => {
+                        if !self.was_unavailable_retry {
+                            self.was_unavailable_retry = true;
+                            RetryDecision::RetryNextNode(None)
+                        } else {
+                            RetryDecision::DontRetry
+                        }
+                    }
+                    // ReadTimeout - coordinator didn't receive enough replies in time.
+                    // Retry at most once and only if there were actually enough replies
+                    // to satisfy consistency but they were all just checksums (data_present == false).
+                    // This happens when the coordinator picked replicas that were overloaded/dying.
+                    // Retried request should have some useful response because the node will detect
+                    // that these replicas are dead.
+                    DbError::ReadTimeout {
+                        received,
+                        required,
+                        data_present,
+                        ..
+                    } => {
+                        if !self.was_read_timeout_retry && received >= required && !*data_present {
+                            self.was_read_timeout_retry = true;
+                            RetryDecision::RetrySameNode(None)
+                        } else {
+                            RetryDecision::DontRetry
+                        }
+                    }
+                    // Write timeout - coordinator didn't receive enough replies in time.
+                    // Retry at most once and only for BatchLog write.
+                    // Coordinator probably didn't detect the nodes as dead.
+                    // By the time we retry they should be detected as dead.
+                    DbError::WriteTimeout { write_type, .. } => {
+                        if !self.was_write_timeout_retry
+                            && query_info.is_idempotent
+                            && *write_type == WriteType::BatchLog
+                        {
+                            self.was_write_timeout_retry = true;
+                            RetryDecision::RetrySameNode(None)
+                        } else {
+                            RetryDecision::DontRetry
+                        }
+                    }
+                    // The node is still bootstrapping it can't execute the query, we should try another one
+                    DbError::IsBootstrapping => RetryDecision::RetryNextNode(None),
+                    // In all other cases propagate the error to the user
+                    DbError::SyntaxError
+                    | DbError::Invalid
+                    | DbError::AlreadyExists { .. }
+                    | DbError::FunctionFailure { .. }
+                    | DbError::AuthenticationError
+                    | DbError::Unauthorized
+                    | DbError::ConfigError
+                    | DbError::ReadFailure { .. }
+                    | DbError::WriteFailure { .. }
+                    | DbError::Unprepared { .. }
+                    | DbError::ProtocolError
+                    | DbError::RateLimitReached { .. }
+                    | DbError::Other(_) => RetryDecision::DontRetry,
+                }
+            }
             // Connection to the contacted node is overloaded, try another one
             QueryError::UnableToAllocStreamId => RetryDecision::RetryNextNode(None),
             // In all other cases propagate the error to the user
-            _ => RetryDecision::DontRetry,
+            QueryError::BadQuery(_)
+            | QueryError::MetadataError(_)
+            | QueryError::CqlRequestSerialization(_)
+            | QueryError::BodyExtensionsParseError(_)
+            | QueryError::EmptyPlan
+            | QueryError::CqlResultParseError(_)
+            | QueryError::CqlErrorParseError(_)
+            | QueryError::ProtocolError(_)
+            | QueryError::TimeoutError
+            | QueryError::RequestTimeout(_) => RetryDecision::DontRetry,
         }
     }