Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

error: use explicit matches for errors when the decision is crucial for correctness/performance #1083

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions scylla/src/transport/cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -784,12 +784,13 @@ pub(crate) fn use_keyspace_result(
for result in use_keyspace_results {
match result {
Ok(()) => was_ok = true,
Err(err) => match err {
QueryError::BrokenConnection(_) | QueryError::ConnectionPoolError(_) => {
Err(err) => {
if err.is_connection_broken() {
broken_conn_error = Some(err)
} else {
return Err(err);
}
_ => return Err(err),
},
}
}
}

Expand Down
167 changes: 104 additions & 63 deletions scylla/src/transport/downgrading_consistency_retry_policy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,87 +91,128 @@ impl RetrySession for DowngradingConsistencyRetrySession {
decision
}

// Do not remove this lint!
// It's there for a reason - we don't want new variants
// automatically fall under `_` pattern when they are introduced.
#[deny(clippy::wildcard_enum_match_arm)]
match query_info.error {
// Basic errors - there are some problems on this node
// Retry on a different one if possible
QueryError::BrokenConnection(_)
| QueryError::ConnectionPoolError(_)
| QueryError::DbError(DbError::Overloaded, _)
| QueryError::DbError(DbError::ServerError, _)
| QueryError::DbError(DbError::TruncateError, _) => {
QueryError::BrokenConnection(_) | QueryError::ConnectionPoolError(_) => {
if query_info.is_idempotent {
RetryDecision::RetryNextNode(None)
} else {
RetryDecision::DontRetry
}
}
// Unavailable - the current node believes that not enough nodes
// are alive to satisfy specified consistency requirements.
QueryError::DbError(DbError::Unavailable { alive, .. }, _) => {
if !self.was_retry {
self.was_retry = true;
max_likely_to_work_cl(*alive, cl)
} else {
RetryDecision::DontRetry
}
}
// ReadTimeout - coordinator didn't receive enough replies in time.
QueryError::DbError(
DbError::ReadTimeout {
received,
required,
data_present,
..
},
_,
) => {
if self.was_retry {
RetryDecision::DontRetry
} else if received < required {
self.was_retry = true;
max_likely_to_work_cl(*received, cl)
} else if !*data_present {
self.was_retry = true;
RetryDecision::RetrySameNode(None)
} else {
RetryDecision::DontRetry
}
}
// Write timeout - coordinator didn't receive enough replies in time.
QueryError::DbError(
DbError::WriteTimeout {
write_type,
received,
..
},
_,
) => {
if self.was_retry || !query_info.is_idempotent {
RetryDecision::DontRetry
} else {
self.was_retry = true;
match write_type {
WriteType::Batch | WriteType::Simple if *received > 0 => {
RetryDecision::IgnoreWriteError
// DbErrors
QueryError::DbError(db_error, _) => {
// Do not remove this lint!
// It's there for a reason - we don't want new variants
// automatically fall under `_` pattern when they are introduced.
#[deny(clippy::wildcard_enum_match_arm)]
match db_error {
// Basic errors - there are some problems on this node
// Retry on a different one if possible
DbError::Overloaded | DbError::ServerError | DbError::TruncateError => {
if query_info.is_idempotent {
RetryDecision::RetryNextNode(None)
} else {
RetryDecision::DontRetry
}

WriteType::UnloggedBatch => {
// Since only part of the batch could have been persisted,
// retry with whatever consistency should allow to persist all
}
// Unavailable - the current node believes that not enough nodes
// are alive to satisfy specified consistency requirements.
DbError::Unavailable { alive, .. } => {
if !self.was_retry {
self.was_retry = true;
max_likely_to_work_cl(*alive, cl)
} else {
RetryDecision::DontRetry
}
}
// ReadTimeout - coordinator didn't receive enough replies in time.
DbError::ReadTimeout {
received,
required,
data_present,
..
} => {
if self.was_retry {
RetryDecision::DontRetry
} else if received < required {
self.was_retry = true;
max_likely_to_work_cl(*received, cl)
} else if !*data_present {
self.was_retry = true;
RetryDecision::RetrySameNode(None)
} else {
RetryDecision::DontRetry
}
}
// Write timeout - coordinator didn't receive enough replies in time.
DbError::WriteTimeout {
write_type,
received,
..
} => {
if self.was_retry || !query_info.is_idempotent {
RetryDecision::DontRetry
} else {
self.was_retry = true;
match write_type {
WriteType::Batch | WriteType::Simple if *received > 0 => {
RetryDecision::IgnoreWriteError
}

WriteType::UnloggedBatch => {
// Since only part of the batch could have been persisted,
// retry with whatever consistency should allow to persist all
max_likely_to_work_cl(*received, cl)
}
WriteType::BatchLog => RetryDecision::RetrySameNode(None),

WriteType::Counter
| WriteType::Cas
| WriteType::View
| WriteType::Cdc
| WriteType::Simple
| WriteType::Batch
| WriteType::Other(_) => RetryDecision::DontRetry,
}
}
WriteType::BatchLog => RetryDecision::RetrySameNode(None),

_ => RetryDecision::DontRetry,
}
// The node is still bootstrapping it can't execute the query, we should try another one
DbError::IsBootstrapping => RetryDecision::RetryNextNode(None),
// In all other cases propagate the error to the user
DbError::SyntaxError
| DbError::Invalid
| DbError::AlreadyExists { .. }
| DbError::FunctionFailure { .. }
| DbError::AuthenticationError
| DbError::Unauthorized
| DbError::ConfigError
| DbError::ReadFailure { .. }
| DbError::WriteFailure { .. }
| DbError::Unprepared { .. }
| DbError::ProtocolError
| DbError::RateLimitReached { .. }
| DbError::Other(_) => RetryDecision::DontRetry,
}
}
// The node is still bootstrapping it can't execute the query, we should try another one
QueryError::DbError(DbError::IsBootstrapping, _) => RetryDecision::RetryNextNode(None),
// Connection to the contacted node is overloaded, try another one
QueryError::UnableToAllocStreamId => RetryDecision::RetryNextNode(None),
// In all other cases propagate the error to the user
_ => RetryDecision::DontRetry,
QueryError::BadQuery(_)
| QueryError::MetadataError(_)
| QueryError::CqlRequestSerialization(_)
| QueryError::BodyExtensionsParseError(_)
| QueryError::EmptyPlan
| QueryError::CqlResultParseError(_)
| QueryError::CqlErrorParseError(_)
| QueryError::ProtocolError(_)
| QueryError::TimeoutError
Comment on lines 205 to +214
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we really propagate TimeoutError instead of trying another node? Well, it's the weird error about use_keyspace...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Funny thing is that this error variant can't even appear there. AFAIU, use keyspace requests go through different path than regular requests. They are not a subject to retries, since we send use keyspace requests to all nodes simultaneously.

Copy link
Contributor Author

@muzarski muzarski Oct 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, there are some errors that can't possibly appear in some places (same for speculative_execution::can_be_ignored). This is because QueryError is a really broad error type, whose specific variants are constructed in multiple driver layers. IMO, the only way to fix this, is to narrow the internal return error types as much as possible. I'm not sure how much work it would require, though.

| QueryError::RequestTimeout(_) => RetryDecision::DontRetry,
}
}

Expand Down
27 changes: 27 additions & 0 deletions scylla/src/transport/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,33 @@ pub enum QueryError {
RequestTimeout(String),
}

impl QueryError {
pub(crate) fn is_connection_broken(&self) -> bool {
// Do not remove this lint!
// It's there for a reason - we don't want new variants
// automatically fall under `_` pattern when they are introduced.
#[deny(clippy::wildcard_enum_match_arm)]
match self {
// Error variants that imply that some connection error appeared before/during execution.
QueryError::BrokenConnection(_) | QueryError::ConnectionPoolError(_) => true,

// Other errors.
QueryError::DbError(_, _)
| QueryError::BadQuery(_)
| QueryError::CqlRequestSerialization(_)
| QueryError::BodyExtensionsParseError(_)
| QueryError::EmptyPlan
| QueryError::CqlResultParseError(_)
| QueryError::CqlErrorParseError(_)
| QueryError::MetadataError(_)
| QueryError::ProtocolError(_)
| QueryError::TimeoutError
| QueryError::UnableToAllocStreamId
| QueryError::RequestTimeout(_) => false,
}
}
}

impl From<SerializeValuesError> for QueryError {
fn from(serialized_err: SerializeValuesError) -> QueryError {
QueryError::BadQuery(BadQuery::SerializeValuesError(serialized_err))
Expand Down
50 changes: 41 additions & 9 deletions scylla/src/transport/load_balancing/default.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2838,29 +2838,61 @@ mod latency_awareness {
}

pub(crate) fn reliable_latency_measure(error: &QueryError) -> bool {
// Do not remove this lint!
// It's there for a reason - we don't want new variants
// automatically fall under `_` pattern when they are introduced.
#[deny(clippy::wildcard_enum_match_arm)]
match error {
// "fast" errors, i.e. ones that are returned quickly after the query begins
// "fast" errors, i.e. ones that appeared on driver side, before sending the request
QueryError::BadQuery(_)
| QueryError::CqlRequestSerialization(_)
| QueryError::BrokenConnection(_)
| QueryError::ConnectionPoolError(_)
| QueryError::EmptyPlan
| QueryError::UnableToAllocStreamId
| QueryError::DbError(DbError::IsBootstrapping, _)
| QueryError::DbError(DbError::Unavailable { .. }, _)
| QueryError::DbError(DbError::Unprepared { .. }, _)
| QueryError::DbError(DbError::Overloaded { .. }, _)
| QueryError::DbError(DbError::RateLimitReached { .. }, _) => false,
| QueryError::UnableToAllocStreamId => false,

// "slow" errors, i.e. ones that are returned after considerable time of query being run
QueryError::DbError(_, _)
| QueryError::CqlResultParseError(_)
QueryError::CqlResultParseError(_)
| QueryError::CqlErrorParseError(_)
| QueryError::BodyExtensionsParseError(_)
| QueryError::MetadataError(_)
| QueryError::ProtocolError(_)
| QueryError::TimeoutError
| QueryError::RequestTimeout(_) => true,

// handle db errors
QueryError::DbError(db_error, _) => {
// Do not remove this lint!
// It's there for a reason - we don't want new variants
// automatically fall under `_` pattern when they are introduced.
#[deny(clippy::wildcard_enum_match_arm)]
match db_error {
// An errors that appeared on server side, but did not require selected node
// to contact other nodes (i.e. fast server side errors).
DbError::IsBootstrapping
| DbError::Unavailable { .. }
| DbError::Unprepared { .. }
| DbError::Overloaded { .. }
| DbError::RateLimitReached { .. } => false,

// Rest of server side errors that take considerable amount of time to be returned
DbError::SyntaxError
| DbError::Invalid
| DbError::AlreadyExists { .. }
| DbError::FunctionFailure { .. }
| DbError::AuthenticationError
| DbError::TruncateError
| DbError::ReadTimeout { .. }
| DbError::WriteTimeout { .. }
| DbError::ReadFailure { .. }
| DbError::WriteFailure { .. }
| DbError::ServerError
| DbError::ProtocolError
| DbError::Unauthorized
| DbError::ConfigError
| DbError::Other(_) => true,
}
}
}
}
}
Expand Down
Loading
Loading