openzeppelin_relayer/utils/aws_error.rs
1//! Helpers for diagnosing AWS SDK errors.
2//!
3//! `SdkError`'s `Display` impl collapses everything below the SDK
4//! (DNS, TCP, TLS, connector pool, credential providers) into a single
5//! short string like `"dispatch failure"`, which makes prod logs nearly
6//! useless for distinguishing root causes.
7//!
8//! This module provides two utilities meant to be paired at every AWS SDK
9//! call-site:
10//!
11//! * [`classify_sdk_error`] — returns a stable, low-cardinality `&'static str`
12//! suitable for a `tracing` field or metric label, distinguishing the
13//! actionable subcategories of `DispatchFailure` (timeout / io / user /
14//! other) from `TimeoutError`, `ServiceError`, etc.
15//! * [`DisplayErrorContext`] — re-export of the SDK's own helper that walks
16//! the full `std::error::Error::source()` chain so the underlying cause
17//! (e.g., `connect timed out`, `dns error: failed to lookup address`)
18//! appears in the log instead of just the top-level wrapper.
19//!
20//! # Caution: log-only — do not embed in returned error values
21//!
22//! `DisplayErrorContext` walks the underlying SDK chain and can surface
23//! internal infrastructure details (endpoint URLs, connector kinds,
24//! credential-provider failures). Keep it confined to `tracing` fields
25//! and metrics. For error values returned to upstream callers — which
26//! ultimately reach API clients via `ApiError::InternalError(err.to_string())` —
27//! prefer the stable kind tag from [`classify_sdk_error`].
28//!
29//! Typical usage:
30//!
31//! ```ignore
32//! tracing::error!(
33//! error.kind = classify_sdk_error(&err),
34//! error.detail = %DisplayErrorContext(&err),
35//! "AWS call failed"
36//! );
37//! // Returned error value carries only the kind tag, not the full chain:
38//! return Err(MyError::Wrapped(format!("op X failed: {}", classify_sdk_error(&err))));
39//! ```
40
41pub use aws_smithy_types::error::display::DisplayErrorContext;
42
43use aws_smithy_runtime_api::client::result::SdkError;
44
45/// Classify an [`SdkError`] into a stable, low-cardinality kind tag.
46///
47/// `DispatchFailure` is split by its underlying [`ConnectorError`] kind so
48/// log aggregators can distinguish a `dispatch_timeout` (likely runtime
49/// starvation or a slow upstream) from a `dispatch_io` (connection reset,
50/// pool exhaustion) without parsing free-form strings.
51///
52/// [`ConnectorError`]: aws_smithy_runtime_api::client::result::ConnectorError
53pub fn classify_sdk_error<E, R>(err: &SdkError<E, R>) -> &'static str {
54 match err {
55 SdkError::ConstructionFailure(_) => "construction",
56 SdkError::TimeoutError(_) => "timeout",
57 SdkError::DispatchFailure(inner) => match inner.as_connector_error() {
58 Some(ce) if ce.is_timeout() => "dispatch_timeout",
59 Some(ce) if ce.is_io() => "dispatch_io",
60 Some(ce) if ce.is_user() => "dispatch_user",
61 Some(_) => "dispatch_other",
62 None => "dispatch_unknown",
63 },
64 SdkError::ResponseError(_) => "response_parse",
65 SdkError::ServiceError(_) => "service",
66 // SdkError is `#[non_exhaustive]`; future variants get a stable label.
67 _ => "unknown",
68 }
69}
70
#[cfg(test)]
mod tests {
    use super::*;
    use aws_smithy_runtime_api::client::orchestrator::HttpResponse;
    use aws_smithy_runtime_api::client::result::ConnectorError;
    use std::convert::Infallible;
    use std::io;

    // In `SdkError<E, R>`, `E` is the operation error type and `R` the
    // response type. None of the variants built in these tests carry an `E`,
    // so `Infallible` is the cheapest placeholder. `R` is named explicitly
    // because aws-smithy-runtime-api 1.11+ no longer provides a default.
    type TestErr = SdkError<Infallible, HttpResponse>;

    /// Boxed error shape passed to the SDK error constructors below.
    type Cause = Box<dyn std::error::Error + Send + Sync>;

    /// Wrap `text` in an `io::Error` and box it for use as an error source.
    fn cause(text: &str) -> Cause {
        Box::new(io::Error::other(text.to_owned()))
    }

    /// Build a `DispatchFailure` around the given connector error.
    fn dispatch(connector_err: ConnectorError) -> TestErr {
        SdkError::dispatch_failure(connector_err)
    }

    #[test]
    fn classifies_construction_failure() {
        let failure: TestErr = SdkError::construction_failure(cause("could not build request"));
        assert_eq!(classify_sdk_error(&failure), "construction");
    }

    #[test]
    fn classifies_timeout_error() {
        let failure: TestErr = SdkError::timeout_error(cause("operation timed out"));
        assert_eq!(classify_sdk_error(&failure), "timeout");
    }

    #[test]
    fn classifies_dispatch_failure_timeout() {
        // A connector-level timeout is the most likely shape under runtime
        // saturation; this label is what should drive the "we're starving
        // the AWS SDK connector futures" diagnosis.
        let failure = dispatch(ConnectorError::timeout(cause("connect timed out")));
        assert_eq!(classify_sdk_error(&failure), "dispatch_timeout");
    }

    #[test]
    fn classifies_dispatch_failure_io() {
        let failure = dispatch(ConnectorError::io(cause("connection reset by peer")));
        assert_eq!(classify_sdk_error(&failure), "dispatch_io");
    }

    #[test]
    fn classifies_dispatch_failure_user() {
        let failure = dispatch(ConnectorError::user(cause("invalid endpoint URL")));
        assert_eq!(classify_sdk_error(&failure), "dispatch_user");
    }

    #[test]
    fn classifies_dispatch_failure_other() {
        let failure = dispatch(ConnectorError::other(cause("unexpected"), None));
        assert_eq!(classify_sdk_error(&failure), "dispatch_other");
    }

    /// Stand-in for the kind of detail `DisplayErrorContext` can pull out of
    /// the SDK error source chain (endpoint URL, connector internals,
    /// credential-provider failures). The tests below check that it never
    /// appears in strings produced by the call-site format.
    const SENSITIVE_MARKER: &str = "https://kms.us-east-1.amazonaws.com/internal-endpoint";

    /// Dispatch timeout whose innermost cause text embeds [`SENSITIVE_MARKER`].
    fn dispatch_timeout_with_sensitive_chain() -> TestErr {
        let leaky_cause = io::Error::new(
            io::ErrorKind::TimedOut,
            format!("connect timed out to {SENSITIVE_MARKER}"),
        );
        dispatch(ConnectorError::timeout(Box::new(leaky_cause)))
    }

    // Every AWS call-site builds its *returned* error value like this:
    //     format!("op X failed for key 'Y': {}", classify_sdk_error(&e))
    // This contract test pins that the bounded form never embeds the source
    // chain, which is the security argument for keeping DisplayErrorContext
    // log-only and classify_sdk_error return-safe.
    #[test]
    fn returned_error_string_is_bounded_to_kind_tag() {
        let failure = dispatch_timeout_with_sensitive_chain();
        let kind = classify_sdk_error(&failure);
        let returned = format!(
            "Failed to sign secp256k1 digest for key 'alias/test-key': {}",
            kind
        );

        assert!(
            returned.contains("dispatch_timeout"),
            "returned error should carry the kind tag; got: {returned}"
        );
        assert!(
            !returned.contains(SENSITIVE_MARKER),
            "returned error must not leak the source chain; got: {returned}"
        );
        // Guards against DisplayErrorContext creeping in by accident: it
        // would surface phrases like "connect timed out" from the inner error.
        assert!(
            !returned.contains("connect timed out"),
            "returned error must not embed inner cause text; got: {returned}"
        );
    }

    // The other half of the contract: DisplayErrorContext is *expected* to
    // surface the chain, which is why it is log-only.
    #[test]
    fn display_error_context_does_surface_sensitive_chain() {
        let failure = dispatch_timeout_with_sensitive_chain();
        let rendered = DisplayErrorContext(&failure).to_string();
        assert!(
            rendered.contains(SENSITIVE_MARKER),
            "DisplayErrorContext must surface the chain for ops logs; got: {rendered}"
        );
    }

    #[test]
    fn display_error_context_surfaces_underlying_cause() {
        // Pins the behaviour the helper relies on: DisplayErrorContext must
        // walk source() chains, otherwise prod logs would still collapse to
        // "dispatch failure" and we'd be back where we started.
        let root_cause = io::Error::new(io::ErrorKind::TimedOut, "tcp connect timed out at layer 4");
        let failure = dispatch(ConnectorError::timeout(Box::new(root_cause)));
        let rendered = DisplayErrorContext(&failure).to_string();
        assert!(
            rendered.contains("tcp connect timed out at layer 4"),
            "DisplayErrorContext should expose the inner cause; got: {rendered}"
        );
    }
}