openzeppelin_relayer/utils/
aws_error.rs

1//! Helpers for diagnosing AWS SDK errors.
2//!
3//! `SdkError`'s `Display` impl collapses everything below the SDK
4//! (DNS, TCP, TLS, connector pool, credential providers) into a single
5//! short string like `"dispatch failure"`, which makes prod logs nearly
6//! useless for distinguishing root causes.
7//!
8//! This module provides two utilities meant to be paired at every AWS SDK
9//! call-site:
10//!
11//! * [`classify_sdk_error`] — returns a stable, low-cardinality `&'static str`
12//!   suitable for a `tracing` field or metric label, distinguishing the
13//!   actionable subcategories of `DispatchFailure` (timeout / io / user /
14//!   other) from `TimeoutError`, `ServiceError`, etc.
15//! * [`DisplayErrorContext`] — re-export of the SDK's own helper that walks
16//!   the full `std::error::Error::source()` chain so the underlying cause
17//!   (e.g., `connect timed out`, `dns error: failed to lookup address`)
18//!   appears in the log instead of just the top-level wrapper.
19//!
20//! # Caution: log-only — do not embed in returned error values
21//!
22//! `DisplayErrorContext` walks the underlying SDK chain and can surface
23//! internal infrastructure details (endpoint URLs, connector kinds,
24//! credential-provider failures). Keep it confined to `tracing` fields
25//! and metrics. For error values returned to upstream callers — which
26//! ultimately reach API clients via `ApiError::InternalError(err.to_string())` —
27//! prefer the stable kind tag from [`classify_sdk_error`].
28//!
29//! Typical usage:
30//!
31//! ```ignore
32//! tracing::error!(
33//!     error.kind = classify_sdk_error(&err),
34//!     error.detail = %DisplayErrorContext(&err),
35//!     "AWS call failed"
36//! );
37//! // Returned error value carries only the kind tag, not the full chain:
38//! return Err(MyError::Wrapped(format!("op X failed: {}", classify_sdk_error(&err))));
39//! ```
40
41pub use aws_smithy_types::error::display::DisplayErrorContext;
42
43use aws_smithy_runtime_api::client::result::SdkError;
44
45/// Classify an [`SdkError`] into a stable, low-cardinality kind tag.
46///
47/// `DispatchFailure` is split by its underlying [`ConnectorError`] kind so
48/// log aggregators can distinguish a `dispatch_timeout` (likely runtime
49/// starvation or a slow upstream) from a `dispatch_io` (connection reset,
50/// pool exhaustion) without parsing free-form strings.
51///
52/// [`ConnectorError`]: aws_smithy_runtime_api::client::result::ConnectorError
53pub fn classify_sdk_error<E, R>(err: &SdkError<E, R>) -> &'static str {
54    match err {
55        SdkError::ConstructionFailure(_) => "construction",
56        SdkError::TimeoutError(_) => "timeout",
57        SdkError::DispatchFailure(inner) => match inner.as_connector_error() {
58            Some(ce) if ce.is_timeout() => "dispatch_timeout",
59            Some(ce) if ce.is_io() => "dispatch_io",
60            Some(ce) if ce.is_user() => "dispatch_user",
61            Some(_) => "dispatch_other",
62            None => "dispatch_unknown",
63        },
64        SdkError::ResponseError(_) => "response_parse",
65        SdkError::ServiceError(_) => "service",
66        // SdkError is `#[non_exhaustive]`; future variants get a stable label.
67        _ => "unknown",
68    }
69}
70
#[cfg(test)]
mod tests {
    use super::*;
    use aws_smithy_runtime_api::client::orchestrator::HttpResponse;
    use aws_smithy_runtime_api::client::result::ConnectorError;
    use std::convert::Infallible;
    use std::io;

    /// Concrete `SdkError` instantiation shared by every test.
    ///
    /// `E` (the operation error type) is `Infallible` because the
    /// ConstructionFailure / TimeoutError / DispatchFailure variants built
    /// here don't actually carry an `E`. `R` (the response type) is named
    /// explicitly because aws-smithy-runtime-api 1.11+ no longer defaults it.
    type TestErr = SdkError<Infallible, HttpResponse>;

    /// Boxed error type used as the cause for the errors built in these tests.
    type BoxedCause = Box<dyn std::error::Error + Send + Sync>;

    /// Wrap `msg` in an `io::Error` and box it as a generic cause.
    fn boxed(msg: &str) -> BoxedCause {
        let cause = io::Error::other(msg.to_string());
        Box::new(cause)
    }

    /// Classify a `DispatchFailure` built around `connector`.
    fn kind_of_dispatch(connector: ConnectorError) -> &'static str {
        classify_sdk_error(&TestErr::dispatch_failure(connector))
    }

    /// Build a connector-timeout `DispatchFailure` whose innermost cause is
    /// an `io::ErrorKind::TimedOut` error carrying `cause` as its message.
    fn timed_out_dispatch(cause: String) -> TestErr {
        let io_err = io::Error::new(io::ErrorKind::TimedOut, cause);
        TestErr::dispatch_failure(ConnectorError::timeout(Box::new(io_err)))
    }

    // ---- top-level variants -------------------------------------------

    #[test]
    fn classifies_construction_failure() {
        let err = TestErr::construction_failure(boxed("could not build request"));
        assert_eq!(classify_sdk_error(&err), "construction");
    }

    #[test]
    fn classifies_timeout_error() {
        let err = TestErr::timeout_error(boxed("operation timed out"));
        assert_eq!(classify_sdk_error(&err), "timeout");
    }

    // ---- DispatchFailure breakdown ------------------------------------

    #[test]
    fn classifies_dispatch_failure_timeout() {
        // A connector-level timeout is the shape most likely to show up when
        // the runtime is saturated, so this tag is the one that should point
        // at "the AWS SDK connector futures are being starved".
        let kind = kind_of_dispatch(ConnectorError::timeout(boxed("connect timed out")));
        assert_eq!(kind, "dispatch_timeout");
    }

    #[test]
    fn classifies_dispatch_failure_io() {
        let kind = kind_of_dispatch(ConnectorError::io(boxed("connection reset by peer")));
        assert_eq!(kind, "dispatch_io");
    }

    #[test]
    fn classifies_dispatch_failure_user() {
        let kind = kind_of_dispatch(ConnectorError::user(boxed("invalid endpoint URL")));
        assert_eq!(kind, "dispatch_user");
    }

    #[test]
    fn classifies_dispatch_failure_other() {
        let kind = kind_of_dispatch(ConnectorError::other(boxed("unexpected"), None));
        assert_eq!(kind, "dispatch_other");
    }

    // ---- log-only vs. return-safe contract ----------------------------

    /// Stand-in for the sort of detail `DisplayErrorContext` can pull out of
    /// the SDK source chain (endpoint URL, connector internals,
    /// credential-provider failures). The tests below check that it stays
    /// out of strings built the way call-sites build returned errors.
    const SENSITIVE_MARKER: &str = "https://kms.us-east-1.amazonaws.com/internal-endpoint";

    /// Dispatch timeout whose inner cause text embeds [`SENSITIVE_MARKER`].
    fn sensitive_timeout() -> TestErr {
        timed_out_dispatch(format!("connect timed out to {SENSITIVE_MARKER}"))
    }

    // AWS call-sites build the *returned* error value as
    //   format!("op X failed for key 'Y': {}", classify_sdk_error(&e))
    // This contract test pins that such a string contains the kind tag and
    // nothing from the source chain — the security argument for keeping
    // DisplayErrorContext log-only and classify_sdk_error return-safe.
    #[test]
    fn returned_error_string_is_bounded_to_kind_tag() {
        let err = sensitive_timeout();
        let kind = classify_sdk_error(&err);
        let returned = format!(
            "Failed to sign secp256k1 digest for key 'alias/test-key': {}",
            kind
        );

        assert!(
            returned.contains("dispatch_timeout"),
            "returned error should carry the kind tag; got: {returned}"
        );
        assert!(
            !returned.contains(SENSITIVE_MARKER),
            "returned error must not leak the source chain; got: {returned}"
        );
        // Guard against DisplayErrorContext sneaking into the returned value:
        // it would bring along inner-error phrases such as "connect timed out".
        assert!(
            !returned.contains("connect timed out"),
            "returned error must not embed inner cause text; got: {returned}"
        );
    }

    // The other half of the contract: DisplayErrorContext is *supposed* to
    // surface the chain, which is exactly why it is restricted to logs.
    #[test]
    fn display_error_context_does_surface_sensitive_chain() {
        let err = sensitive_timeout();
        let rendered = DisplayErrorContext(&err).to_string();
        assert!(
            rendered.contains(SENSITIVE_MARKER),
            "DisplayErrorContext must surface the chain for ops logs; got: {rendered}"
        );
    }

    #[test]
    fn display_error_context_surfaces_underlying_cause() {
        // Pins the behaviour this module depends on: DisplayErrorContext has
        // to walk source() chains. If it stopped doing so, prod logs would
        // collapse back to "dispatch failure".
        let err = timed_out_dispatch("tcp connect timed out at layer 4".to_string());
        let rendered = DisplayErrorContext(&err).to_string();
        assert!(
            rendered.contains("tcp connect timed out at layer 4"),
            "DisplayErrorContext should expose the inner cause; got: {rendered}"
        );
    }
}