Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/execution/src/object_store.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! ObjectStoreRegistry holds all the object stores at Runtime with a scheme for each store.
19
//! This allows the user to extend DataFusion with different storage systems such as S3 or HDFS
20
//! and query data inside these systems.
21
22
use dashmap::DashMap;
23
use datafusion_common::{exec_err, DataFusionError, Result};
24
#[cfg(not(target_arch = "wasm32"))]
25
use object_store::local::LocalFileSystem;
26
use object_store::ObjectStore;
27
use std::sync::Arc;
28
use url::Url;
29
30
/// A parsed URL identifying a particular [`ObjectStore`] instance
31
///
32
/// For example:
33
/// * `file://` for local file system
34
/// * `s3://bucket` for AWS S3 bucket
35
/// * `oss://bucket` for Aliyun OSS bucket
36
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
37
pub struct ObjectStoreUrl {
38
    url: Url,
39
}
40
41
impl ObjectStoreUrl {
42
    /// Parse an [`ObjectStoreUrl`] from a string
43
    ///
44
    /// # Example
45
    /// ```
46
    /// # use url::Url;
47
    /// # use datafusion_execution::object_store::ObjectStoreUrl;
48
    /// let object_store_url = ObjectStoreUrl::parse("s3://bucket").unwrap();
49
    /// assert_eq!(object_store_url.as_str(), "s3://bucket/");
50
    /// // can also access the underlying `Url`
51
    /// let url: &Url = object_store_url.as_ref();
52
    /// assert_eq!(url.scheme(), "s3");
53
    /// assert_eq!(url.host_str(), Some("bucket"));
54
    /// assert_eq!(url.path(), "/");
55
    /// ```
56
0
    pub fn parse(s: impl AsRef<str>) -> Result<Self> {
57
0
        let mut parsed =
58
0
            Url::parse(s.as_ref()).map_err(|e| DataFusionError::External(Box::new(e)))?;
59
60
0
        let remaining = &parsed[url::Position::BeforePath..];
61
0
        if !remaining.is_empty() && remaining != "/" {
62
0
            return exec_err!(
63
0
                "ObjectStoreUrl must only contain scheme and authority, got: {remaining}"
64
0
            );
65
0
        }
66
0
67
0
        // Always set path for consistency
68
0
        parsed.set_path("/");
69
0
        Ok(Self { url: parsed })
70
0
    }
71
72
    /// An [`ObjectStoreUrl`] for the local filesystem (`file://`)
73
    ///
74
    /// # Example
75
    /// ```
76
    /// # use datafusion_execution::object_store::ObjectStoreUrl;
77
    /// let local_fs = ObjectStoreUrl::parse("file://").unwrap();
78
    /// assert_eq!(local_fs, ObjectStoreUrl::local_filesystem())
79
    /// ```
80
0
    pub fn local_filesystem() -> Self {
81
0
        Self::parse("file://").unwrap()
82
0
    }
83
84
    /// Returns this [`ObjectStoreUrl`] as a string
85
0
    pub fn as_str(&self) -> &str {
86
0
        self.as_ref()
87
0
    }
88
}
89
90
impl AsRef<str> for ObjectStoreUrl {
91
0
    fn as_ref(&self) -> &str {
92
0
        self.url.as_ref()
93
0
    }
94
}
95
96
impl AsRef<Url> for ObjectStoreUrl {
97
0
    fn as_ref(&self) -> &Url {
98
0
        &self.url
99
0
    }
100
}
101
102
impl std::fmt::Display for ObjectStoreUrl {
103
0
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
104
0
        self.as_str().fmt(f)
105
0
    }
106
}
107
108
/// [`ObjectStoreRegistry`] maps a URL to an [`ObjectStore`] instance,
109
/// and allows DataFusion to read from different [`ObjectStore`]
110
/// instances. For example DataFusion might be configured so that
111
///
112
/// 1. `s3://my_bucket/lineitem/` mapped to the `/lineitem` path on an
113
///    AWS S3 object store bound to `my_bucket`
114
///
115
/// 2. `s3://my_other_bucket/lineitem/` mapped to the (same)
116
///    `/lineitem` path on a *different* AWS S3 object store bound to
117
///    `my_other_bucket`
118
///
119
/// When given a [`ListingTableUrl`], DataFusion tries to find an
120
/// appropriate [`ObjectStore`]. For example
121
///
122
/// ```sql
123
/// create external table unicorns stored as parquet location 's3://my_bucket/lineitem/';
124
/// ```
125
///
126
/// In this particular case, the url `s3://my_bucket/lineitem/` will be provided to
127
/// [`ObjectStoreRegistry::get_store`] and one of three things will happen:
128
///
129
/// - If an [`ObjectStore`] has been registered with [`ObjectStoreRegistry::register_store`] with
130
///   `s3://my_bucket`, that [`ObjectStore`] will be returned
131
///
132
/// - If an AWS S3 object store can be ad-hoc discovered by the url `s3://my_bucket/lineitem/`, this
133
///   object store will be registered with key `s3://my_bucket` and returned.
134
///
135
/// - Otherwise an error will be returned, indicating that no suitable [`ObjectStore`] could
136
///   be found
137
///
138
/// This allows for two different use-cases:
139
///
140
/// 1. Systems where object store buckets are explicitly created using DDL can register these
141
///    buckets using [`ObjectStoreRegistry::register_store`]
142
///
143
/// 2. Systems relying on ad-hoc discovery, without corresponding DDL, can create [`ObjectStore`]
144
///    lazily by providing a custom implementation of [`ObjectStoreRegistry`]
145
///
146
/// <!-- is in a different crate so normal rustdoc links don't work -->
147
/// [`ListingTableUrl`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTableUrl.html
148
/// [`ObjectStore`]: object_store::ObjectStore
149
pub trait ObjectStoreRegistry: Send + Sync + std::fmt::Debug + 'static {
150
    /// Register `store` for `url`. If a store with the same key existed before, it is replaced and the previous store is returned
151
    fn register_store(
152
        &self,
153
        url: &Url,
154
        store: Arc<dyn ObjectStore>,
155
    ) -> Option<Arc<dyn ObjectStore>>;
156
157
    /// Get a suitable store for the provided URL. For example:
158
    ///
159
    /// - URL with scheme `file:///` or no scheme will return the default LocalFS store
160
    /// - URL with scheme `s3://bucket/` will return the S3 store
161
    /// - URL with scheme `hdfs://hostname:port/` will return the hdfs store
162
    ///
163
    /// If no [`ObjectStore`] is found for the `url`, ad-hoc discovery may be executed depending on
164
    /// the `url` and [`ObjectStoreRegistry`] implementation. An [`ObjectStore`] may be lazily
165
    /// created and registered.
166
    fn get_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>>;
167
}
168
169
/// The default [`ObjectStoreRegistry`]
170
pub struct DefaultObjectStoreRegistry {
171
    /// A map from URL key (scheme, host and port) to the object store that serves list / read operations for that URL
172
    object_stores: DashMap<String, Arc<dyn ObjectStore>>,
173
}
174
175
impl std::fmt::Debug for DefaultObjectStoreRegistry {
176
0
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
177
0
        f.debug_struct("DefaultObjectStoreRegistry")
178
0
            .field(
179
0
                "schemes",
180
0
                &self
181
0
                    .object_stores
182
0
                    .iter()
183
0
                    .map(|o| o.key().clone())
184
0
                    .collect::<Vec<_>>(),
185
0
            )
186
0
            .finish()
187
0
    }
188
}
189
190
impl Default for DefaultObjectStoreRegistry {
191
937
    fn default() -> Self {
192
937
        Self::new()
193
937
    }
194
}
195
196
impl DefaultObjectStoreRegistry {
197
    /// This will register [`LocalFileSystem`] to handle `file://` paths
198
    #[cfg(not(target_arch = "wasm32"))]
199
937
    pub fn new() -> Self {
200
937
        let object_stores: DashMap<String, Arc<dyn ObjectStore>> = DashMap::new();
201
937
        object_stores.insert("file://".to_string(), Arc::new(LocalFileSystem::new()));
202
937
        Self { object_stores }
203
937
    }
204
205
    /// Default without any backend registered.
206
    #[cfg(target_arch = "wasm32")]
207
    pub fn new() -> Self {
208
        let object_stores: DashMap<String, Arc<dyn ObjectStore>> = DashMap::new();
209
        Self { object_stores }
210
    }
211
}
212
213
///
214
/// Stores are registered based on the scheme, host and port of the provided URL
215
/// with a [`LocalFileSystem::new`] automatically registered for `file://` (if the
216
/// target arch is not `wasm32`).
217
///
218
/// For example:
219
///
220
/// - `file:///my_path` will return the default LocalFS store
221
/// - `s3://bucket/path` will return a store registered with `s3://bucket` if any
222
/// - `hdfs://host:port/path` will return a store registered with `hdfs://host:port` if any
223
impl ObjectStoreRegistry for DefaultObjectStoreRegistry {
224
0
    fn register_store(
225
0
        &self,
226
0
        url: &Url,
227
0
        store: Arc<dyn ObjectStore>,
228
0
    ) -> Option<Arc<dyn ObjectStore>> {
229
0
        let s = get_url_key(url);
230
0
        self.object_stores.insert(s, store)
231
0
    }
232
233
0
    fn get_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>> {
234
0
        let s = get_url_key(url);
235
0
        self.object_stores
236
0
            .get(&s)
237
0
            .map(|o| Arc::clone(o.value()))
238
0
            .ok_or_else(|| {
239
0
                DataFusionError::Internal(format!(
240
0
                    "No suitable object store found for {url}. See `RuntimeEnv::register_object_store`"
241
0
                ))
242
0
            })
243
0
    }
244
}
245
246
/// Get the key of a url for object store registration.
247
/// Credential info (username and password) is not included in the key
248
0
fn get_url_key(url: &Url) -> String {
249
0
    format!(
250
0
        "{}://{}",
251
0
        url.scheme(),
252
0
        &url[url::Position::BeforeHost..url::Position::AfterPort],
253
0
    )
254
0
}
255
256
#[cfg(test)]
257
mod tests {
258
    use super::*;
259
260
    #[test]
261
    fn test_object_store_url() {
262
        let file = ObjectStoreUrl::parse("file://").unwrap();
263
        assert_eq!(file.as_str(), "file:///");
264
265
        let url = ObjectStoreUrl::parse("s3://bucket").unwrap();
266
        assert_eq!(url.as_str(), "s3://bucket/");
267
268
        let url = ObjectStoreUrl::parse("s3://username:password@host:123").unwrap();
269
        assert_eq!(url.as_str(), "s3://username:password@host:123/");
270
271
        let err = ObjectStoreUrl::parse("s3://bucket:invalid").unwrap_err();
272
        assert_eq!(err.strip_backtrace(), "External error: invalid port number");
273
274
        let err = ObjectStoreUrl::parse("s3://bucket?").unwrap_err();
275
        assert_eq!(err.strip_backtrace(), "Execution error: ObjectStoreUrl must only contain scheme and authority, got: ?");
276
277
        let err = ObjectStoreUrl::parse("s3://bucket?foo=bar").unwrap_err();
278
        assert_eq!(err.strip_backtrace(), "Execution error: ObjectStoreUrl must only contain scheme and authority, got: ?foo=bar");
279
280
        let err = ObjectStoreUrl::parse("s3://host:123/foo").unwrap_err();
281
        assert_eq!(err.strip_backtrace(), "Execution error: ObjectStoreUrl must only contain scheme and authority, got: /foo");
282
283
        let err =
284
            ObjectStoreUrl::parse("s3://username:password@host:123/foo").unwrap_err();
285
        assert_eq!(err.strip_backtrace(), "Execution error: ObjectStoreUrl must only contain scheme and authority, got: /foo");
286
    }
287
288
    #[test]
289
    fn test_get_url_key() {
290
        let file = ObjectStoreUrl::parse("file://").unwrap();
291
        let key = get_url_key(&file.url);
292
        assert_eq!(key.as_str(), "file://");
293
294
        let url = ObjectStoreUrl::parse("s3://bucket").unwrap();
295
        let key = get_url_key(&url.url);
296
        assert_eq!(key.as_str(), "s3://bucket");
297
298
        let url = ObjectStoreUrl::parse("s3://username:password@host:123").unwrap();
299
        let key = get_url_key(&url.url);
300
        assert_eq!(key.as_str(), "s3://host:123");
301
    }
302
}