/Users/andrewlamb/Software/datafusion/datafusion/execution/src/object_store.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! ObjectStoreRegistry holds all the object stores at Runtime with a scheme for each store. |
19 | | //! This allows the user to extend DataFusion with different storage systems such as S3 or HDFS |
20 | | //! and query data inside these systems. |
21 | | |
22 | | use dashmap::DashMap; |
23 | | use datafusion_common::{exec_err, DataFusionError, Result}; |
24 | | #[cfg(not(target_arch = "wasm32"))] |
25 | | use object_store::local::LocalFileSystem; |
26 | | use object_store::ObjectStore; |
27 | | use std::sync::Arc; |
28 | | use url::Url; |
29 | | |
30 | | /// A parsed URL identifying a particular [`ObjectStore`] instance |
31 | | /// |
32 | | /// For example: |
33 | | /// * `file://` for local file system |
34 | | /// * `s3://bucket` for AWS S3 bucket |
35 | | /// * `oss://bucket` for Aliyun OSS bucket |
36 | | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] |
37 | | pub struct ObjectStoreUrl { |
38 | | url: Url, |
39 | | } |
40 | | |
41 | | impl ObjectStoreUrl { |
42 | | /// Parse an [`ObjectStoreUrl`] from a string |
43 | | /// |
44 | | /// # Example |
45 | | /// ``` |
46 | | /// # use url::Url; |
47 | | /// # use datafusion_execution::object_store::ObjectStoreUrl; |
48 | | /// let object_store_url = ObjectStoreUrl::parse("s3://bucket").unwrap(); |
49 | | /// assert_eq!(object_store_url.as_str(), "s3://bucket/"); |
50 | | /// // can also access the underlying `Url` |
51 | | /// let url: &Url = object_store_url.as_ref(); |
52 | | /// assert_eq!(url.scheme(), "s3"); |
53 | | /// assert_eq!(url.host_str(), Some("bucket")); |
54 | | /// assert_eq!(url.path(), "/"); |
55 | | /// ``` |
56 | 0 | pub fn parse(s: impl AsRef<str>) -> Result<Self> { |
57 | 0 | let mut parsed = |
58 | 0 | Url::parse(s.as_ref()).map_err(|e| DataFusionError::External(Box::new(e)))?; |
59 | | |
60 | 0 | let remaining = &parsed[url::Position::BeforePath..]; |
61 | 0 | if !remaining.is_empty() && remaining != "/" { |
62 | 0 | return exec_err!( |
63 | 0 | "ObjectStoreUrl must only contain scheme and authority, got: {remaining}" |
64 | 0 | ); |
65 | 0 | } |
66 | 0 |
|
67 | 0 | // Always set path for consistency |
68 | 0 | parsed.set_path("/"); |
69 | 0 | Ok(Self { url: parsed }) |
70 | 0 | } |
71 | | |
72 | | /// An [`ObjectStoreUrl`] for the local filesystem (`file://`) |
73 | | /// |
74 | | /// # Example |
75 | | /// ``` |
76 | | /// # use datafusion_execution::object_store::ObjectStoreUrl; |
77 | | /// let local_fs = ObjectStoreUrl::parse("file://").unwrap(); |
78 | | /// assert_eq!(local_fs, ObjectStoreUrl::local_filesystem()) |
79 | | /// ``` |
80 | 0 | pub fn local_filesystem() -> Self { |
81 | 0 | Self::parse("file://").unwrap() |
82 | 0 | } |
83 | | |
84 | | /// Returns this [`ObjectStoreUrl`] as a string |
85 | 0 | pub fn as_str(&self) -> &str { |
86 | 0 | self.as_ref() |
87 | 0 | } |
88 | | } |
89 | | |
90 | | impl AsRef<str> for ObjectStoreUrl { |
91 | 0 | fn as_ref(&self) -> &str { |
92 | 0 | self.url.as_ref() |
93 | 0 | } |
94 | | } |
95 | | |
96 | | impl AsRef<Url> for ObjectStoreUrl { |
97 | 0 | fn as_ref(&self) -> &Url { |
98 | 0 | &self.url |
99 | 0 | } |
100 | | } |
101 | | |
102 | | impl std::fmt::Display for ObjectStoreUrl { |
103 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
104 | 0 | self.as_str().fmt(f) |
105 | 0 | } |
106 | | } |
107 | | |
108 | | /// [`ObjectStoreRegistry`] maps a URL to an [`ObjectStore`] instance, |
109 | | /// and allows DataFusion to read from different [`ObjectStore`] |
110 | | /// instances. For example DataFusion might be configured so that |
111 | | /// |
112 | | /// 1. `s3://my_bucket/lineitem/` mapped to the `/lineitem` path on an |
113 | | /// AWS S3 object store bound to `my_bucket` |
114 | | /// |
115 | | /// 2. `s3://my_other_bucket/lineitem/` mapped to the (same) |
116 | | /// `/lineitem` path on a *different* AWS S3 object store bound to |
117 | | /// `my_other_bucket` |
118 | | /// |
119 | | /// When given a [`ListingTableUrl`], DataFusion tries to find an |
120 | | /// appropriate [`ObjectStore`]. For example |
121 | | /// |
122 | | /// ```sql |
123 | | /// create external table unicorns stored as parquet location 's3://my_bucket/lineitem/'; |
124 | | /// ``` |
125 | | /// |
126 | | /// In this particular case, the url `s3://my_bucket/lineitem/` will be provided to |
127 | | /// [`ObjectStoreRegistry::get_store`] and one of three things will happen: |
128 | | /// |
129 | | /// - If an [`ObjectStore`] has been registered with [`ObjectStoreRegistry::register_store`] with |
130 | | /// `s3://my_bucket`, that [`ObjectStore`] will be returned |
131 | | /// |
132 | | /// - If an AWS S3 object store can be ad-hoc discovered by the url `s3://my_bucket/lineitem/`, this |
133 | | /// object store will be registered with key `s3://my_bucket` and returned. |
134 | | /// |
135 | | /// - Otherwise an error will be returned, indicating that no suitable [`ObjectStore`] could |
136 | | /// be found |
137 | | /// |
138 | | /// This allows for two different use-cases: |
139 | | /// |
140 | | /// 1. Systems where object store buckets are explicitly created using DDL, can register these |
141 | | /// buckets using [`ObjectStoreRegistry::register_store`] |
142 | | /// |
143 | | /// 2. Systems relying on ad-hoc discovery, without corresponding DDL, can create [`ObjectStore`] |
144 | | /// lazily by providing a custom implementation of [`ObjectStoreRegistry`] |
145 | | /// |
146 | | /// <!-- is in a different crate so normal rustdoc links don't work --> |
147 | | /// [`ListingTableUrl`]: https://docs.rs/datafusion/latest/datafusion/datasource/listing/struct.ListingTableUrl.html |
148 | | /// [`ObjectStore`]: object_store::ObjectStore |
149 | | pub trait ObjectStoreRegistry: Send + Sync + std::fmt::Debug + 'static { |
150 | | /// If a store with the same key existed before, it is replaced and returned |
151 | | fn register_store( |
152 | | &self, |
153 | | url: &Url, |
154 | | store: Arc<dyn ObjectStore>, |
155 | | ) -> Option<Arc<dyn ObjectStore>>; |
156 | | |
157 | | /// Get a suitable store for the provided URL. For example: |
158 | | /// |
159 | | /// - URL with scheme `file:///` or no scheme will return the default LocalFS store |
160 | | /// - URL with scheme `s3://bucket/` will return the S3 store |
161 | | /// - URL with scheme `hdfs://hostname:port/` will return the hdfs store |
162 | | /// |
163 | | /// If no [`ObjectStore`] found for the `url`, ad-hoc discovery may be executed depending on |
164 | | /// the `url` and [`ObjectStoreRegistry`] implementation. An [`ObjectStore`] may be lazily |
165 | | /// created and registered. |
166 | | fn get_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>>; |
167 | | } |
168 | | |
169 | | /// The default [`ObjectStoreRegistry`] |
170 | | pub struct DefaultObjectStoreRegistry { |
171 | | /// A map from scheme to object store that serve list / read operations for the store |
172 | | object_stores: DashMap<String, Arc<dyn ObjectStore>>, |
173 | | } |
174 | | |
175 | | impl std::fmt::Debug for DefaultObjectStoreRegistry { |
176 | 0 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
177 | 0 | f.debug_struct("DefaultObjectStoreRegistry") |
178 | 0 | .field( |
179 | 0 | "schemes", |
180 | 0 | &self |
181 | 0 | .object_stores |
182 | 0 | .iter() |
183 | 0 | .map(|o| o.key().clone()) |
184 | 0 | .collect::<Vec<_>>(), |
185 | 0 | ) |
186 | 0 | .finish() |
187 | 0 | } |
188 | | } |
189 | | |
190 | | impl Default for DefaultObjectStoreRegistry { |
191 | 937 | fn default() -> Self { |
192 | 937 | Self::new() |
193 | 937 | } |
194 | | } |
195 | | |
196 | | impl DefaultObjectStoreRegistry { |
197 | | /// This will register [`LocalFileSystem`] to handle `file://` paths |
198 | | #[cfg(not(target_arch = "wasm32"))] |
199 | 937 | pub fn new() -> Self { |
200 | 937 | let object_stores: DashMap<String, Arc<dyn ObjectStore>> = DashMap::new(); |
201 | 937 | object_stores.insert("file://".to_string(), Arc::new(LocalFileSystem::new())); |
202 | 937 | Self { object_stores } |
203 | 937 | } |
204 | | |
205 | | /// Default without any backend registered. |
206 | | #[cfg(target_arch = "wasm32")] |
207 | | pub fn new() -> Self { |
208 | | let object_stores: DashMap<String, Arc<dyn ObjectStore>> = DashMap::new(); |
209 | | Self { object_stores } |
210 | | } |
211 | | } |
212 | | |
213 | | /// |
214 | | /// Stores are registered based on the scheme, host and port of the provided URL |
215 | | /// with a [`LocalFileSystem::new`] automatically registered for `file://` (if the |
216 | | /// target arch is not `wasm32`). |
217 | | /// |
218 | | /// For example: |
219 | | /// |
220 | | /// - `file:///my_path` will return the default LocalFS store |
221 | | /// - `s3://bucket/path` will return a store registered with `s3://bucket` if any |
222 | | /// - `hdfs://host:port/path` will return a store registered with `hdfs://host:port` if any |
223 | | impl ObjectStoreRegistry for DefaultObjectStoreRegistry { |
224 | 0 | fn register_store( |
225 | 0 | &self, |
226 | 0 | url: &Url, |
227 | 0 | store: Arc<dyn ObjectStore>, |
228 | 0 | ) -> Option<Arc<dyn ObjectStore>> { |
229 | 0 | let s = get_url_key(url); |
230 | 0 | self.object_stores.insert(s, store) |
231 | 0 | } |
232 | | |
233 | 0 | fn get_store(&self, url: &Url) -> Result<Arc<dyn ObjectStore>> { |
234 | 0 | let s = get_url_key(url); |
235 | 0 | self.object_stores |
236 | 0 | .get(&s) |
237 | 0 | .map(|o| Arc::clone(o.value())) |
238 | 0 | .ok_or_else(|| { |
239 | 0 | DataFusionError::Internal(format!( |
240 | 0 | "No suitable object store found for {url}. See `RuntimeEnv::register_object_store`" |
241 | 0 | )) |
242 | 0 | }) |
243 | 0 | } |
244 | | } |
245 | | |
246 | | /// Get the key of a url for object store registration. |
247 | | /// The credential info will be removed |
248 | 0 | fn get_url_key(url: &Url) -> String { |
249 | 0 | format!( |
250 | 0 | "{}://{}", |
251 | 0 | url.scheme(), |
252 | 0 | &url[url::Position::BeforeHost..url::Position::AfterPort], |
253 | 0 | ) |
254 | 0 | } |
255 | | |
#[cfg(test)]
mod tests {
    use super::*;

    /// `ObjectStoreUrl::parse` normalizes valid input and rejects anything
    /// beyond scheme and authority
    #[test]
    fn test_object_store_url() {
        // (input, expected normalized form)
        let accepted = [
            ("file://", "file:///"),
            ("s3://bucket", "s3://bucket/"),
            (
                "s3://username:password@host:123",
                "s3://username:password@host:123/",
            ),
        ];
        for (input, expected) in accepted {
            let parsed = ObjectStoreUrl::parse(input).unwrap();
            assert_eq!(parsed.as_str(), expected);
        }

        // (input, expected error message without backtrace)
        let rejected = [
            ("s3://bucket:invalid", "External error: invalid port number"),
            (
                "s3://bucket?",
                "Execution error: ObjectStoreUrl must only contain scheme and authority, got: ?",
            ),
            (
                "s3://bucket?foo=bar",
                "Execution error: ObjectStoreUrl must only contain scheme and authority, got: ?foo=bar",
            ),
            (
                "s3://host:123/foo",
                "Execution error: ObjectStoreUrl must only contain scheme and authority, got: /foo",
            ),
            (
                "s3://username:password@host:123/foo",
                "Execution error: ObjectStoreUrl must only contain scheme and authority, got: /foo",
            ),
        ];
        for (input, expected) in rejected {
            let err = ObjectStoreUrl::parse(input).unwrap_err();
            assert_eq!(err.strip_backtrace(), expected);
        }
    }

    /// `get_url_key` keeps scheme, host and port and drops credentials
    #[test]
    fn test_get_url_key() {
        // (input, expected registry key)
        let cases = [
            ("file://", "file://"),
            ("s3://bucket", "s3://bucket"),
            ("s3://username:password@host:123", "s3://host:123"),
        ];
        for (input, expected) in cases {
            let parsed = ObjectStoreUrl::parse(input).unwrap();
            let key = get_url_key(&parsed.url);
            assert_eq!(key.as_str(), expected);
        }
    }
}