diff --git a/core/src/main/java/org/apache/iceberg/LocationProviders.java b/core/src/main/java/org/apache/iceberg/LocationProviders.java index 19385e1f545c..f735d9bbaba5 100644 --- a/core/src/main/java/org/apache/iceberg/LocationProviders.java +++ b/core/src/main/java/org/apache/iceberg/LocationProviders.java @@ -29,8 +29,6 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; -import static org.apache.iceberg.TableProperties.OBJECT_STORE_PATH; - public class LocationProviders { private LocationProviders() { @@ -68,16 +66,22 @@ public static LocationProvider locationsFor(String location, Map } } - private static String defaultDataLocation(String tableLocation, Map properties) { - return properties.getOrDefault(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, - String.format("%s/data", tableLocation)); - } - static class DefaultLocationProvider implements LocationProvider { private final String dataLocation; DefaultLocationProvider(String tableLocation, Map properties) { - this.dataLocation = stripTrailingSlash(defaultDataLocation(tableLocation, properties)); + this.dataLocation = stripTrailingSlash(dataLocation(properties, tableLocation)); + } + + private static String dataLocation(Map properties, String tableLocation) { + String dataLocation = properties.get(TableProperties.WRITE_DATA_LOCATION); + if (dataLocation == null) { + dataLocation = properties.get(TableProperties.WRITE_FOLDER_STORAGE_LOCATION); + if (dataLocation == null) { + dataLocation = String.format("%s/data", tableLocation); + } + } + return dataLocation; } @Override @@ -99,11 +103,24 @@ static class ObjectStoreLocationProvider implements LocationProvider { private final String context; ObjectStoreLocationProvider(String tableLocation, Map properties) { - this.storageLocation = stripTrailingSlash(properties.getOrDefault(OBJECT_STORE_PATH, - defaultDataLocation(tableLocation, properties))); + this.storageLocation = stripTrailingSlash(dataLocation(properties, tableLocation)); this.context = pathContext(tableLocation); } + private static String dataLocation(Map properties, String tableLocation) { + String dataLocation = properties.get(TableProperties.WRITE_DATA_LOCATION); + if (dataLocation == null) { + dataLocation = properties.get(TableProperties.OBJECT_STORE_PATH); + if (dataLocation == null) { + dataLocation = properties.get(TableProperties.WRITE_FOLDER_STORAGE_LOCATION); + if (dataLocation == null) { + dataLocation = String.format("%s/data", tableLocation); + } + } + } + return dataLocation; + } + @Override public String newDataLocation(PartitionSpec spec, StructLike partitionData, String filename) { return newDataLocation(String.format("%s/%s", spec.partitionToPath(partitionData), filename)); diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index dcbab13cd70d..cfe668a69703 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -135,21 +135,31 @@ private TableProperties() { public static final String OBJECT_STORE_ENABLED = "write.object-storage.enabled"; public static final boolean OBJECT_STORE_ENABLED_DEFAULT = false; + /** + * @deprecated Use {@link #WRITE_DATA_LOCATION} instead. + */ + @Deprecated public static final String OBJECT_STORE_PATH = "write.object-storage.path"; public static final String WRITE_LOCATION_PROVIDER_IMPL = "write.location-provider.impl"; - // This only applies to files written after this property is set. Files previously written aren't - // relocated to reflect this parameter. - // If not set, defaults to a "data" folder underneath the root path of the table. + /** + * @deprecated Use {@link #WRITE_DATA_LOCATION} instead. + */ + @Deprecated public static final String WRITE_FOLDER_STORAGE_LOCATION = "write.folder-storage.path"; /** - * @deprecated will be removed in 0.14.0, use {@link #WRITE_FOLDER_STORAGE_LOCATION} instead + * @deprecated will be removed in 0.14.0, use {@link #WRITE_DATA_LOCATION} instead */ @Deprecated public static final String WRITE_NEW_DATA_LOCATION = "write.folder-storage.path"; + // This only applies to files written after this property is set. Files previously written aren't + // relocated to reflect this parameter. + // If not set, defaults to a "data" folder underneath the root path of the table. + public static final String WRITE_DATA_LOCATION = "write.data.path"; + // This only applies to files written after this property is set. Files previously written aren't // relocated to reflect this parameter. // If not set, defaults to a "metadata" folder underneath the root path of the table. diff --git a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java index e47b0e30f9ea..c20a4df44b07 100644 --- a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java +++ b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java @@ -114,7 +114,7 @@ public void testDefaultLocationProvider() { @Test public void testDefaultLocationProviderWithCustomDataLocation() { this.table.updateProperties() - .set(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, "new_location") + .set(TableProperties.WRITE_DATA_LOCATION, "new_location") .commit(); this.table.locationProvider().newDataLocation("my_file"); @@ -237,5 +237,39 @@ public void testObjectStorageLocationProviderPathResolution() { Assert.assertTrue("object storage path should be used when set", table.locationProvider().newDataLocation("file").contains(objectPath)); + + String dataPath = "s3://random/data/location"; + table.updateProperties() + .set(TableProperties.WRITE_DATA_LOCATION, dataPath) + .commit(); + + Assert.assertTrue("write data path should be used when set", + table.locationProvider().newDataLocation("file").contains(dataPath)); + } + + @Test + public void testDefaultStorageLocationProviderPathResolution() { + table.updateProperties() + .set(TableProperties.OBJECT_STORE_ENABLED, "false") + .commit(); + + Assert.assertTrue("default data location should be used when object storage path not set", + table.locationProvider().newDataLocation("file").contains(table.location() + "/data")); + + String folderPath = "s3://random/folder/location"; + table.updateProperties() + .set(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, folderPath) + .commit(); + + Assert.assertTrue("folder storage path should be used when set", + table.locationProvider().newDataLocation("file").contains(folderPath)); + + String dataPath = "s3://random/data/location"; + table.updateProperties() + .set(TableProperties.WRITE_DATA_LOCATION, dataPath) + .commit(); + + Assert.assertTrue("write data path should be used when set", + table.locationProvider().newDataLocation("file").contains(dataPath)); } } diff --git a/site/docs/aws.md b/site/docs/aws.md index ab37962e2505..7e8588c7be91 100644 --- a/site/docs/aws.md +++ b/site/docs/aws.md @@ -344,7 +344,7 @@ Data stored in S3 with a traditional Hive storage layout can face S3 request thr Iceberg by default uses the Hive storage layout, but can be switched to use the `ObjectStoreLocationProvider`. With `ObjectStoreLocationProvider`, a determenistic hash is generated for each stored file, with the hash appended -directly after the `write.object-storage.path`. This ensures files written to s3 are equally distributed across multiple [prefixes](https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/) in the S3 bucket. Resulting in minimized throttling and maximized throughput for S3-related IO operations. When using `ObjectStoreLocationProvider` having a shared and short `write.object-storage.path` across your Iceberg tables will improve performance. +directly after the `write.data.path`. This ensures files written to s3 are equally distributed across multiple [prefixes](https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/) in the S3 bucket. Resulting in minimized throttling and maximized throughput for S3-related IO operations. When using `ObjectStoreLocationProvider` having a shared and short `write.data.path` across your Iceberg tables will improve performance. For more information on how S3 scales API QPS, checkout the 2018 re:Invent session on [Best Practices for Amazon S3 and Amazon S3 Glacier]( https://youtu.be/rHeTn9pHNKo?t=3219). At [53:39](https://youtu.be/rHeTn9pHNKo?t=3219) it covers how S3 scales/partitions & at [54:50](https://youtu.be/rHeTn9pHNKo?t=3290) it discusses the 30-60 minute wait time before new partitions are created. @@ -358,7 +358,7 @@ CREATE TABLE my_catalog.my_ns.my_table ( USING iceberg OPTIONS ( 'write.object-storage.enabled'=true, - 'write.object-storage.path'='s3://my-table-data-bucket') + 'write.data.path'='s3://my-table-data-bucket') PARTITIONED BY (category); ``` @@ -372,10 +372,10 @@ Which will write the data to S3 with a hash (`2d3905f8`) appended directly after s3://my-table-data-bucket/2d3905f8/my_ns.db/my_table/category=orders/00000-0-5affc076-96a4-48f2-9cd2-d5efbc9f0c94-00001.parquet ``` -Note, the path resolution logic for `ObjectStoreLocationProvider` is as follows: -- if `write.object-storage.path` is set, use it -- if not found, fallback to `write.folder-storage.path` -- if not found, use `/data` +Note, the path resolution logic for `ObjectStoreLocationProvider` is `write.data.path` then `/data`. +However, for the older versions up to 0.12.0, the logic is as follows: +- before 0.12.0, `write.object-storage.path` must be set. +- at 0.12.0, `write.object-storage.path` then `write.folder-storage.path` then `/data`. For more details, please refer to the [LocationProvider Configuration](../custom-catalog/#custom-location-provider-implementation) section. diff --git a/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index cb25074b4d96..8c6e225fa1bf 100644 --- a/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -79,8 +79,7 @@ protected String newTableLocation() { protected String dataLocation() { Map properties = table.properties(); - return properties.getOrDefault(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, - String.format("%s/data", table.location())); + return properties.getOrDefault(TableProperties.WRITE_DATA_LOCATION, String.format("%s/data", table.location())); } protected void cleanupFiles() throws IOException { diff --git a/spark/src/test/java/org/apache/iceberg/actions/TestRemoveOrphanFilesAction.java b/spark/src/test/java/org/apache/iceberg/actions/TestRemoveOrphanFilesAction.java index 7c2a325eb783..cf85bc850420 100644 --- a/spark/src/test/java/org/apache/iceberg/actions/TestRemoveOrphanFilesAction.java +++ b/spark/src/test/java/org/apache/iceberg/actions/TestRemoveOrphanFilesAction.java @@ -285,7 +285,7 @@ public void testWapFilesAreKept() throws InterruptedException { public void testMetadataFolderIsIntact() throws InterruptedException { // write data directly to the table location Map props = Maps.newHashMap(); - props.put(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, tableLocation); + props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); List records = Lists.newArrayList( @@ -357,7 +357,7 @@ public void testOlderThanTimestamp() throws InterruptedException { @Test public void testRemoveUnreachableMetadataVersionFiles() throws InterruptedException { Map props = Maps.newHashMap(); - props.put(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, tableLocation); + props.put(TableProperties.WRITE_DATA_LOCATION, tableLocation); props.put(TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, "1"); Table table = TABLES.create(SCHEMA, SPEC, props, tableLocation); diff --git a/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java b/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java index 8ad8e2e2b802..36c0541f46c5 100644 --- a/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java +++ b/spark/src/test/java/org/apache/iceberg/spark/source/TestDataFrameWrites.java @@ -145,7 +145,7 @@ public void testWriteWithCustomDataLocation() throws IOException { File tablePropertyDataLocation = temp.newFolder("test-table-property-data-dir"); Table table = createTable(new Schema(SUPPORTED_PRIMITIVES.fields()), location); table.updateProperties().set( - TableProperties.WRITE_FOLDER_STORAGE_LOCATION, tablePropertyDataLocation.getAbsolutePath()).commit(); + TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath()).commit(); writeAndValidateWithLocations(table, location, tablePropertyDataLocation); } @@ -271,7 +271,7 @@ public void testNullableWithWriteOption() throws IOException { String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); String targetPath = String.format("%s/nullable_poc/targetFolder/", location.toString()); - tableProperties = ImmutableMap.of(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, targetPath); + tableProperties = ImmutableMap.of(TableProperties.WRITE_DATA_LOCATION, targetPath); // read this and append to iceberg dataset spark @@ -312,7 +312,7 @@ public void testNullableWithSparkSqlOption() throws IOException { String sourcePath = String.format("%s/nullable_poc/sourceFolder/", location.toString()); String targetPath = String.format("%s/nullable_poc/targetFolder/", location.toString()); - tableProperties = ImmutableMap.of(TableProperties.WRITE_FOLDER_STORAGE_LOCATION, targetPath); + tableProperties = ImmutableMap.of(TableProperties.WRITE_DATA_LOCATION, targetPath); // read this and append to iceberg dataset spark diff --git a/spark3/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java b/spark3/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java index fdc5bf09ce4d..1ccb448f1dcc 100644 --- a/spark3/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java +++ b/spark3/src/main/java/org/apache/iceberg/spark/actions/BaseSnapshotTableSparkAction.java @@ -168,6 +168,8 @@ protected Map destTableProps() { properties.remove(LOCATION); properties.remove(TableProperties.WRITE_METADATA_LOCATION); properties.remove(TableProperties.WRITE_FOLDER_STORAGE_LOCATION); + properties.remove(TableProperties.OBJECT_STORE_PATH); + properties.remove(TableProperties.WRITE_DATA_LOCATION); // set default and user-provided props properties.put(TableCatalog.PROP_PROVIDER, "iceberg");