Skip to content

Commit

Permalink
Avoid creating Hive tables with double slashed location
Browse files Browse the repository at this point in the history
Hive connector does not support table locations containing double
slashes. On S3 this leads to correctness issues (e.g. INSERT works, but
SELECT does not find any data).

This commit

- restores normalization of implicit table location during CREATE TABLE.
  There used to be such normalization until 8bd9f75.
- rejects explicit table locations containing double slash during
  `CREATE TABLE .. WITH (external_location = ...)`.
  Before 8bd9f75 there used to be
  normalization also during this flow, but rejecting such unsupported
  locations is deemed more correct.
  • Loading branch information
findepi committed Jun 20, 2023
1 parent 289bf91 commit 431f3e0
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1275,12 +1275,24 @@ private static String validateAvroSchemaLiteral(String avroSchemaLiteral)

private static Location getValidatedExternalLocation(String location)
{
Location validated;
try {
return Location.of(location);
validated = Location.of(location);
}
catch (IllegalArgumentException e) {
throw new TrinoException(INVALID_TABLE_PROPERTY, "External location is not a valid file system URI: " + location, e);
}

// TODO (https://github.com/trinodb/trino/issues/17803) We cannot accept locations with double slash until all relevant Hive connector components are migrated off Hadoop Path.
// Hadoop Path "normalizes location", e.g.:
// - removes double slashes (such locations are rejected),
// - removes trailing slash (such locations are accepted; foo/bar and foo/bar/ are treated as equivalent, and rejecting locations with trailing slash could pose UX issues)
// - replaces file:///<local-path> with file:/<local-path> (such locations are accepted).
if (validated.path().contains("//")) {
throw new TrinoException(INVALID_TABLE_PROPERTY, "Unsupported location that cannot be internally represented: " + location);
}

return validated;
}

private void checkExternalPath(HdfsContext context, Path path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,10 @@ public static Location getTableDefaultLocation(Database database, HdfsContext co
}
}

return Location.of(location).appendPath(escapeTableName(tableName));
// Note: this results in `databaseLocation` being a "normalized location", e.g. not containing double slashes.
// TODO (https://github.com/trinodb/trino/issues/17803): We need to use normalized location until all relevant Hive connector components are migrated off Hadoop Path.
Location databaseLocation = Location.of(databasePath.toString());
return databaseLocation.appendPath(escapeTableName(tableName));
}

public static boolean pathExists(HdfsContext context, HdfsEnvironment hdfsEnvironment, Path path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,9 @@

import java.nio.file.Path;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;

import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.plugin.hive.metastore.glue.GlueHiveMetastore.createTestingGlueHiveMetastore;
import static io.trino.spi.security.SelectedRole.Type.ROLE;
import static io.trino.testing.TestingNames.randomNameSuffix;
Expand Down Expand Up @@ -123,9 +121,14 @@ public void testBasicOperationsWithProvidedTableLocation(boolean partitioned, St
String location = locationPattern.formatted(bucketName, schemaName, tableName);
String partitionQueryPart = (partitioned ? ",partitioned_by = ARRAY['col_int']" : "");

assertUpdate("CREATE TABLE " + tableName + "(col_str, col_int)" +
String create = "CREATE TABLE " + tableName + "(col_str, col_int)" +
"WITH (external_location = '" + location + "'" + partitionQueryPart + ") " +
"AS VALUES ('str1', 1), ('str2', 2), ('str3', 3)", 3);
"AS VALUES ('str1', 1), ('str2', 2), ('str3', 3)";
if (locationPattern.contains("double_slash")) {
assertQueryFails(create, "\\QUnsupported location that cannot be internally represented: " + location);
return;
}
assertUpdate(create, 3);
assertQuery("SELECT * FROM " + tableName, "VALUES ('str1', 1), ('str2', 2), ('str3', 3)");

String actualTableLocation = getTableLocation(tableName);
Expand Down Expand Up @@ -154,7 +157,9 @@ public void testBasicOperationsWithProvidedSchemaLocation(boolean partitioned, S
assertThat(getSchemaLocation(schemaName)).isEqualTo(schemaLocation);

assertUpdate("CREATE TABLE " + qualifiedTableName + "(col_str varchar, col_int int)" + partitionQueryPart);
String expectedTableLocation = (schemaLocation.endsWith("/") ? schemaLocation : schemaLocation + "/") + tableName;
String expectedTableLocation = ((schemaLocation.endsWith("/") ? schemaLocation : schemaLocation + "/") + tableName)
// Hive normalizes double slash
.replaceAll("(?<!(s3:))//", "/");

String actualTableLocation = metastore.getTable(schemaName, tableName).orElseThrow().getStorage().getLocation();
assertThat(actualTableLocation).matches(expectedTableLocation);
Expand All @@ -179,16 +184,33 @@ public void testMergeWithProvidedTableLocation(boolean partitioned, String locat
// Row-level modifications are not supported for Hive tables
}

@Override
public void testOptimizeWithProvidedTableLocation(boolean partitioned, String locationPattern)
{
if (locationPattern.contains("double_slash")) {
assertThatThrownBy(() -> super.testOptimizeWithProvidedTableLocation(partitioned, locationPattern))
.hasMessageStartingWith("Unsupported location that cannot be internally represented: ")
.hasStackTraceContaining("SQL: CREATE TABLE test_optimize_");
return;
}
super.testOptimizeWithProvidedTableLocation(partitioned, locationPattern);
}

@Test(dataProvider = "locationPatternsDataProvider")
public void testAnalyzeWithProvidedTableLocation(boolean partitioned, String locationPattern)
{
String tableName = "test_analyze_" + randomNameSuffix();
String location = locationPattern.formatted(bucketName, schemaName, tableName);
String partitionQueryPart = (partitioned ? ",partitioned_by = ARRAY['col_int']" : "");

assertUpdate("CREATE TABLE " + tableName + "(col_str, col_int)" +
String create = "CREATE TABLE " + tableName + "(col_str, col_int)" +
"WITH (external_location = '" + location + "'" + partitionQueryPart + ") " +
"AS VALUES ('str1', 1), ('str2', 2), ('str3', 3)", 3);
"AS VALUES ('str1', 1), ('str2', 2), ('str3', 3)";
if (locationPattern.contains("double_slash")) {
assertQueryFails(create, "\\QUnsupported location that cannot be internally represented: " + location);
return;
}
assertUpdate(create, 3);

assertUpdate("INSERT INTO " + tableName + " VALUES ('str4', 4)", 1);
assertQuery("SELECT * FROM " + tableName, "VALUES ('str1', 1), ('str2', 2), ('str3', 3), ('str4', 4)");
Expand Down Expand Up @@ -286,13 +308,4 @@ public void testSchemaNameEscape()

assertUpdate("DROP SCHEMA \"" + schemaName + "\"");
}

@Override
protected List<String> locationPatterns()
{
// TODO https://github.com/trinodb/trino/issues/17803 Fix correctness issue with double slash
return super.locationPatterns().stream()
.filter(location -> !location.contains("double_slash"))
.collect(toImmutableList());
}
}

0 comments on commit 431f3e0

Please sign in to comment.