Skip to content

Commit

Permalink
Change Hive CSV escape character to backslash when set to double quote
Browse files Browse the repository at this point in the history
Hive has a bug that changes the escape character to backslash when it is
explicitly set to a double quote during deserialization.  The native CSV
reader is a bug for bug compatible reimplementation of the Hive OpenCSV
reader, so this behavior must be replicated.
  • Loading branch information
dain committed Sep 7, 2023
1 parent c942372 commit 006097f
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@
import java.util.List;
import java.util.Map;

import static com.google.common.base.Preconditions.checkArgument;
import static io.trino.hive.formats.line.csv.CsvConstants.DEFAULT_QUOTE;
import static io.trino.hive.formats.line.csv.CsvConstants.DEFAULT_SEPARATOR;
import static io.trino.hive.formats.line.csv.CsvConstants.DESERIALIZER_DEFAULT_ESCAPE;
import static io.trino.hive.formats.line.csv.CsvConstants.ESCAPE_KEY;
import static io.trino.hive.formats.line.csv.CsvConstants.HIVE_SERDE_CLASS_NAMES;
import static io.trino.hive.formats.line.csv.CsvConstants.QUOTE_KEY;
import static io.trino.hive.formats.line.csv.CsvConstants.SEPARATOR_KEY;
import static io.trino.hive.formats.line.csv.CsvConstants.SERIALIZER_DEFAULT_ESCAPE;
import static io.trino.hive.formats.line.csv.CsvConstants.getCharProperty;

public class CsvDeserializerFactory
Expand All @@ -44,6 +46,15 @@ public LineDeserializer create(List<Column> columns, Map<String, String> serdePr
char separatorChar = getCharProperty(serdeProperties, SEPARATOR_KEY, DEFAULT_SEPARATOR);
char quoteChar = getCharProperty(serdeProperties, QUOTE_KEY, DEFAULT_QUOTE);
char escapeChar = getCharProperty(serdeProperties, ESCAPE_KEY, DESERIALIZER_DEFAULT_ESCAPE);
// Hive has a bug where when the escape character is explicitly set to double quote (char 34),
// it changes the escape character to backslash (char 92) when deserializing.
if (escapeChar == SERIALIZER_DEFAULT_ESCAPE) {
// Add an explicit checks for separator or quote being backslash, so a more helpful error message can be provided
// as this Hive behavior is not obvious
checkArgument(separatorChar != DESERIALIZER_DEFAULT_ESCAPE, "Separator character cannot be '\\' when escape character is '\"'");
checkArgument(quoteChar != DESERIALIZER_DEFAULT_ESCAPE, "Quote character cannot be '\\' when escape character is '\"'");
escapeChar = DESERIALIZER_DEFAULT_ESCAPE;
}
return new CsvDeserializer(columns, separatorChar, quoteChar, escapeChar);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
import static org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_COLUMN_TYPES;
import static org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

public class TestCsvFormat
{
Expand Down Expand Up @@ -129,22 +130,32 @@ public void testCsv()
// For serialization the pipe character is escaped with a quote char, but for deserialization escape character is the backslash character
assertTrinoHiveByteForByte(false, Arrays.asList("|", "a", "b"), Optional.empty(), Optional.of('|'), Optional.empty());
assertTrinoHiveByteForByte(false, Arrays.asList("|", "a", "|"), Optional.empty(), Optional.of('|'), Optional.empty());

// Hive has strange special handling of the escape character. If the escape character is double quote (char 34)
// Hive will change the escape character to backslash (char 92).
assertLine("*a*|*b\\|b*|*c*", Arrays.asList("a", "b|b", "c"), Optional.of('|'), Optional.of('*'), Optional.of('"'));

// Hive does not allow the separator, quote, or escape characters to be the same, but this is checked after the escape character is changed
assertInvalidConfig(Optional.of('\\'), Optional.of('*'), Optional.of('"'));
assertInvalidConfig(Optional.of('*'), Optional.of('\\'), Optional.of('"'));

// Since the escape character is swapped, the quote or separator character can be the same as the original escape character
assertLine("\"a\"|\"b\\\"b\"|\"c\"", Arrays.asList("a", "b\"b", "c"), Optional.of('|'), Optional.of('"'), Optional.of('"'));
assertLine("*a*\"*b\\\"b*\"*c*", Arrays.asList("a", "b\"b", "c"), Optional.of('"'), Optional.of('*'), Optional.of('"'));
}

private static void assertLine(boolean shouldRoundTrip, String csvLine, List<String> expectedValues)
throws Exception
{
assertHiveLine(csvLine, expectedValues, Optional.empty(), Optional.empty(), Optional.empty());
assertTrinoLine(csvLine, expectedValues, Optional.empty(), Optional.empty(), Optional.empty());
assertLine(csvLine, expectedValues, Optional.empty(), Optional.empty(), Optional.empty());
assertTrinoHiveByteForByte(shouldRoundTrip, expectedValues, Optional.empty(), Optional.empty(), Optional.empty());

csvLine = rewriteSpecialChars(csvLine, '_', '|', '~');
expectedValues = expectedValues.stream()
.map(value -> value == null ? null : rewriteSpecialChars(value, '_', '|', '~'))
.collect(Collectors.toList());

assertHiveLine(csvLine, expectedValues, Optional.of('_'), Optional.of('|'), Optional.of('~'));
assertTrinoLine(csvLine, expectedValues, Optional.of('_'), Optional.of('|'), Optional.of('~'));
assertLine(csvLine, expectedValues, Optional.of('_'), Optional.of('|'), Optional.of('~'));
// after switching the special characters the values will round trip
assertTrinoHiveByteForByte(true, expectedValues, Optional.of('_'), Optional.of('|'), Optional.of('~'));
}
Expand Down Expand Up @@ -222,6 +233,13 @@ private static List<Column> createReadColumns(int columnCount)
.toList();
}

private static void assertLine(String csvLine, List<String> expectedValues, Optional<Character> separatorChar, Optional<Character> quoteChar, Optional<Character> escapeChar)
throws SerDeException, IOException
{
assertHiveLine(csvLine, expectedValues, separatorChar, quoteChar, escapeChar);
assertTrinoLine(csvLine, expectedValues, separatorChar, quoteChar, escapeChar);
}

private static void assertHiveLine(String csvLine, List<String> expectedValues, Optional<Character> separatorChar, Optional<Character> quoteChar, Optional<Character> escapeChar)
throws SerDeException
{
Expand All @@ -246,6 +264,16 @@ private static String writeHiveLine(List<String> expectedValues, Optional<Charac
return serializer.serialize(expectedValues, javaObjectInspector).toString();
}

private static void assertInvalidConfig(Optional<Character> separatorChar, Optional<Character> quoteChar, Optional<Character> escapeChar)
{
assertThatThrownBy(() -> createHiveSerDe(3, separatorChar, quoteChar, escapeChar).deserialize(new Text("")))
.isInstanceOf(SerDeException.class)
.hasMessage("java.lang.UnsupportedOperationException: The separator, quote, and escape characters must be different!");
assertThatThrownBy(() -> new CsvDeserializerFactory().create(createReadColumns(3), createCsvProperties(separatorChar, quoteChar, escapeChar)))
.isInstanceOf(IllegalArgumentException.class)
.hasMessageMatching("(Quote|Separator) character cannot be '\\\\' when escape character is '\"'");
}

private static OpenCSVSerde createHiveSerDe(int columnCount, Optional<Character> separatorChar, Optional<Character> quoteChar, Optional<Character> escapeChar)
throws SerDeException
{
Expand Down

0 comments on commit 006097f

Please sign in to comment.