From 67ce2ed97d18dca19e920fdf362ebf3183ab8751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Grzegorz=20Kokosi=C5=84ski?= Date: Mon, 2 Jul 2018 14:19:23 +0200 Subject: [PATCH] Fix reading table with headers in kerberized environments Previously, when a table had a non-zero value set for the header or footer line count, Presto was unable to generate splits for it, raising: Unable to query tables due to Can't get Master Kerberos principal for use as renewer. The error mentioned above was raised from the internals of FileInputFormat. This change avoids using FileInputFormat.getSplits. --- .travis.yml | 2 +- .../hive/BackgroundHiveSplitLoader.java | 23 ++++--------------- .../hive/util/InternalHiveSplitFactory.java | 11 +++++---- .../tables_with_header_and_footer.sql | 2 +- 4 files changed, 13 insertions(+), 25 deletions(-) diff --git a/.travis.yml b/.travis.yml index d417cc3caa9d0..88c35defb99e2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -104,7 +104,7 @@ script: - | if [[ -v PRODUCT_TESTS_SPECIFIC_ENVIRONMENT ]]; then presto-product-tests/bin/run_on_docker.sh \ - singlenode-kerberos-hdfs-impersonation -g storage_formats,cli,hdfs_impersonation,authorization + singlenode-kerberos-hdfs-impersonation -g storage_formats,cli,hdfs_impersonation,authorization,hive_file_header fi - | if [[ -v PRODUCT_TESTS_SPECIFIC_ENVIRONMENT_2 ]]; then diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/BackgroundHiveSplitLoader.java b/presto-hive/src/main/java/com/facebook/presto/hive/BackgroundHiveSplitLoader.java index b1ec3c6b3fb0c..fda9a9646d55a 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/BackgroundHiveSplitLoader.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/BackgroundHiveSplitLoader.java @@ -287,7 +287,6 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) // get the configuration for the target path -- it may be a different hdfs instance FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(hdfsContext, targetPath); JobConf 
targetJob = toJobConf(targetFilesystem.getConf()); - handleFileHeader(schema, targetJob); targetJob.setInputFormat(TextInputFormat.class); targetInputFormat.configure(targetJob); FileInputFormat.setInputPaths(targetJob, targetPath); @@ -332,12 +331,11 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) // To support custom input formats, we want to call getSplits() // on the input format to obtain file splits. - if (shouldUseFileSplitsFromInputFormat(inputFormat) || getHeaderCount(schema) > 0 || getFooterCount(schema) > 0) { + if (shouldUseFileSplitsFromInputFormat(inputFormat)) { if (tableBucketInfo.isPresent()) { throw new PrestoException(NOT_SUPPORTED, "Presto cannot read bucketed partition in an input format with UseFileSplitsFromInputFormat annotation: " + inputFormat.getClass().getSimpleName()); } JobConf jobConf = toJobConf(configuration); - handleFileHeader(schema, jobConf); FileInputFormat.setInputPaths(jobConf, path); InputSplit[] splits = inputFormat.getSplits(jobConf, 0); @@ -349,22 +347,11 @@ private ListenableFuture loadPartition(HivePartitionMetadata partition) return hiveSplitSource.addToQueue(getBucketedSplits(path, fs, splitFactory, tableBucketInfo.get(), bucketConversion)); } - fileIterators.addLast(createInternalHiveSplitIterator(path, fs, splitFactory)); + boolean splittable = getHeaderCount(schema) == 0 && getFooterCount(schema) == 0; + fileIterators.addLast(createInternalHiveSplitIterator(path, fs, splitFactory, splittable)); return COMPLETED_FUTURE; } - private void handleFileHeader(Properties schema, JobConf jobConf) - { - int headerCount = getHeaderCount(schema); - int footerCount = getFooterCount(schema); - if (headerCount > 0 || footerCount > 0) { - // do not split file when skip.header.line.count or skip.footer.line.count is used - jobConf.setLong("mapreduce.input.fileinputformat.split.minsize", Long.MAX_VALUE); - // TODO remove this when Hadoop 1.x is not supported - jobConf.setLong("mapred.min.split.size", 
Long.MAX_VALUE); - } - } - private ListenableFuture addSplitsToSource(InputSplit[] targetSplits, InternalHiveSplitFactory splitFactory) throws IOException { @@ -389,10 +376,10 @@ private static boolean shouldUseFileSplitsFromInputFormat(InputFormat inpu .anyMatch(name -> name.equals("UseFileSplitsFromInputFormat")); } - private Iterator createInternalHiveSplitIterator(Path path, FileSystem fileSystem, InternalHiveSplitFactory splitFactory) + private Iterator createInternalHiveSplitIterator(Path path, FileSystem fileSystem, InternalHiveSplitFactory splitFactory, boolean splittable) { return Streams.stream(new HiveFileIterator(path, fileSystem, directoryLister, namenodeStats, recursiveDirWalkerEnabled ? RECURSE : IGNORED)) - .map(splitFactory::createInternalHiveSplit) + .map(status -> splitFactory.createInternalHiveSplit(status, splittable)) .filter(Optional::isPresent) .map(Optional::get) .iterator(); diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/util/InternalHiveSplitFactory.java b/presto-hive/src/main/java/com/facebook/presto/hive/util/InternalHiveSplitFactory.java index ae9c1ee49fadf..dd19146b5800f 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/util/InternalHiveSplitFactory.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/util/InternalHiveSplitFactory.java @@ -86,18 +86,19 @@ public String getPartitionName() return partitionName; } - public Optional createInternalHiveSplit(LocatedFileStatus status) + public Optional createInternalHiveSplit(LocatedFileStatus status, boolean splittable) { - return createInternalHiveSplit(status, OptionalInt.empty()); + return createInternalHiveSplit(status, OptionalInt.empty(), splittable); } public Optional createInternalHiveSplit(LocatedFileStatus status, int bucketNumber) { - return createInternalHiveSplit(status, OptionalInt.of(bucketNumber)); + return createInternalHiveSplit(status, OptionalInt.of(bucketNumber), false); } - private Optional 
createInternalHiveSplit(LocatedFileStatus status, OptionalInt bucketNumber) + private Optional createInternalHiveSplit(LocatedFileStatus status, OptionalInt bucketNumber, boolean splittable) { + splittable = splittable && isSplittable(inputFormat, fileSystem, status.getPath()); return createInternalHiveSplit( status.getPath(), status.getBlockLocations(), @@ -105,7 +106,7 @@ private Optional createInternalHiveSplit(LocatedFileStatus st status.getLen(), status.getLen(), bucketNumber, - isSplittable(inputFormat, fileSystem, status.getPath())); + splittable); } public Optional createInternalHiveSplit(FileSplit split) diff --git a/presto-product-tests/src/main/resources/sql-tests/testcases/tables_with_header_and_footer.sql b/presto-product-tests/src/main/resources/sql-tests/testcases/tables_with_header_and_footer.sql index 91241aca3b8e0..4773395798f3b 100644 --- a/presto-product-tests/src/main/resources/sql-tests/testcases/tables_with_header_and_footer.sql +++ b/presto-product-tests/src/main/resources/sql-tests/testcases/tables_with_header_and_footer.sql @@ -1,4 +1,4 @@ --- database: presto; tables: table_with_header, table_with_footer, table_with_header_and_footer; groups: hive; +-- database: presto; tables: table_with_header, table_with_footer, table_with_header_and_footer; groups: hive, hive_file_header; --! name: simple_scan with header SELECT count(*) FROM table_with_header --!