Fix for GRILL-538 - Map and SMB joins with parquet #27

Merged · 1 commit · Aug 6, 2014
@@ -146,7 +146,7 @@ public JobConf pushProjectionsAndFilters(JobConf jobConf, Path path)
if ((part != null) && (part.getTableDesc() != null)) {
Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), cloneJobConf);
}
-pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().toString());
+pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().getPath());
return cloneJobConf;
}
}
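The one-line change above swaps the second argument from path.toUri().toString() to path.toUri().getPath(). A minimal sketch of the difference (not part of the patch; the table location is a hypothetical example, and the point about alias lookup is an assumption based on the PR title): for a fully qualified location, toString() and toUri().toString() both keep the scheme and authority, while toUri().getPath() keeps only the path component, which is presumably the form the map/SMB join path-to-alias lookup expects.

import org.apache.hadoop.fs.Path;

// Sketch only: prints the three string forms of a Hadoop Path.
// The location below is a made-up example, not taken from the patch.
public class PathFormsDemo {
  public static void main(String[] args) {
    Path path = new Path("hdfs://namenode:8020/warehouse/parquet_jointable1");

    // Fully qualified forms: scheme + authority + path
    System.out.println(path.toString());          // hdfs://namenode:8020/warehouse/parquet_jointable1
    System.out.println(path.toUri().toString());  // hdfs://namenode:8020/warehouse/parquet_jointable1

    // Path component only, as now passed to pushProjectionsAndFilters
    System.out.println(path.toUri().getPath());   // /warehouse/parquet_jointable1
  }
}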
@@ -18,6 +18,8 @@
import java.util.List;
import java.util.Map;

+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.parquet.convert.DataWritableRecordConverter;
@@ -46,6 +48,7 @@ public class DataWritableReadSupport extends ReadSupport<ArrayWritable> {

private static final String TABLE_SCHEMA = "table_schema";
public static final String HIVE_SCHEMA_KEY = "HIVE_TABLE_SCHEMA";
+protected static final Log LOG = LogFactory.getLog(DataWritableReadSupport.class);

/**
* From a string which columns names (including hive column), return a list
@@ -93,7 +96,9 @@ public parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration confi

final List<Type> typeListWanted = new ArrayList<Type>();
for (final Integer idx : indexColumnsWanted) {
-typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
+if(idx < listColumns.size()) {
+  typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
+}
}
requestedSchemaByUser = new MessageType(fileSchema.getName(), typeListWanted);

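The added guard skips any requested column index that falls outside this table's column list; previously listColumns.get(idx) would throw an IndexOutOfBoundsException when the pushed-down column ids included indexes beyond the table's own schema, as can happen during map and SMB joins (an assumption based on the JIRA title, not stated in the diff). A minimal, self-contained sketch of the guard with hypothetical values:

import java.util.Arrays;
import java.util.List;

// Sketch only: hypothetical column list and requested indexes.
public class ColumnIndexGuardDemo {
  public static void main(String[] args) {
    List<String> listColumns = Arrays.asList("key", "myvalue");     // columns declared for this table
    List<Integer> indexColumnsWanted = Arrays.asList(0, 1, 2, 3);   // indexes pushed down to the reader

    for (final Integer idx : indexColumnsWanted) {
      if (idx < listColumns.size()) {
        System.out.println("keep " + listColumns.get(idx));         // in range: kept in the requested schema
      } else {
        System.out.println("skip index " + idx);                    // out of range: skipped by the new guard
      }
    }
  }
}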
32 changes: 32 additions & 0 deletions ql/src/test/queries/clientpositive/parquet_join.q
@@ -0,0 +1,32 @@
drop table if exists staging;
drop table if exists parquet_jointable1;
drop table if exists parquet_jointable2;
drop table if exists parquet_jointable1_bucketed_sorted;
drop table if exists parquet_jointable2_bucketed_sorted;

create table staging (key int, value string) stored as textfile;
insert into table staging select * from src order by key limit 2;

create table parquet_jointable1 stored as parquet as select * from staging;

create table parquet_jointable2 stored as parquet as select key,key+1,concat(value,"value") as myvalue from staging;

explain select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;
select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;

set hive.auto.convert.join=true;

explain select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;
select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;

set hive.optimize.bucketmapjoin=true;
set hive.optimize.bucketmapjoin.sortedmerge=true;
set hive.auto.convert.sortmerge.join=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;

create table parquet_jointable1_bucketed_sorted (key int,value string) clustered by (key) sorted by (key ASC) INTO 1 BUCKETS stored as parquet;
insert overwrite table parquet_jointable1_bucketed_sorted select key,concat(value,"value1") as value from staging cluster by key;
create table parquet_jointable2_bucketed_sorted (key int,value1 string, value2 string) clustered by (key) sorted by (key ASC) INTO 1 BUCKETS stored as parquet;
insert overwrite table parquet_jointable2_bucketed_sorted select key,concat(value,"value2-1") as value1,concat(value,"value2-2") as value2 from staging cluster by key;
explain select p1.value,p2.value2 from parquet_jointable1_bucketed_sorted p1 join parquet_jointable2_bucketed_sorted p2 on p1.key=p2.key;
select p1.value,p2.value2 from parquet_jointable1_bucketed_sorted p1 join parquet_jointable2_bucketed_sorted p2 on p1.key=p2.key;