Fix for GRILL-538 - Map and SMB joins with parquet #27

Merged · 1 commit · Aug 6, 2014
@@ -146,7 +146,7 @@ public JobConf pushProjectionsAndFilters(JobConf jobConf, Path path)
if ((part != null) && (part.getTableDesc() != null)) {
Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), cloneJobConf);
}
-pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().toString());
+pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().getPath());
return cloneJobConf;
}
}
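The one-line change above swaps the second argument from path.toUri().toString() to path.toUri().getPath(). A minimal sketch of the difference (not part of the patch; the table location is a hypothetical example, and the point about alias lookup is an assumption based on the PR title): for a fully qualified location, toString() and toUri().toString() both keep the scheme and authority, while toUri().getPath() keeps only the path component, which is presumably the form the map/SMB join path-to-alias lookup expects.

import org.apache.hadoop.fs.Path;

// Sketch only: prints the three string forms of a Hadoop Path.
// The location below is a made-up example, not taken from the patch.
public class PathFormsDemo {
  public static void main(String[] args) {
    Path path = new Path("hdfs://namenode:8020/warehouse/parquet_jointable1");

    // Fully qualified forms: scheme + authority + path
    System.out.println(path.toString());          // hdfs://namenode:8020/warehouse/parquet_jointable1
    System.out.println(path.toUri().toString());  // hdfs://namenode:8020/warehouse/parquet_jointable1

    // Path component only, as now passed to pushProjectionsAndFilters
    System.out.println(path.toUri().getPath());   // /warehouse/parquet_jointable1
  }
}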
@@ -18,6 +18,8 @@
import java.util.List;
import java.util.Map;

+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.parquet.convert.DataWritableRecordConverter;
@@ -46,6 +48,7 @@ public class DataWritableReadSupport extends ReadSupport<ArrayWritable> {

private static final String TABLE_SCHEMA = "table_schema";
public static final String HIVE_SCHEMA_KEY = "HIVE_TABLE_SCHEMA";
+protected static final Log LOG = LogFactory.getLog(DataWritableReadSupport.class);

/**
* From a string which columns names (including hive column), return a list
@@ -93,7 +96,9 @@ public parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration confi

final List<Type> typeListWanted = new ArrayList<Type>();
for (final Integer idx : indexColumnsWanted) {
-typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
+if(idx < listColumns.size()) {
+  typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
+}
}
requestedSchemaByUser = new MessageType(fileSchema.getName(), typeListWanted);

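The added guard skips any requested column index that falls outside this table's column list; previously listColumns.get(idx) would throw an IndexOutOfBoundsException when the pushed-down column ids included indexes beyond the table's own schema, as can happen during map and SMB joins (an assumption based on the JIRA title, not stated in the diff). A minimal, self-contained sketch of the guard with hypothetical values:

import java.util.Arrays;
import java.util.List;

// Sketch only: hypothetical column list and requested indexes.
public class ColumnIndexGuardDemo {
  public static void main(String[] args) {
    List<String> listColumns = Arrays.asList("key", "myvalue");     // columns declared for this table
    List<Integer> indexColumnsWanted = Arrays.asList(0, 1, 2, 3);   // indexes pushed down to the reader

    for (final Integer idx : indexColumnsWanted) {
      if (idx < listColumns.size()) {
        System.out.println("keep " + listColumns.get(idx));         // in range: kept in the requested schema
      } else {
        System.out.println("skip index " + idx);                    // out of range: skipped by the new guard
      }
    }
  }
}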
32 changes: 32 additions & 0 deletions ql/src/test/queries/clientpositive/parquet_join.q
@@ -0,0 +1,32 @@
drop table if exists staging;
drop table if exists parquet_jointable1;
drop table if exists parquet_jointable2;
drop table if exists parquet_jointable1_bucketed_sorted;
drop table if exists parquet_jointable2_bucketed_sorted;

create table staging (key int, value string) stored as textfile;
insert into table staging select * from src order by key limit 2;

create table parquet_jointable1 stored as parquet as select * from staging;

create table parquet_jointable2 stored as parquet as select key,key+1,concat(value,"value") as myvalue from staging;

explain select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;
select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;

set hive.auto.convert.join=true;

explain select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;
select p2.myvalue from parquet_jointable1 p1 join parquet_jointable2 p2 on p1.key=p2.key;

set hive.optimize.bucketmapjoin=true;
set hive.optimize.bucketmapjoin.sortedmerge=true;
set hive.auto.convert.sortmerge.join=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;

create table parquet_jointable1_bucketed_sorted (key int,value string) clustered by (key) sorted by (key ASC) INTO 1 BUCKETS stored as parquet;
insert overwrite table parquet_jointable1_bucketed_sorted select key,concat(value,"value1") as value from staging cluster by key;
create table parquet_jointable2_bucketed_sorted (key int,value1 string, value2 string) clustered by (key) sorted by (key ASC) INTO 1 BUCKETS stored as parquet;
insert overwrite table parquet_jointable2_bucketed_sorted select key,concat(value,"value2-1") as value1,concat(value,"value2-2") as value2 from staging cluster by key;
explain select p1.value,p2.value2 from parquet_jointable1_bucketed_sorted p1 join parquet_jointable2_bucketed_sorted p2 on p1.key=p2.key;
select p1.value,p2.value2 from parquet_jointable1_bucketed_sorted p1 join parquet_jointable2_bucketed_sorted p2 on p1.key=p2.key;