HIVE-28224: Upgrade Orc version in Hive to 1.9.3
Dmitriy Fingerman committed Apr 30, 2024
1 parent c7bf32a commit 9195137
Showing 18 changed files with 57 additions and 39 deletions.
@@ -14,6 +14,8 @@
--! qt:replace:/(MAJOR\s+succeeded\s+)[a-zA-Z0-9\-\.\s+]+(\s+manual)/$1#Masked#$2/
-- Mask compaction id as they will be allocated in parallel threads
--! qt:replace:/^[0-9]/#Masked#/
+ -- Mask removed file size
+ --! qt:replace:/(\S\"removed-files-size\\\":\\\")(\d+)(\\\")/$1#Masked#$3/

set hive.llap.io.enabled=true;
set hive.vectorized.execution.enabled=true;
@@ -227,7 +227,7 @@ Table Parameters:
bucketing_version 2
current-schema {\"type\":\"struct\",\"schema-id\":2,\"fields\":[{\"id\":1,\"name\":\"fname\",\"required\":false,\"type\":\"string\"},{\"id\":2,\"name\":\"last_name\",\"required\":false,\"type\":\"string\"},{\"id\":3,\"name\":\"dept_id\",\"required\":false,\"type\":\"long\"},{\"id\":4,\"name\":\"address\",\"required\":false,\"type\":\"string\"}]}
current-snapshot-id #Masked#
- current-snapshot-summary {\"deleted-data-files\":\"6\",\"deleted-records\":\"6\",\"removed-files-size\":\"3167\",\"changed-partition-count\":\"2\",\"total-records\":\"10\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"10\",\"total-delete-files\":\"8\",\"total-position-deletes\":\"8\",\"total-equality-deletes\":\"0\"}
+ current-snapshot-summary {\"deleted-data-files\":\"6\",\"deleted-records\":\"6\",\"removed-files-size\":\"#Masked#\",\"changed-partition-count\":\"2\",\"total-records\":\"10\",\"total-files-size\":\"#Masked#\",\"total-data-files\":\"10\",\"total-delete-files\":\"8\",\"total-position-deletes\":\"8\",\"total-equality-deletes\":\"0\"}
current-snapshot-timestamp-ms #Masked#
default-partition-spec {\"spec-id\":0,\"fields\":[{\"name\":\"dept_id\",\"transform\":\"identity\",\"source-id\":3,\"field-id\":1000}]}
format-version 2
4 changes: 2 additions & 2 deletions pom.xml
@@ -180,8 +180,8 @@
<mysql.version>8.0.31</mysql.version>
<postgres.version>42.7.3</postgres.version>
<oracle.version>21.3.0.0</oracle.version>
- <opencsv.version>2.3</opencsv.version>
- <orc.version>1.8.5</orc.version>
+ <opencsv.version>5.9</opencsv.version>
+ <orc.version>1.9.3</orc.version>
<mockito-core.version>3.4.4</mockito-core.version>
<mockito-inline.version>4.11.0</mockito-inline.version>
<mina.version>2.0.0-M5</mina.version>
16 changes: 11 additions & 5 deletions ql/pom.xml
@@ -536,11 +536,6 @@
<groupId>stax</groupId>
<artifactId>stax-api</artifactId>
</dependency>
- <dependency>
- <groupId>net.sf.opencsv</groupId>
- <artifactId>opencsv</artifactId>
- <version>${opencsv.version}</version>
- </dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-standalone-metastore-server</artifactId>
@@ -759,6 +754,17 @@
</exclusion>
</exclusions>
</dependency>
+ <dependency>
+ <groupId>com.opencsv</groupId>
+ <artifactId>opencsv</artifactId>
+ <version>${opencsv.version}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>commons-beanutils</groupId>
+ <artifactId>commons-beanutils</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
<dependency>
<groupId>org.apache.tez</groupId>
<artifactId>tez-dag</artifactId>
@@ -1,5 +1,6 @@
--! qt:dataset:src
--! qt:dataset:alltypesorc
+ --! qt:replace:/(\s+totalSize\s+)\S+(\s+)/$1#Masked#$2/
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
@@ -1,6 +1,6 @@
-- Try to run incremental on a non-transactional MV in presence of delete operations
-- Compiler should fall back to full rebuild.
-
+ --! qt:replace:/(\S Data size\:\s+)\S+(\s+Basic stats\: \S+ Column stats\: \S+)/$1#Masked#$2/
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;

@@ -1,3 +1,4 @@
+ --! qt:replace:/(\s+totalSize\s+)\S+(\s+)/$1#Masked#$2/
-- Test Incremental rebuild of materialized view without aggregate when source tables have
-- 1) insert operations only
-- 2) update/delete operations since last rebuild.
1 change: 1 addition & 0 deletions ql/src/test/queries/clientpositive/orc_llap_nonvector.q
@@ -1,4 +1,5 @@
--! qt:dataset:alltypesorc
+ --! qt:replace:/(\S Data size\:\s+)\S+(\s+Basic stats\: \S+ Column stats\: \S+)/$1#Masked#$2/

set hive.vectorized.execution.enabled=false;
set hive.mapred.mode=nonstrict;
1 change: 1 addition & 0 deletions ql/src/test/queries/clientpositive/orc_merge12.q
@@ -1,4 +1,5 @@
--! qt:replace:/(File Version:)(.+)/$1#Masked#/
+ --! qt:replace:/(File length:\s+)\S+(\s+bytes)/$1#Masked#$2/
set hive.vectorized.execution.enabled=false;

CREATE TABLE `alltypesorc3xcols`(
2 changes: 2 additions & 0 deletions ql/src/test/queries/clientpositive/stats_part.q
@@ -1,3 +1,5 @@
+ -- Mask the totalSize value as it can have slight variability, causing test flakiness
+ --! qt:replace:/(\s+totalSize\s+)\S+(\s+)/$1#Masked#$2/
set hive.stats.dbclass=fs;
set hive.stats.fetch.column.stats=true;
set datanucleus.cache.collections=false;
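
For reference, a minimal sketch of how a qt:replace mask directive of this shape is assumed to behave: the qtest harness appears to apply each /pattern/replacement/ pair as a Java regex substitution to every line of the .q.out file before comparison (the Iceberg removed-files-size mask earlier in this commit works the same way). The sample line and values below are illustrative only, not from this commit:

// Hedged sketch: assumes the harness performs a per-line Java
// replaceAll() for each "--! qt:replace:/regex/repl/" directive.
String out = "\ttotalSize           3004637   ";
String masked = out.replaceAll("(\\s+totalSize\\s+)\\S+(\\s+)", "$1#Masked#$2");
// masked == "\ttotalSize           #Masked#   " -- so the slightly different
// byte counts produced by the new ORC writer no longer break the golden files.
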
@@ -79,17 +79,17 @@ STAGE PLANS:
TableScan
alias: t1
filterExpr: (b = 1) (type: boolean)
- Statistics: Num rows: 69 Data size: 13710 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 70 Data size: #Masked# Basic stats: COMPLETE Column stats: NONE
Filter Operator
predicate: (b = 1) (type: boolean)
- Statistics: Num rows: 1 Data size: 198 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: a (type: int), 1 (type: int)
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1 Data size: 198 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
- Statistics: Num rows: 1 Data size: 198 Basic stats: COMPLETE Column stats: NONE
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: NONE
table:
input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
24 changes: 12 additions & 12 deletions ql/src/test/results/clientpositive/llap/orc_llap_nonvector.q.out
@@ -94,17 +94,17 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: orc_llap_nonvector
- Statistics: Num rows: 12288 Data size: 2942394 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 12288 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
Limit
Number of rows: 100
- Statistics: Num rows: 100 Data size: 24360 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 100 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: ctinyint (type: tinyint), csmallint (type: smallint), cint (type: int), cbigint (type: bigint), cfloat (type: float), cdouble (type: double), cstring1 (type: string), cstring2 (type: string), ctimestamp1 (type: timestamp), ctimestamp2 (type: timestamp), cboolean1 (type: boolean), cboolean2 (type: boolean), rdm (type: double)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
- Statistics: Num rows: 100 Data size: 24360 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 100 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 100 Data size: 24360 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 100 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -249,17 +249,17 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: orc_llap_nonvector
- Statistics: Num rows: 12288 Data size: 899146 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 12288 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
Limit
Number of rows: 1025
- Statistics: Num rows: 1025 Data size: 75068 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1025 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: cint (type: int), cstring1 (type: string)
outputColumnNames: _col0, _col1
- Statistics: Num rows: 1025 Data size: 75068 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1025 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 1025 Data size: 75068 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1025 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -1354,17 +1354,17 @@ STAGE PLANS:
Map Operator Tree:
TableScan
alias: orc_llap_nonvector_2
- Statistics: Num rows: 12288 Data size: 4468250 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 12288 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
Limit
Number of rows: 10
- Statistics: Num rows: 10 Data size: 80 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 10 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: ROW__ID (type: struct<writeid:bigint,bucketid:int,rowid:bigint>)
outputColumnNames: _col0
- Statistics: Num rows: 10 Data size: 760 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 10 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 10 Data size: 760 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 10 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -403,7 +403,7 @@ POSTHOOK: Input: default@decimal_vgby
626923679 1024 9723.4027027027 -9778.9513513514 10541.0525297287 10.29399661106318 5742.091453237337 5744.897264034264 1024 11645.74615384615400 -11712.27692307692300 12625.04759999997746 12.329148046874977988 6877.318722794881 6880.679250101608
6981 3 5831542.2692483780 -515.6210729730 5830511.0271024320 1943503.67570081066667 2749258.455012492 3367140.1929065133 3 6984454.21109769200000 -617.56077692307690 6983219.08954384584620 2327739.696514615282066667 3292794.4113115156 4032833.0678006653
762 2 5831542.2692483780 1531.2194054054 5833073.4886537834 2916536.74432689170000 2915005.5249214866 4122440.3477364695 2 6984454.21109769200000 1833.94569230769250 6986288.15678999969250 3493144.078394999846250000 3491310.1327026924 4937458.140118757
- NULL 3072 9318.4351351351 -4298.1513513514 5018444.1081079808 1633.60810810806667 5695.483082135323 5696.410307714464 3072 11160.71538461538500 -5147.90769230769300 6010604.30769230735360 1956.576923076922966667 6821.495748565151 6822.606289190915
+ NULL 3072 9318.4351351351 -4298.1513513514 5018444.1081079808 1633.60810810806667 5695.483082135325 5696.4103077144655 3072 11160.71538461538500 -5147.90769230769300 6010604.30769230735360 1956.576923076922966667 6821.4957485651385 6822.606289190904
PREHOOK: query: CREATE TABLE decimal_vgby_small STORED AS TEXTFILE AS
SELECT cdouble, CAST (((cdouble*22.1)/37) AS DECIMAL(11,5)) AS cdecimal1,
CAST (((cdouble*9.3)/13) AS DECIMAL(16,0)) AS cdecimal2,
@@ -828,7 +828,7 @@ POSTHOOK: Input: default@decimal_vgby_small
626923679 1024 9723.40270 -9778.95135 10541.05247 10.293996553 5742.091453325365 5744.897264122335 1024 11646 -11712 12641 12.3447 6877.306686989158 6880.6672084147185
6981 2 -515.62107 -515.62107 -1031.24214 -515.621070000 0.0 0.0 3 6984454 -618 6983218 2327739.3333 3292794.518850853 4032833.1995089175
762 1 1531.21941 1531.21941 1531.21941 1531.219410000 0.0 NULL 2 6984454 1834 6986288 3493144.0000 3491310.0 4937457.95244881
- NULL 3072 9318.43514 -4298.15135 5018444.11392 1633.608110000 5695.4830839098695 5696.410309489299 3072 11161 -5148 6010880 1956.6667 6821.647911041892 6822.758476439734
+ NULL 3072 9318.43514 -4298.15135 5018444.11392 1633.608110000 5695.483083909676 5696.410309489105 3072 11161 -5148 6010880 1956.6667 6821.647911041892 6822.758476439734
PREHOOK: query: SELECT SUM(HASH(*))
FROM (SELECT cint,
COUNT(cdecimal1), MAX(cdecimal1), MIN(cdecimal1), SUM(cdecimal1), AVG(cdecimal1), STDDEV_POP(cdecimal1), STDDEV_SAMP(cdecimal1),
@@ -847,4 +847,4 @@ FROM (SELECT cint,
POSTHOOK: type: QUERY
POSTHOOK: Input: default@decimal_vgby_small
#### A masked pattern was here ####
- 95165244160
+ 95767761728
@@ -375,7 +375,7 @@ Found 4 items
-rw-rw-rw- 3 ### USER ### ### GROUP ### 8753 ### HDFS DATE ### hdfs://### HDFS PATH ###
-rw-rw-rw- 3 ### USER ### ### GROUP ### 7531 ### HDFS DATE ### hdfs://### HDFS PATH ###
-rw-rw-rw- 3 ### USER ### ### GROUP ### 7174 ### HDFS DATE ### hdfs://### HDFS PATH ###
- -rw-rw-rw- 3 ### USER ### ### GROUP ### 7066 ### HDFS DATE ### hdfs://### HDFS PATH ###
+ -rw-rw-rw- 3 ### USER ### ### GROUP ### 7065 ### HDFS DATE ### hdfs://### HDFS PATH ###
PREHOOK: query: insert into over10k_orc_bucketed_n0 select * from over10k_n9
PREHOOK: type: QUERY
PREHOOK: Input: default@over10k_n9
@@ -402,8 +402,8 @@ Found 8 items
-rw-rw-rw- 3 ### USER ### ### GROUP ### 7531 ### HDFS DATE ### hdfs://### HDFS PATH ###
-rw-rw-rw- 3 ### USER ### ### GROUP ### 7174 ### HDFS DATE ### hdfs://### HDFS PATH ###
-rw-rw-rw- 3 ### USER ### ### GROUP ### 7174 ### HDFS DATE ### hdfs://### HDFS PATH ###
- -rw-rw-rw- 3 ### USER ### ### GROUP ### 7066 ### HDFS DATE ### hdfs://### HDFS PATH ###
- -rw-rw-rw- 3 ### USER ### ### GROUP ### 7066 ### HDFS DATE ### hdfs://### HDFS PATH ###
+ -rw-rw-rw- 3 ### USER ### ### GROUP ### 7065 ### HDFS DATE ### hdfs://### HDFS PATH ###
+ -rw-rw-rw- 3 ### USER ### ### GROUP ### 7065 ### HDFS DATE ### hdfs://### HDFS PATH ###
PREHOOK: query: select distinct 7 as seven, INPUT__FILE__NAME from over10k_orc_bucketed_n0
PREHOOK: type: QUERY
PREHOOK: Input: default@over10k_orc_bucketed_n0
2 changes: 1 addition & 1 deletion ql/src/test/results/clientpositive/tez/orc_merge12.q.out
@@ -814,7 +814,7 @@ Stripes:
Entry 0: count: 6889 hasNull: true true: 3402 positions: 0,0,0,0,0,0,0,0
Entry 1: count: 2284 hasNull: true true: 581 positions: 0,168,8,0,0,520,97,1

- File length: 3004637 bytes
+ File length: #Masked# bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
2 changes: 1 addition & 1 deletion serde/pom.xml
@@ -100,7 +100,7 @@
</exclusions>
</dependency>
<dependency>
- <groupId>net.sf.opencsv</groupId>
+ <groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>${opencsv.version}</version>
</dependency>
14 changes: 9 additions & 5 deletions serde/src/java/org/apache/hadoop/hive/serde2/OpenCSVSerde.java
@@ -39,8 +39,10 @@
import java.util.Objects;
import java.util.Properties;

- import au.com.bytecode.opencsv.CSVReader;
- import au.com.bytecode.opencsv.CSVWriter;
+ import com.opencsv.CSVParserBuilder;
+ import com.opencsv.CSVReader;
+ import com.opencsv.CSVReaderBuilder;
+ import com.opencsv.CSVWriter;

/**
* OpenCSVSerde use opencsv to deserialize CSV format.
@@ -179,15 +181,17 @@ private CSVReader newReader(final Reader reader, char separator, char quote, char escape) {
// CSVReader will throw an exception if any of separator, quote, or escape is the same, but
// the CSV format specifies that the escape character and quote char are the same... very weird
if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
- return new CSVReader(reader, separator, quote);
+ return new CSVReaderBuilder(reader).withCSVParser(
+ new CSVParserBuilder().withSeparator(separator).withQuoteChar(quote).build()).build();
} else {
- return new CSVReader(reader, separator, quote, escape);
+ return new CSVReaderBuilder(reader).withCSVParser(
+ new CSVParserBuilder().withSeparator(separator).withQuoteChar(quote).withEscapeChar(escape).build()).build();
}
}

private CSVWriter newWriter(final Writer writer, char separator, char quote, char escape) {
if (CSVWriter.DEFAULT_ESCAPE_CHARACTER == escape) {
return new CSVWriter(writer, separator, quote, "");
return new CSVWriter(writer, separator, quote, '"', "");
} else {
return new CSVWriter(writer, separator, quote, escape, "");
}
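
For context, a minimal self-contained sketch (not part of the commit) of the com.opencsv 5.x builder API that newReader()/newWriter() migrate to after the move from au.com.bytecode.opencsv 2.3. The class name and sample data are illustrative only; the writer's extra '"' argument mirrors the change above, since the old four-argument (writer, separator, quote, lineEnd) constructor appears to have no 5.x equivalent:

import java.io.StringReader;
import java.io.StringWriter;
import com.opencsv.CSVParser;
import com.opencsv.CSVParserBuilder;
import com.opencsv.CSVReader;
import com.opencsv.CSVReaderBuilder;
import com.opencsv.CSVWriter;

public class OpenCsvBuilderSketch {
  public static void main(String[] args) throws Exception {
    // Reader side: separator/quote/escape now hang off a CSVParser
    // built via CSVParserBuilder, mirroring newReader() above.
    CSVParser parser = new CSVParserBuilder()
        .withSeparator(',')
        .withQuoteChar('"')
        .build();
    try (CSVReader reader = new CSVReaderBuilder(new StringReader("a,\"b,c\",d"))
        .withCSVParser(parser)
        .build()) {
      String[] fields = reader.readNext();   // ["a", "b,c", "d"]
      System.out.println(String.join("|", fields));
    }
    // Writer side: the five-argument constructor is
    // (writer, separator, quote, escape, lineEnd); '"' is opencsv's
    // default escape character, as in the diff above.
    StringWriter sw = new StringWriter();
    try (CSVWriter writer = new CSVWriter(sw, ',', '"', '"', "\n")) {
      writer.writeNext(new String[] {"a", "b,c", "d"});
    }
    System.out.print(sw);                    // "a","b,c","d"
  }
}
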
2 changes: 1 addition & 1 deletion standalone-metastore/pom.xml
@@ -89,7 +89,7 @@
<libthrift.version>0.16.0</libthrift.version>
<log4j2.version>2.18.0</log4j2.version>
<mockito-core.version>3.4.4</mockito-core.version>
- <orc.version>1.8.5</orc.version>
+ <orc.version>1.9.3</orc.version>
<protobuf.version>3.24.4</protobuf.version>
<io.grpc.version>1.51.0</io.grpc.version>
<sqlline.version>1.9.0</sqlline.version>