-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #15 from tumugi/feature/extract-table
Implement extract table to google cloud storage feature
- Loading branch information
Showing
7 changed files
with
255 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
require 'csv' | ||
require 'json' | ||
require 'tumugi' | ||
require_relative '../target/bigquery_table' | ||
|
||
module Tumugi | ||
module Plugin | ||
class BigqueryExportTask < Tumugi::Task | ||
# Registers this class with Tumugi::Plugin under the name 'bigquery_export'. | ||
Tumugi::Plugin.register_task('bigquery_export', self) | ||
|
||
# Source table location. When project_id is set it overrides :project_id of the | ||
# 'bigquery' config section (see #config). job_project_id is only passed to the | ||
# GCS extract call, falling back to the client's project when it is nil. | ||
param :project_id, type: :string | ||
param :job_project_id, type: :string | ||
param :dataset_id, type: :string, required: true | ||
param :table_id, type: :string, required: true | ||
|
||
# compression is upcased and passed to client.extract for gs:// outputs; for any | ||
# other output a value of GZIP only produces a warning in #run. | ||
# destination_format 'AVRO' raises in #run unless the output is a gs:// target. | ||
param :compression, type: :string, default: 'NONE' # GZIP | ||
param :destination_format, type: :string, default: 'CSV' # NEWLINE_DELIMITED_JSON, AVRO | ||
|
||
# Only effective if destination_format == 'CSV' | ||
param :field_delimiter, type: :string, default: ',' | ||
param :print_header, type: :bool, default: true | ||
|
||
# Sent as :max_result on each list_tabledata call when writing to a non-GCS target. | ||
param :page_size, type: :integer, default: 10000 | ||
|
||
# Passed to client.extract as :wait (GCS export only). | ||
# NOTE(review): unit is not visible here, presumably seconds; confirm against the client. | ||
param :wait, type: :integer, default: 120 | ||
|
||
# Exports the source BigQuery table to the task's output target.
#
# A gs:// target is handed to #export_to_gcs; any other FileSystemTarget is
# written by #export_to_file_system.
#
# Returns whatever the chosen export method returns.
# Raises Tumugi::TumugiError when #output is not a
# Tumugi::Plugin::FileSystemTarget, or when destination_format is AVRO and
# the output is not a gs:// target.
def run
  unless output.is_a?(Tumugi::Plugin::FileSystemTarget)
    raise Tumugi::TumugiError, "BigqueryExportTask#output must be return a instance of Tumugi::Plugin::FileSystemTarget"
  end

  client = Tumugi::Plugin::Bigquery::Client.new(config)
  table = Tumugi::Plugin::Bigquery::Table.new(project_id: client.project_id, dataset_id: dataset_id, table_id: table_id)

  # No job_project_id defaulting is needed here: assigning to it would only
  # create a shadowing local, and #export_to_gcs already falls back to
  # client.project_id when the parameter is nil.
  log "Source: #{table}"
  log "Destination: #{output}"

  return export_to_gcs(client) if is_gcs?(output)

  # The checks below apply only to non-GCS targets.
  if destination_format.upcase == 'AVRO'
    raise Tumugi::TumugiError, "destination_format='AVRO' is only supported when export to Google Cloud Storage"
  end
  if compression.upcase == 'GZIP'
    logger.warn("compression parameter is ignored, it only supportd when export to Google Cloud Storage")
  end
  export_to_file_system(client)
end
|
||
private | ||
|
||
# Tells whether +target+ points at a Google Cloud Storage object.
#
# target - any object; its #to_s is inspected.
#
# Returns true only when the whole string has the form gs://<bucket>/<key>
# with a non-empty bucket and key. \A and \z anchor the entire string, so a
# gs:// URL that merely appears on one line of a multi-line string is not
# accepted.
def is_gcs?(target)
  !(target.to_s =~ %r{\Ags://[^/]+/.+\z}).nil?
end
|
||
# Exports the table to the gs:// location given by _output via client.extract.
#
# client - object responding to #project_id and #extract.
#
# compression and destination_format are upcased before being sent.
# job_project_id falls back to the client's project when the parameter is nil.
#
# Returns the result of client.extract.
def export_to_gcs(client)
  options = {
    compression: compression.upcase,
    destination_format: destination_format.upcase,
    field_delimiter: field_delimiter,
    print_header: print_header,
    project_id: client.project_id,
    job_project_id: job_project_id || client.project_id,
    wait: wait
  }
  # Splat the options so they arrive as keywords when #extract declares
  # keyword parameters (Ruby 3 no longer converts a trailing positional Hash);
  # a method taking a plain options Hash still receives them as a Hash.
  # NOTE(review): the signature of #extract is not visible here; confirm.
  client.extract(dataset_id, table_id, _output.to_s, **options)
end
|
||
# Writes the table to _output by paging through client.list_tabledata.
#
# client - object responding to #project_id, #table and #list_tabledata.
#
# The destination format is upcased once, so 'csv' and 'CSV' behave the same
# for both the header line and row formatting. A header line of field names
# is written first when the format is CSV and print_header is set.
#
# Returns the result of _output.open.
def export_to_file_system(client)
  export_format = destination_format.upcase
  fields = client.table(dataset_id, table_id, project_id: client.project_id).schema.fields
  # Schema fields may be hash-like (string or symbol keys) or expose #name.
  field_names = fields.map { |f| f.respond_to?(:[]) ? (f["name"] || f[:name]) : f.name }
  request = {
    max_result: page_size,
    project_id: client.project_id,
  }
  start_index = 0
  page_token = nil

  _output.open('w') do |file|
    file.puts field_names.join(field_delimiter) if export_format == 'CSV' && print_header
    # Always fetch at least one page, then continue while a next token is returned.
    loop do
      # Splatted so the options arrive as keywords if #list_tabledata declares
      # keyword parameters. NOTE(review): its signature is not visible here.
      page = client.list_tabledata(dataset_id, table_id, **request.merge(start_index: start_index, page_token: page_token))
      start_index += page_size
      page_token = page[:next_token]
      # Treat a page without :rows as empty.
      (page[:rows] || []).each do |row|
        file.puts line(field_names, row, export_format)
      end
      break if page_token.nil?
    end
  end
end
|
||
# Formats one table row as a single output line (without trailing newline).
#
# field_names - unused; kept for interface compatibility.
# row         - enumerable of [name, value] pairs (e.g. a Hash).
# format      - 'CSV' or 'NEWLINE_DELIMITED_JSON', case-insensitive.
#
# CSV values are written with the standard CSV library so values containing
# the delimiter, quotes or newlines are quoted instead of corrupting the row.
#
# Raises Tumugi::TumugiError for any other format; returning nil here would
# make the caller's `puts` emit blank lines.
def line(field_names, row, format)
  case format.to_s.upcase
  when 'CSV'
    values = row.map { |pair| pair[1] }
    # generate_line appends the row separator; the caller adds its own newline.
    CSV.generate_line(values, col_sep: field_delimiter).chomp
  when 'NEWLINE_DELIMITED_JSON'
    JSON.generate(row.to_h)
  else
    raise Tumugi::TumugiError, "Unsupported destination_format for file system export: #{format}"
  end
end
|
||
# Builds the settings hash handed to the BigQuery client.
#
# Starts from the 'bigquery' section of Tumugi.config and, when the
# project_id parameter is set, stores it under :project_id.
#
# Returns the resulting Hash.
def config
  Tumugi.config.section('bigquery').to_h.tap do |settings|
    settings[:project_id] = project_id unless project_id.nil?
  end
end
end | ||
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
require_relative '../../test_helper' | ||
require 'tumugi/plugin/task/bigquery_export' | ||
require 'tumugi/plugin/target/google_cloud_storage_file' | ||
require 'tumugi/plugin/target/local_file' | ||
|
||
# Tests for Tumugi::Plugin::BigqueryExportTask.
#
# The export tests run the task against the table configured in setup and
# check the line count, the header line and one known row of the output.
class Tumugi::Plugin::BigqueryExportTaskTest < Test::Unit::TestCase
  include Tumugi::Plugin::BigqueryTestHelper

  setup do
    @klass = Class.new(Tumugi::Plugin::BigqueryExportTask)
    {
      project_id: 'bigquery-public-data',
      job_project_id: 'tumugi-plugin-bigquery',
      dataset_id: 'samples',
      table_id: 'shakespeare',
      compression: 'GZIP'
    }.each do |name, value|
      @klass.param_set name, value
    end
  end

  sub_test_case "parameters" do
    test "should set correctly" do
      task = @klass.new
      assert_equal('bigquery-public-data', task.project_id)
      assert_equal('tumugi-plugin-bigquery', task.job_project_id)
      assert_equal('samples', task.dataset_id)
      assert_equal('shakespeare', task.table_id)
      assert_equal('GZIP', task.compression)
      assert_equal(120, task.wait)
      assert_equal(10000, task.page_size)
    end

    data({
      "dataset_id" => [:dataset_id],
      "table_id" => [:table_id],
    })
    test "raise error when required parameter is not set" do |params|
      params.each { |param| @klass.param_set(param, nil) }
      assert_raise(Tumugi::ParameterError) { @klass.new }
    end
  end

  test "export to Google Cloud Storage" do
    task = @klass.new
    task.instance_eval do
      def output
        Tumugi::Plugin::GoogleCloudStorageFileTarget.new(bucket: 'tumugi-plugin-bigquery', key: 'export/test.csv.zip')
      end
    end
    exported = task.output
    task.run
    exported.open("r") do |f|
      # The task is configured with GZIP compression, so read through a gzip reader.
      Zlib::GzipReader.open(f) do |gz|
        assert_shakespeare_export(gz)
      end
    end
  end

  test "export to local file" do
    task = @klass.new
    task.instance_eval do
      def output
        Tumugi::Plugin::LocalFileTarget.new('tmp/export.csv')
      end
    end
    exported = task.output
    task.run
    exported.open("r") do |f|
      assert_shakespeare_export(f)
    end
  end

  private

  # Reads +io+ line by line and returns [line count, first line, last line
  # starting with "in,"]; the two strings stay '' when no such line is seen.
  def summarize_lines(io)
    total = 0
    first_line = ''
    last_in_row = ''
    until (text = io.gets).nil?
      first_line = text if total.zero?
      total += 1
      last_in_row = text if text.start_with?("in,")
    end
    [total, first_line, last_in_row]
  end

  # Asserts the expected line count, header and sample row of the export.
  def assert_shakespeare_export(io)
    total, first_line, last_in_row = summarize_lines(io)
    assert_equal(164657, total)
    assert_equal("word,word_count,corpus,corpus_date\n", first_line)
    assert_equal("in,255,kinghenryviii,1612\n", last_in_row)
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters