You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
/// we are going to use TPC-DS data. have the create table command
Note: We set the bucket = 1 since this is a single container image. If we were running on a larger environment with 3 BEs, we could set bucket = 15. The number of buckets must be divisible by the number of BE nodes.
Download the data files (we are using CSV format).
// load generated TPC-DS data
LOAD LABEL tpcds_1t.catalog_page_166445074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/catalog_page/*") INTO TABLE catalog_page COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.catalog_sales_16645674 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/catalog_sales/*") INTO TABLE catalog_sales COLUMNS TERMINATED BY "|" (cs_sold_date_sk,cs_sold_time_sk,cs_ship_date_sk,cs_bill_customer_sk,
cs_bill_cdemo_sk,cs_bill_hdemo_sk,cs_bill_addr_sk,cs_ship_customer_sk,cs_ship_cdemo_sk,cs_ship_hdemo_sk,cs_ship_addr_sk,
cs_call_center_sk,cs_catalog_page_sk,cs_ship_mode_sk,cs_warehouse_sk,cs_item_sk,cs_promo_sk,cs_order_number,cs_quantity,
cs_wholesale_cost,cs_list_price,cs_sales_price,cs_ext_discount_amt,cs_ext_sales_price,cs_ext_wholesale_cost,
cs_ext_list_price,cs_ext_tax,cs_coupon_amt,cs_ext_ship_cost,cs_net_paid,cs_net_paid_inc_tax,cs_net_paid_inc_ship,
cs_net_paid_inc_ship_tax,cs_net_profit)) WITH BROKER ("aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2") PROPERTIES ("timeout" = "144000");
LOAD LABEL tpcds_1t.web_page_166445604 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/web_page/*") INTO TABLE web_page COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.customer_demographics_166446074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/customer_demographics/*") INTO TABLE customer_demographics COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.household_demographics_166456074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/household_demographics/*") INTO TABLE household_demographics COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.store_returns_166445074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/store_returns/*") INTO TABLE store_returns COLUMNS TERMINATED BY "|" (sr_returned_date_sk,sr_return_time_sk,sr_item_sk,sr_customer_sk,sr_cdemo_sk,
sr_hdemo_sk,sr_addr_sk,sr_store_sk,sr_reason_sk,sr_ticket_number,sr_return_quantity,sr_return_amt,sr_return_tax,
sr_return_amt_inc_tax,sr_fee,sr_return_ship_cost,sr_refunded_cash,sr_reversed_charge,sr_store_credit,sr_net_loss)) WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.time_dim_166445074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/time_dim/*") INTO TABLE time_dim COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2" );
LOAD LABEL tpcds_1t.inventory_166456074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/inventory/*") INTO TABLE inventory COLUMNS TERMINATED BY "|" (inv_date_sk,inv_item_sk,inv_warehouse_sk,inv_quantity_on_hand)) WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.item_166446074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/item/*") INTO TABLE item COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.date_dim_166456074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/date_dim/*") INTO TABLE date_dim COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.reason_166456074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/reason/*") INTO TABLE reason COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.call_center_166446074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/call_center/*") INTO TABLE call_center COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.ship_mode_166446074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/ship_mode/*") INTO TABLE ship_mode COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.web_sales_166456074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/web_sales/*") INTO TABLE web_sales COLUMNS TERMINATED BY "|" (ws_sold_date_sk,ws_sold_time_sk,ws_ship_date_sk,ws_item_sk,ws_bill_customer_sk,
ws_bill_cdemo_sk,ws_bill_hdemo_sk,ws_bill_addr_sk,ws_ship_customer_sk,ws_ship_cdemo_sk,ws_ship_hdemo_sk,ws_ship_addr_sk,
ws_web_page_sk,ws_web_site_sk,ws_ship_mode_sk,ws_warehouse_sk,ws_promo_sk,ws_order_number,ws_quantity,ws_wholesale_cost,
ws_list_price,ws_sales_price,ws_ext_discount_amt,ws_ext_sales_price,ws_ext_wholesale_cost,ws_ext_list_price,ws_ext_tax,
ws_coupon_amt,ws_ext_ship_cost,ws_net_paid,ws_net_paid_inc_tax,ws_net_paid_inc_ship,ws_net_paid_inc_ship_tax,ws_net_profit)) WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.web_site_166456074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/web_site/*") INTO TABLE web_site COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.customer_address_166446074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/customer_address/*") INTO TABLE customer_address COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.warehouse_166456074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/warehouse/*") INTO TABLE warehouse COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.income_band_166456074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/income_band/*") INTO TABLE income_band COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.promotion_166456074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/promotion/*") INTO TABLE promotion COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2" );
LOAD LABEL tpcds_1t.web_returns_166456074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/web_returns/*") INTO TABLE web_returns COLUMNS TERMINATED BY "|" (wr_returned_date_sk,wr_returned_time_sk,wr_item_sk,wr_refunded_customer_sk,
wr_refunded_cdemo_sk,wr_refunded_hdemo_sk,wr_refunded_addr_sk,wr_returning_customer_sk,wr_returning_cdemo_sk,
wr_returning_hdemo_sk,wr_returning_addr_sk,wr_web_page_sk,wr_reason_sk,wr_order_number,wr_return_quantity,wr_return_amt,
wr_return_tax,wr_return_amt_inc_tax,wr_fee,wr_return_ship_cost,wr_refunded_cash,wr_reversed_charge,wr_account_credit,wr_net_loss)) WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.catalog_returns_166446074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/catalog_returns/*") INTO TABLE catalog_returns COLUMNS TERMINATED BY "|" (cr_returned_date_sk,cr_returned_time_sk,cr_item_sk,cr_refunded_customer_sk,
cr_refunded_cdemo_sk,cr_refunded_hdemo_sk,cr_refunded_addr_sk,cr_returning_customer_sk,cr_returning_cdemo_sk,cr_returning_hdemo_sk,
cr_returning_addr_sk,cr_call_center_sk,cr_catalog_page_sk,cr_ship_mode_sk,cr_warehouse_sk,cr_reason_sk,cr_order_number,
cr_return_quantity,cr_return_amount,cr_return_tax,cr_return_amt_inc_tax,cr_fee,cr_return_ship_cost,cr_refunded_cash,
cr_reversed_charge,cr_store_credit,cr_net_loss)) WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.store_sales_16645674 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/store_sales/*") INTO TABLE store_sales COLUMNS TERMINATED BY "|" (ss_sold_date_sk,ss_sold_time_sk,ss_item_sk,ss_customer_sk,ss_cdemo_sk,
ss_hdemo_sk,ss_addr_sk,ss_store_sk,ss_promo_sk,ss_ticket_number,ss_quantity,ss_wholesale_cost,ss_list_price,
ss_sales_price,ss_ext_discount_amt,ss_ext_sales_price,ss_ext_wholesale_cost,ss_ext_list_price,ss_ext_tax,
ss_coupon_amt,ss_net_paid,ss_net_paid_inc_tax,ss_net_profit)) WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2")PROPERTIES ("timeout" = "144000");
LOAD LABEL tpcds_1t.customer_166445074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/customer/*") INTO TABLE customer COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
LOAD LABEL tpcds_1t.store_16644074 (DATA INFILE("s3a://us-west-benchmark-data/tpcds_1t/store/*") INTO TABLE store COLUMNS TERMINATED BY "|") WITH BROKER (
"aws.s3.use_instance_profile" = "true",
"aws.s3.region" = "us-west-2");
Insert it into the final tables
INSERT INTO tpcds_1t.call_center
SELECT * FROM lakehouse.tpcds_1t_parquet.call_center;
INSERT INTO tpcds_1t.catalog_page
SELECT * FROM lakehouse.tpcds_1t_parquet.catalog_page;
INSERT INTO tpcds_1t.catalog_returns
SELECT * FROM lakehouse.tpcds_1t_parquet.catalog_returns;
INSERT INTO tpcds_1t.catalog_sales
SELECT * FROM lakehouse.tpcds_1t_parquet.catalog_sales;
INSERT INTO tpcds_1t.customer_address
SELECT * FROM lakehouse.tpcds_1t_parquet.customer_address;
INSERT INTO tpcds_1t.customer_demographics
SELECT * FROM lakehouse.tpcds_1t_parquet.customer_demographic;
INSERT INTO tpcds_1t.customer
SELECT * FROM lakehouse.tpcds_1t_parquet.customer;
INSERT INTO tpcds_1t.date_dim
SELECT * FROM lakehouse.tpcds_1t_parquet.date_dim;
INSERT INTO tpcds_1t.household_demographics
SELECT * FROM lakehouse.tpcds_1t_parquet.household_demographi;
INSERT INTO tpcds_1t.income_band
SELECT * FROM lakehouse.tpcds_1t_parquet.income_band;
INSERT INTO tpcds_1t.inventory
SELECT * FROM lakehouse.tpcds_1t_parquet.inventory;
INSERT INTO tpcds_1t.item
SELECT * FROM lakehouse.tpcds_1t_parquet.item;
INSERT INTO tpcds_1t.promotion
SELECT * FROM lakehouse.tpcds_1t_parquet.promotion;
INSERT INTO tpcds_1t.reason
SELECT * FROM lakehouse.tpcds_1t_parquet.reason;
INSERT INTO tpcds_1t.ship_mode
SELECT * FROM lakehouse.tpcds_1t_parquet.ship_mode;
INSERT INTO tpcds_1t.store_returns
SELECT * FROM lakehouse.tpcds_1t_parquet.store_returns;
INSERT INTO tpcds_1t.store_sales
SELECT * FROM lakehouse.tpcds_1t_parquet.store_sales;
INSERT INTO tpcds_1t.store
SELECT * FROM lakehouse.tpcds_1t_parquet.store;
INSERT INTO tpcds_1t.time_dim
SELECT * FROM lakehouse.tpcds_1t_parquet.time_dim;
INSERT INTO tpcds_1t.warehouse
SELECT * FROM lakehouse.tpcds_1t_parquet.warehouse;
INSERT INTO tpcds_1t.web_page
SELECT * FROM lakehouse.tpcds_1t_parquet.web_page;
INSERT INTO tpcds_1t.web_returns
SELECT * FROM lakehouse.tpcds_1t_parquet.web_returns;
INSERT INTO tpcds_1t.web_sales
SELECT * FROM lakehouse.tpcds_1t_parquet.web_sales;
INSERT INTO tpcds_1t.web_site
SELECT * FROM lakehouse.tpcds_1t_parquet.web_site;
Step 4. Query the Loaded Data
// select a couple of SQL that has joins
Step 5. Browse the operational tooling to understand what happened
StarRocks comes with a web based tool for you to manage your database instance. Through this tooling, you can see logs, understand your queries and more.
Launch your browser and use "root" as your username with leaving the password as blank:
http://127.0.0.1:8030/
Load the Speedtest by Ookla Global Fixed and Mobile Network Performance data set (parquet) into StarRocks and execute Queries
You can find instructions on how to quickly load parquet data into StarRocks by following the tutorial at #24507
Step 6. Connect to a remote Apache Iceberg or Apache Hudi environment and execute Queries
Tabular.io is the hosted managed version of Apache Iceberg from the main author of Apache Iceberg. They have a free tier. Use our StarRocks container to connect to that environment. Tutorial at #23616.
Onehouse.ai is the hosted managed version of Apache Hudi. As of right now there is not a free tier however there is a tutorial of getting StarRocks to work with Apache Hudi's quickstart. You can find the tutorial at #22947
[Optional] Visualize data in Apache SuperSet or use AirByte or Apache Kafka to load more data into StarRocks
See our tutorial (#23210) on how to connect Apache SuperSet to the data you've just loaded.
Another great tutorial is one where you can use AirByte to move batch data into StarRocks (#23713) or if you have a real time data streaming use case, use Apache Kafka to move our data into StarRocks (#23627).
reacted with thumbs up emoji reacted with thumbs down emoji reacted with laugh emoji reacted with hooray emoji reacted with confused emoji reacted with heart emoji reacted with rocket emoji reacted with eyes emoji
-
Prerequisites
For this tutorial you need to:
Have Docker Desktop or podman container runtime installed.
This is out of scope for the tutorial.
Have a MySQL client.
This is out of scope for the tutorial.
Step 1. Deploy StarRocks in Docker
Step 2. Create a database and table
/// we are going to use TPC-DS data. have the create table command
Note: We set the bucket = 1 since this is a single container image. If we were running on a larger environment with 3 BEs, we could set bucket = 15. The number of buckets must be divisible by the number of BE nodes.
Step 3. Insert Data into the Table
Download the data files (we are using CSV format).
// load generated TPC-DS data
Insert it into the final tables
Step 4. Query the Loaded Data
// select a couple of SQL that has joins
Step 5. Browse the operational tooling to understand what happened
StarRocks comes with a web based tool for you to manage your database instance. Through this tooling, you can see logs, understand your queries and more.
Launch your browser and use "root" as your username with leaving the password as blank:
Load the Speedtest by Ookla Global Fixed and Mobile Network Performance data set (parquet) into StarRocks and execute Queries
You can find instructions on how to quickly load parquet data into StarRocks by following the tutorial at #24507
Step 6. Connect to a remote Apache Iceberg or Apache Hudi environment and execute Queries
Tabular.io is the hosted managed version of Apache Iceberg from the main author of Apache Iceberg. They have a free tier. Use our StarRocks container to connect to that environment. Tutorial at #23616.
Onehouse.ai is the hosted managed version of Apache Hudi. As of right now there is not a free tier however there is a tutorial of getting StarRocks to work with Apache Hudi's quickstart. You can find the tutorial at #22947
[Optional] Visualize data in Apache SuperSet or use AirByte or Apache Kafka to load more data into StarRocks
See our tutorial (#23210) on how to connect Apache SuperSet to the data you've just loaded.
Another great tutorial is one where you can use AirByte to move batch data into StarRocks (#23713) or if you have a real time data streaming use case, use Apache Kafka to move our data into StarRocks (#23627).
Step 7. Summary and Clean Up
Beta Was this translation helpful? Give feedback.
All reactions