-- snowflake-bigquery-iceberg.sql
-- Step 1:
-- Login to Snowflake
-- Click the Create button on the top left
-- Select SQL Worksheet
-- Run this command to switch to ACCOUNTADMIN, since some of the commands below require this role
USE ROLE accountadmin;
-- Step 2:
-- Create a warehouse (compute) to run the queries in this script
-- https://docs.snowflake.com/en/sql-reference/sql/create-warehouse
CREATE OR REPLACE WAREHOUSE ICEBERG_WAREHOUSE WITH WAREHOUSE_SIZE='XSMALL';
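-- If your session has no default warehouse, make this one active so the queries below have compute to run on:
USE WAREHOUSE ICEBERG_WAREHOUSE;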
-- Step 3:
-- Create a database to hold the Iceberg table
CREATE OR REPLACE DATABASE ICEBERG_DATABASE;
-- Step 4:
-- Create a bucket to hold your BigLake Managed Table
-- Open: https://console.cloud.google.com/storage/browser
-- Click the Create Bucket button
-- Enter your bucket name: bigquery-snowflake-sharing (you can choose a different name)
-- Click Next and choose Region: us-central1 (this must match your Snowflake region)
-- Click Create at the bottom
-- Step 5:
-- In Snowflake
-- Create our GCS storage integration. This creates a link between Snowflake and GCS.
-- Snowflake generates a service account that we will grant access to our GCS bucket.
-- https://docs.snowflake.com/en/sql-reference/sql/create-storage-integration
CREATE STORAGE INTEGRATION bigquery_integration
TYPE = EXTERNAL_STAGE
STORAGE_PROVIDER = 'GCS'
ENABLED = TRUE
STORAGE_ALLOWED_LOCATIONS = ('gcs://bigquery-snowflake-sharing')
;
-- Step 6:
-- Get the service account that we will grant the custom role to on our GCS bucket (Steps 7 and 8)
DESC STORAGE INTEGRATION bigquery_integration;
-- Copy the STORAGE_GCP_SERVICE_ACCOUNT
-- e.g. an address of the form xxxx@xxxx.iam.gserviceaccount.com
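-- Optional sketch: pull just the service account out of the DESC output programmatically
-- (assumes the DESC STORAGE INTEGRATION statement above was the last query run in this session)
SELECT "property_value"
  FROM TABLE(RESULT_SCAN(LAST_QUERY_ID()))
 WHERE "property" = 'STORAGE_GCP_SERVICE_ACCOUNT';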
-- Step 7:
-- https://docs.snowflake.com/en/user-guide/tables-iceberg-configure-external-volume-gcs
-- Create a custom IAM role that has the permissions required to access the bucket and get objects.
-- Open: https://console.cloud.google.com/iam-admin/roles
-- Select Create Role.
-- Enter a Title and optional Description for the custom role. [e.g. Snowflake Storage Admin]
-- Select Add Permissions.
-- In Filter, select Service and then select storage.
-- Filter the list of permissions, and add the following from the list:
-- storage.buckets.get
-- storage.objects.get
-- storage.objects.create
-- storage.objects.delete
-- storage.objects.list
-- Select Add.
-- Select Create.
-- Step 8:
-- Open the bucket you created
-- Open: https://console.cloud.google.com/storage/browser
-- Click on: bigquery-snowflake-sharing (or whatever you named it)
-- Click on Permissions
-- Click Grant Access
-- Paste in the service account name (from Snowflake)
-- For the role select Custom | Snowflake Storage Admin
-- Click Save
-- Step 9:
-- In Snowflake
-- Create an external volume on GCS
-- https://docs.snowflake.com/en/sql-reference/sql/create-external-volume
CREATE EXTERNAL VOLUME snowflake_ext_volume
STORAGE_LOCATIONS =
(
(
NAME = 'us-central1'
STORAGE_PROVIDER = 'GCS'
STORAGE_BASE_URL = 'gcs://bigquery-snowflake-sharing/snowflake-volume/'
)
)
ALLOW_WRITES = TRUE;
-- Step 10:
-- Describe the volume
DESCRIBE EXTERNAL VOLUME snowflake_ext_volume;
-- Step 11:
-- Set the current database
USE ICEBERG_DATABASE;
-- Step 12:
-- Create a schema in Snowflake
CREATE SCHEMA iceberg_schema;
-- Step 13:
-- Make the schema active
USE SCHEMA iceberg_schema;
-- Step 14:
-- Create Iceberg table using Snowflake Catalog
-- https://docs.snowflake.com/en/sql-reference/sql/create-iceberg-table-snowflake
CREATE ICEBERG TABLE driver (driver_id int, driver_name string)
CATALOG = 'SNOWFLAKE'
EXTERNAL_VOLUME = 'snowflake_ext_volume'
BASE_LOCATION = 'driver';
-- Step 15:
-- This will show the table just created
SHOW TABLES;
-- Step 16:
-- This will insert new data
INSERT INTO driver (driver_id, driver_name) VALUES (1, 'Driver 001');
SELECT * FROM driver;
-- Step 17:
-- This will tell us the latest metadata json file that Snowflake is pointing to
-- We need to point BigQuery to the same place
SELECT REPLACE(JSON_EXTRACT_PATH_TEXT(
SYSTEM$GET_ICEBERG_TABLE_INFORMATION('ICEBERG_DATABASE.iceberg_schema.driver'),
'metadataLocation'), 'gcs://', 'gs://');
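-- The result is a gs:// path to the current metadata file, e.g. (your file name will differ):
-- gs://bigquery-snowflake-sharing/snowflake-volume/driver/metadata/00001-xxxx.metadata.json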
-- Step 18:
-- Open the bucket you created
-- Open: https://console.cloud.google.com/storage/browser
-- Click on: bigquery-snowflake-sharing (or whatever you named it)
-- You can now browse the Iceberg files
-- Step 19:
-- In BigQuery, create a dataset (run this in the BigQuery console, not in Snowflake)
CREATE SCHEMA IF NOT EXISTS snowflake_dataset OPTIONS(location = 'us-central1');
-- Step 20:
-- Navigate to BigQuery
-- Open: https://console.cloud.google.com/bigquery
-- Click the Add button
-- Select "Connections to external data sources"
-- Select "Vertex AI remote models, remote functions and BigLake (Cloud Resource)"
-- Select region: us-central1
-- Enter a name: snowflake-connection (use the same value for the friendly name and description)
-- Step 21:
-- Expand your project in the left hand panel
-- Expand external connections
-- Double click on us-central1.snowflake-connection
-- Copy the service account id: e.g. bqcx-xxxxxxxxxxxx-s3rf@gcp-sa-bigquery-condel.iam.gserviceaccount.com
-- Step 22:
-- Open the bucket you created
-- Open: https://console.cloud.google.com/storage/browser
-- Click on: bigquery-snowflake-sharing (or whatever you named it)
-- Click on Permissions
-- Click Grant Access
-- Paste in the service account name
-- For the role select Cloud Storage | Storage Object Viewer [since Snowflake does the writing, BigQuery only needs to read]
-- Click Save
-- Step 23:
-- The uris value needs to be the metadata location returned by the Snowflake command in Step 17
CREATE OR REPLACE EXTERNAL TABLE `snowflake_dataset.driver`
WITH CONNECTION `us-central1.snowflake-connection`
OPTIONS (
format = "ICEBERG",
uris = ["gs://bigquery-snowflake-sharing/snowflake-volume/driver/metadata/00001-2d763c77-df0a-4230-bd52-033877d02c40.metadata.json"]
);
-- Step 24:
-- View the data in BQ
SELECT * FROM `snowflake_dataset.driver`;
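-- Optional sanity check: this count should match what SELECT COUNT(*) FROM driver returns in Snowflake
SELECT COUNT(*) AS row_count FROM `snowflake_dataset.driver`;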
-- Step 25:
-- If you now add or update data in Snowflake, BigQuery will not see it, since the external table points at a specific metadata JSON file (snapshot)
-- Re-run the "SYSTEM$GET_ICEBERG_TABLE_INFORMATION" command in Snowflake to get the latest metadata location
-- Then update the table metadata in BigQuery
-- https://cloud.google.com/bigquery/docs/iceberg-tables#update-table-metadata
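-- A minimal sketch of that refresh cycle, assuming the names used above:
-- 1. Add a row in Snowflake (each commit writes a new metadata.json version):
INSERT INTO driver (driver_id, driver_name) VALUES (2, 'Driver 002');
-- 2. In Snowflake, re-run the Step 17 query to get the latest metadata location:
SELECT REPLACE(JSON_EXTRACT_PATH_TEXT(
    SYSTEM$GET_ICEBERG_TABLE_INFORMATION('ICEBERG_DATABASE.iceberg_schema.driver'),
    'metadataLocation'), 'gcs://', 'gs://');
-- 3. In BigQuery, re-create the external table pointing at the new metadata file
--    (the file name below is a placeholder; substitute the value returned above):
CREATE OR REPLACE EXTERNAL TABLE `snowflake_dataset.driver`
WITH CONNECTION `us-central1.snowflake-connection`
OPTIONS (
  format = "ICEBERG",
  uris = ["gs://bigquery-snowflake-sharing/snowflake-volume/driver/metadata/<latest>.metadata.json"]
);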