-
Notifications
You must be signed in to change notification settings - Fork 128
/
Copy pathspark.yml
39 lines (36 loc) · 1.38 KB
/
spark.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# dbt source definition: Snowplow raw events read by Spark as an external
# CSV table. NOTE(review): the `external:` / `partitions:` keys are not core
# dbt schema — presumably consumed by the dbt-external-tables package; verify.
version: 2
sources:
  - name: snowplow
    tables:
      - name: event
        description: "Snowplow events stored as CSV files in HDFS"
        external:
          location: 'hdfs://.../event.csv' # hdfs://, s3://, azure://, dbfs://, ...
          using: csv # file type: csv, json, parquet, delta, ...
          options: # as needed
            # Spark DataFrameReader options; values are strings by design,
            # so 'true' stays quoted — do not "fix" it to a YAML boolean.
            sep: '|'
            header: 'true'
            timestampFormat: 'yyyy-MM-dd HH:mm'
        # Partition columns derived from the storage layout (year/month/day),
        # declared separately from the data columns below.
        partitions:
          - name: year
            data_type: int
          - name: month
            data_type: int
          - name: day
            data_type: int
        columns:
          - name: app_id
            data_type: string
            description: "Application ID"
          - name: domain_sessionidx
            data_type: int
            description: "A visit / session index"
          - name: etl_tstamp
            data_type: timestamp
            description: "Timestamp event began ETL"
          # depending on the complexity of nested columns, it may be preferable to
          # register them as strings here and parse in a model, e.g.:
          # `from_json(contexts, 'schema string, data array<struct<data:varchar(65000),schema:string>>')`
          - name: contexts
            data_type: string
            description: "Contexts attached to event by Tracker"