--
-- Creates Cassandra keyspace with tables for traces and dependencies.
--
-- Required parameters:
--
--   keyspace
--     name of the keyspace
--   replication
--     replication strategy for the keyspace, such as
--       for prod environments
--         {'class': 'NetworkTopologyStrategy', '$datacenter': '${replication_factor}' }
--       for test environments
--         {'class': 'SimpleStrategy', 'replication_factor': '1'}
--   trace_ttl
--     default time to live for trace data, in seconds
--   dependencies_ttl
--     default time to live for dependencies data, in seconds (0 for no TTL)
--
-- Non-configurable settings:
--   gc_grace_seconds is non-zero, see: http://www.uberobert.com/cassandra_gc_grace_disables_hinted_handoff/
--   For TTL of 2 days, compaction window is 1 hour, rule of thumb here: http://thelastpickle.com/blog/2016/12/08/TWCS-part1.html

-- Keyspace that owns every type and table defined in this file.
-- Created only when absent, so re-running the script is harmless.
CREATE KEYSPACE IF NOT EXISTS mykeyspace
    WITH replication = {
        'class': 'SimpleStrategy',
        'replication_factor': '1'
    };

-- Typed key/value pair: a text key plus one value column per supported representation.
-- Field order is part of the type definition and must not be rearranged.
CREATE TYPE IF NOT EXISTS mykeyspace.keyvalue (
    key          text,
    value_type   text,    -- presumably names which value_* field carries the payload; confirm against the writer
    value_string text,
    value_bool   boolean,
    value_long   bigint,
    value_double double,
    value_binary blob
);

-- A timestamped log entry carrying a frozen list of key/value fields.
CREATE TYPE IF NOT EXISTS mykeyspace.log (
    ts     bigint,                                        -- microseconds since epoch
    fields frozen<list<frozen<mykeyspace.keyvalue>>>
);

-- Reference from one span to another, identified by (trace_id, span_id) and labelled with a reference type.
CREATE TYPE IF NOT EXISTS mykeyspace.span_ref (
    ref_type text,
    trace_id blob,
    span_id  bigint
);

-- Process description: a service name together with a frozen list of key/value tags.
CREATE TYPE IF NOT EXISTS mykeyspace.process (
    service_name text,
    tags         frozen<list<frozen<mykeyspace.keyvalue>>>
);

-- Span storage: one row per (trace_id, span_id, span_hash), partitioned by trace_id.
-- span_hash exists only for Zipkin backwards compatibility, because Zipkin allows spans with the same ID.
-- Note: Cassandra re-orders non-PK columns alphabetically, so the table looks different in CQLSH "describe table".
-- start_time is bigint instead of timestamp because microsecond precision is required.
CREATE TABLE IF NOT EXISTS mykeyspace.traces (
    trace_id        blob,
    span_id         bigint,
    span_hash       bigint,
    parent_id       bigint,
    operation_name  text,
    flags           int,
    start_time      bigint, -- microseconds since epoch
    duration        bigint, -- microseconds
    tags            list<frozen<mykeyspace.keyvalue>>,
    logs            list<frozen<mykeyspace.log>>,
    refs            list<frozen<mykeyspace.span_ref>>,
    process         frozen<mykeyspace.process>,
    PRIMARY KEY (trace_id, span_id, span_hash)
)
    -- NOTE(review): this 2-day window differs from the 1-hour window used by the index tables; confirm it is intentional.
    WITH compaction = {
        'compaction_window_size': '2',
        'compaction_window_unit': 'DAYS',
        'class': 'org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy'
    }
    AND default_time_to_live = 604800 -- 7 days, in seconds
    AND speculative_retry = 'NONE'
    -- Must stay non-zero: a gc_grace_seconds of 0 disables hinted handoff for the table.
    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes

-- Set of known service names; each name is its own partition.
CREATE TABLE IF NOT EXISTS mykeyspace.service_names (
    service_name text,
    PRIMARY KEY (service_name)
)
    WITH compaction = {
        'min_threshold': '4',
        'max_threshold': '32',
        'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'
    }
    AND default_time_to_live = 604800 -- 7 days, in seconds
    AND speculative_retry = 'NONE'
    -- Must stay non-zero: a gc_grace_seconds of 0 disables hinted handoff for the table.
    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes

-- Operation names per service: partitioned by service_name, clustered by span_kind and then operation_name.
CREATE TABLE IF NOT EXISTS mykeyspace.operation_names_v2 (
    service_name        text,
    span_kind           text,
    operation_name      text,
    PRIMARY KEY ((service_name), span_kind, operation_name)
)
    WITH compaction = {
        'min_threshold': '4',
        'max_threshold': '32',
        'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'
    }
    AND default_time_to_live = 604800 -- 7 days, in seconds
    AND speculative_retry = 'NONE'
    -- Must stay non-zero: a gc_grace_seconds of 0 disables hinted handoff for the table.
    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes

-- Index of trace IDs by service + operation names, sorted by span start_time (newest first).
CREATE TABLE IF NOT EXISTS mykeyspace.service_operation_index (
    service_name        text,
    operation_name      text,
    start_time          bigint, -- microseconds since epoch
    trace_id            blob,
    PRIMARY KEY ((service_name, operation_name), start_time)
) WITH CLUSTERING ORDER BY (start_time DESC)
    AND compaction = {
        'compaction_window_size': '1',
        'compaction_window_unit': 'HOURS',
        'class': 'org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy'
    }
    AND default_time_to_live = 604800 -- 7 days, in seconds
    AND speculative_retry = 'NONE'
    -- Must stay non-zero: a gc_grace_seconds of 0 disables hinted handoff for the table.
    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes

-- Index of trace IDs by service name, partitioned by (service_name, bucket) and sorted by span start_time (newest first).
CREATE TABLE IF NOT EXISTS mykeyspace.service_name_index (
    service_name      text,
    bucket            int,
    start_time        bigint, -- microseconds since epoch
    trace_id          blob,
    PRIMARY KEY ((service_name, bucket), start_time)
) WITH CLUSTERING ORDER BY (start_time DESC)
    AND compaction = {
        'compaction_window_size': '1',
        'compaction_window_unit': 'HOURS',
        'class': 'org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy'
    }
    AND default_time_to_live = 604800 -- 7 days, in seconds
    AND speculative_retry = 'NONE'
    -- Must stay non-zero: a gc_grace_seconds of 0 disables hinted handoff for the table.
    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes

-- Index of trace IDs by span duration within an hourly bucket, longest duration first and then newest first.
CREATE TABLE IF NOT EXISTS mykeyspace.duration_index (
    service_name    text,      -- service name
    operation_name  text,      -- operation name, or blank for queries without span name
    bucket          timestamp, -- time bucket: the start_time of the given span rounded to an hour
    duration        bigint,    -- span duration, in microseconds
    start_time      bigint,    -- microseconds since epoch
    trace_id        blob,
    PRIMARY KEY ((service_name, operation_name, bucket), duration, start_time, trace_id)
) WITH CLUSTERING ORDER BY (duration DESC, start_time DESC)
    AND compaction = {
        'compaction_window_size': '1',
        'compaction_window_unit': 'HOURS',
        'class': 'org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy'
    }
    AND default_time_to_live = 604800 -- 7 days, in seconds
    AND speculative_retry = 'NONE'
    -- Must stay non-zero: a gc_grace_seconds of 0 disables hinted handoff for the table.
    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes

-- Index of spans by (service_name, tag_key, tag_value), sorted by span start_time (newest first).
-- A bucketing strategy may have to be added for tag queries.
-- We can make this table even better by adding a timestamp to it.
CREATE TABLE IF NOT EXISTS mykeyspace.tag_index (
    service_name    text,
    tag_key         text,
    tag_value       text,
    start_time      bigint, -- microseconds since epoch
    trace_id        blob,
    span_id         bigint,
    PRIMARY KEY ((service_name, tag_key, tag_value), start_time, trace_id, span_id)
)
    WITH CLUSTERING ORDER BY (start_time DESC)
    AND compaction = {
        'compaction_window_size': '1',
        'compaction_window_unit': 'HOURS',
        'class': 'org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy'
    }
    AND default_time_to_live = 604800 -- 7 days, in seconds
    AND speculative_retry = 'NONE'
    -- Must stay non-zero: a gc_grace_seconds of 0 disables hinted handoff for the table.
    AND gc_grace_seconds = 10800; -- 3 hours of downtime acceptable on nodes

-- One dependency edge: parent and child names, a call count, and a source label.
CREATE TYPE IF NOT EXISTS mykeyspace.dependency (
    parent     text,
    child      text,
    call_count bigint,
    source     text
);

-- Dependency snapshots, partitioned by ts_bucket and ordered newest-first by ts.
-- The compaction strategy intentionally differs from the other tables because of the size of dependencies data.
CREATE TABLE IF NOT EXISTS mykeyspace.dependencies_v2 (
    ts_bucket       timestamp,
    ts              timestamp,
    dependencies    list<frozen<mykeyspace.dependency>>,
    PRIMARY KEY (ts_bucket, ts)
)
    WITH CLUSTERING ORDER BY (ts DESC)
    AND compaction = {
        'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy',
        'min_threshold': '4',
        'max_threshold': '32'
    }
    AND default_time_to_live = 0; -- 0 means rows carry no default TTL

-- Adaptive sampling tables (this one and sampling_probabilities).
-- See ./plugin/storage/cassandra/samplingstore/storage.go
-- Throughput records, partitioned by bucket and ordered newest-first by timeuuid.
CREATE TABLE IF NOT EXISTS mykeyspace.operation_throughput (
    bucket      int,
    ts          timeuuid,
    throughput  text,
    PRIMARY KEY (bucket, ts)
)
    WITH CLUSTERING ORDER BY (ts DESC);

-- Sampling probability records with the hostname stored alongside; partitioned by bucket, newest-first by timeuuid.
CREATE TABLE IF NOT EXISTS mykeyspace.sampling_probabilities (
    bucket          int,
    ts              timeuuid,
    hostname        text,
    probabilities   text,
    PRIMARY KEY (bucket, ts)
)
    WITH CLUSTERING ORDER BY (ts DESC);

-- Distributed lock: one row per lease name, recording its current owner.
-- See ./plugin/pkg/distributedlock/cassandra/lock.go
CREATE TABLE IF NOT EXISTS mykeyspace.leases (
    name    text PRIMARY KEY,
    owner   text
);