From b3fde99b0e79e81df459f9e7065225dac2a26174 Mon Sep 17 00:00:00 2001 From: Manfred Moser Date: Wed, 26 Jul 2023 19:58:32 -0700 Subject: [PATCH 1/2] Narrow pattern for reserved keyword test --- .../src/main/java/io/trino/sql/ReservedIdentifiers.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/trino-parser/src/main/java/io/trino/sql/ReservedIdentifiers.java b/core/trino-parser/src/main/java/io/trino/sql/ReservedIdentifiers.java index 2463c4992d05..1ef0cddda2e5 100644 --- a/core/trino-parser/src/main/java/io/trino/sql/ReservedIdentifiers.java +++ b/core/trino-parser/src/main/java/io/trino/sql/ReservedIdentifiers.java @@ -39,7 +39,7 @@ public final class ReservedIdentifiers { private static final Pattern IDENTIFIER = Pattern.compile("'([A-Z_]+)'"); - private static final Pattern TABLE_ROW = Pattern.compile("\\| `([A-Z_]+).*"); + private static final Pattern TABLE_ROW = Pattern.compile("\\| `([A-Z_]+)`.*"); private static final String TABLE_START = "| ------------------- |"; private static final String TABLE_ROW_START = "|"; From 181a804762d87b51e1bd2b83484aa039e7575554 Mon Sep 17 00:00:00 2001 From: Manfred Moser Date: Wed, 26 Jul 2023 20:32:46 -0700 Subject: [PATCH 2/2] Convert some connector docs to markdown source - Specifically ones that are not often changed and don't use included fragments --- docs/src/main/sphinx/connector/accumulo.md | 792 +++++++++++++++++ docs/src/main/sphinx/connector/accumulo.rst | 814 ------------------ .../sphinx/connector/{atop.rst => atop.md} | 72 +- .../src/main/sphinx/connector/googlesheets.md | 175 ++++ .../main/sphinx/connector/googlesheets.rst | 185 ---- docs/src/main/sphinx/connector/jmx.md | 137 +++ docs/src/main/sphinx/connector/jmx.rst | 132 --- docs/src/main/sphinx/connector/localfile.md | 34 + docs/src/main/sphinx/connector/localfile.rst | 40 - docs/src/main/sphinx/connector/memory.md | 104 +++ docs/src/main/sphinx/connector/memory.rst | 106 --- docs/src/main/sphinx/connector/mongodb.md | 506 +++++++++++ docs/src/main/sphinx/connector/mongodb.rst | 534 ------------ docs/src/main/sphinx/connector/prometheus.md | 132 +++ docs/src/main/sphinx/connector/prometheus.rst | 142 --- .../connector/{system.rst => system.md} | 103 ++- docs/src/main/sphinx/connector/thrift.md | 108 +++ docs/src/main/sphinx/connector/thrift.rst | 121 --- docs/src/main/sphinx/connector/tpcds.md | 72 ++ docs/src/main/sphinx/connector/tpcds.rst | 76 -- docs/src/main/sphinx/connector/tpch.md | 70 ++ docs/src/main/sphinx/connector/tpch.rst | 74 -- 22 files changed, 2214 insertions(+), 2315 deletions(-) create mode 100644 docs/src/main/sphinx/connector/accumulo.md delete mode 100644 docs/src/main/sphinx/connector/accumulo.rst rename docs/src/main/sphinx/connector/{atop.rst => atop.md} (73%) create mode 100644 docs/src/main/sphinx/connector/googlesheets.md delete mode 100644 docs/src/main/sphinx/connector/googlesheets.rst create mode 100644 docs/src/main/sphinx/connector/jmx.md delete mode 100644 docs/src/main/sphinx/connector/jmx.rst create mode 100644 docs/src/main/sphinx/connector/localfile.md delete mode 100644 docs/src/main/sphinx/connector/localfile.rst create mode 100644 docs/src/main/sphinx/connector/memory.md delete mode 100644 docs/src/main/sphinx/connector/memory.rst create mode 100644 docs/src/main/sphinx/connector/mongodb.md delete mode 100644 docs/src/main/sphinx/connector/mongodb.rst create mode 100644 docs/src/main/sphinx/connector/prometheus.md delete mode 100644 docs/src/main/sphinx/connector/prometheus.rst rename 
docs/src/main/sphinx/connector/{system.rst => system.md} (66%) create mode 100644 docs/src/main/sphinx/connector/thrift.md delete mode 100644 docs/src/main/sphinx/connector/thrift.rst create mode 100644 docs/src/main/sphinx/connector/tpcds.md delete mode 100644 docs/src/main/sphinx/connector/tpcds.rst create mode 100644 docs/src/main/sphinx/connector/tpch.md delete mode 100644 docs/src/main/sphinx/connector/tpch.rst diff --git a/docs/src/main/sphinx/connector/accumulo.md b/docs/src/main/sphinx/connector/accumulo.md new file mode 100644 index 000000000000..55813bee3723 --- /dev/null +++ b/docs/src/main/sphinx/connector/accumulo.md @@ -0,0 +1,792 @@ +# Accumulo connector + +```{raw} html + +``` + +The Accumulo connector supports reading and writing data from +[Apache Accumulo](https://accumulo.apache.org/). +Please read this page thoroughly to understand the capabilities and features of the connector. + +## Installing the iterator dependency + +The Accumulo connector uses custom Accumulo iterators in +order to push various information in SQL predicate clauses to Accumulo for +server-side filtering, known as *predicate pushdown*. In order +for the server-side iterators to work, you need to add the `trino-accumulo-iterators` +JAR file to Accumulo's `lib/ext` directory on each TabletServer node. + +```bash +# For each TabletServer node: +scp $TRINO_HOME/plugins/accumulo/trino-accumulo-iterators-*.jar [tabletserver_address]:$ACCUMULO_HOME/lib/ext + +# TabletServer should pick up new JAR files in ext directory, but may require restart +``` + +## Requirements + +To connect to Accumulo, you need: + +- Accumulo versions 1.x starting with 1.7.4. Versions 2.x are not supported. +- Network access from the Trino coordinator and workers to the Accumulo + Zookeeper server. Port 2181 is the default port. + +## Connector configuration + +Create `etc/catalog/example.properties` to mount the `accumulo` connector as +the `example` catalog, with the following connector properties as appropriate +for your setup: + +```text +connector.name=accumulo +accumulo.instance=xxx +accumulo.zookeepers=xxx +accumulo.username=username +accumulo.password=password +``` + +Replace the `accumulo.xxx` properties as required. + +## Configuration variables + +| Property name | Default value | Required | Description | +| -------------------------------------------- | ----------------- | -------- | -------------------------------------------------------------------------------- | +| `accumulo.instance` | (none) | Yes | Name of the Accumulo instance | +| `accumulo.zookeepers` | (none) | Yes | ZooKeeper connect string | +| `accumulo.username` | (none) | Yes | Accumulo user for Trino | +| `accumulo.password` | (none) | Yes | Accumulo password for user | +| `accumulo.zookeeper.metadata.root` | `/trino-accumulo` | No | Root znode for storing metadata. Only relevant if using default Metadata Manager | +| `accumulo.cardinality.cache.size` | `100000` | No | Sets the size of the index cardinality cache | +| `accumulo.cardinality.cache.expire.duration` | `5m` | No | Sets the expiration duration of the cardinality cache. | + +## Usage + +Simply begin using SQL to create a new table in Accumulo to begin +working with data. By default, the first column of the table definition +is set to the Accumulo row ID. This should be the primary key of your +table, and keep in mind that any `INSERT` statements containing the same +row ID is effectively an UPDATE as far as Accumulo is concerned, as any +previous data in the cell is overwritten. 
The row ID can be +any valid Trino datatype. If the first column is not your primary key, you +can set the row ID column using the `row_id` table property within the `WITH` +clause of your table definition. + +Simply issue a `CREATE TABLE` statement to create a new Trino/Accumulo table: + +``` +CREATE TABLE example_schema.scientists ( + recordkey VARCHAR, + name VARCHAR, + age BIGINT, + birthday DATE +); +``` + +```sql +DESCRIBE example_schema.scientists; +``` + +```text + Column | Type | Extra | Comment +-----------+---------+-------+--------------------------------------------------- + recordkey | varchar | | Accumulo row ID + name | varchar | | Accumulo column name:name. Indexed: false + age | bigint | | Accumulo column age:age. Indexed: false + birthday | date | | Accumulo column birthday:birthday. Indexed: false +``` + +This command creates a new Accumulo table with the `recordkey` column +as the Accumulo row ID. The name, age, and birthday columns are mapped to +auto-generated column family and qualifier values (which, in practice, +are both identical to the Trino column name). + +When creating a table using SQL, you can optionally specify a +`column_mapping` table property. The value of this property is a +comma-delimited list of triples, Trino column **:** Accumulo column +family **:** accumulo column qualifier, with one triple for every +non-row ID column. This sets the mapping of the Trino column name to +the corresponding Accumulo column family and column qualifier. + +If you don't specify the `column_mapping` table property, then the +connector auto-generates column names (respecting any configured locality groups). +Auto-generation of column names is only available for internal tables, so if your +table is external you must specify the column_mapping property. + +For a full list of table properties, see [Table Properties](accumulo-table-properties). + +For example: + +```sql +CREATE TABLE example_schema.scientists ( + recordkey VARCHAR, + name VARCHAR, + age BIGINT, + birthday DATE +) +WITH ( + column_mapping = 'name:metadata:name,age:metadata:age,birthday:metadata:date' +); +``` + +```sql +DESCRIBE example_schema.scientists; +``` + +```text + Column | Type | Extra | Comment +-----------+---------+-------+----------------------------------------------- + recordkey | varchar | | Accumulo row ID + name | varchar | | Accumulo column metadata:name. Indexed: false + age | bigint | | Accumulo column metadata:age. Indexed: false + birthday | date | | Accumulo column metadata:date. Indexed: false +``` + +You can then issue `INSERT` statements to put data into Accumulo. + +:::{note} +While issuing `INSERT` statements is convenient, +this method of loading data into Accumulo is low-throughput. You want +to use the Accumulo APIs to write `Mutations` directly to the tables. +See the section on [Loading Data](accumulo-loading-data) for more details. +::: + +```sql +INSERT INTO example_schema.scientists VALUES +('row1', 'Grace Hopper', 109, DATE '1906-12-09' ), +('row2', 'Alan Turing', 103, DATE '1912-06-23' ); +``` + +```sql +SELECT * FROM example_schema.scientists; +``` + +```text + recordkey | name | age | birthday +-----------+--------------+-----+------------ + row1 | Grace Hopper | 109 | 1906-12-09 + row2 | Alan Turing | 103 | 1912-06-23 +(2 rows) +``` + +As you'd expect, rows inserted into Accumulo via the shell or +programmatically will also show up when queried. (The Accumulo shell +thinks "-5321" is an option and not a number... so we'll just make TBL a +little younger.) 
+ +```bash +$ accumulo shell -u root -p secret +root@default> table example_schema.scientists +root@default example_schema.scientists> insert row3 metadata name "Tim Berners-Lee" +root@default example_schema.scientists> insert row3 metadata age 60 +root@default example_schema.scientists> insert row3 metadata date 5321 +``` + +```sql +SELECT * FROM example_schema.scientists; +``` + +```text + recordkey | name | age | birthday +-----------+-----------------+-----+------------ + row1 | Grace Hopper | 109 | 1906-12-09 + row2 | Alan Turing | 103 | 1912-06-23 + row3 | Tim Berners-Lee | 60 | 1984-07-27 +(3 rows) +``` + +You can also drop tables using `DROP TABLE`. This command drops both +metadata and the tables. See the below section on [External +Tables](accumulo-external-tables) for more details on internal and external +tables. + +```sql +DROP TABLE example_schema.scientists; +``` + +## Indexing columns + +Internally, the connector creates an Accumulo `Range` and packs it in +a split. This split gets passed to a Trino Worker to read the data from +the `Range` via a `BatchScanner`. When issuing a query that results +in a full table scan, each Trino Worker gets a single `Range` that +maps to a single tablet of the table. When issuing a query with a +predicate (i.e. `WHERE x = 10` clause), Trino passes the values +within the predicate (`10`) to the connector so it can use this +information to scan less data. When the Accumulo row ID is used as part +of the predicate clause, this narrows down the `Range` lookup to quickly +retrieve a subset of data from Accumulo. + +But what about the other columns? If you're frequently querying on +non-row ID columns, you should consider using the **indexing** +feature built into the Accumulo connector. This feature can drastically +reduce query runtime when selecting a handful of values from the table, +and the heavy lifting is done for you when loading data via Trino +`INSERT` statements. Keep in mind writing data to Accumulo via +`INSERT` does not have high throughput. + +To enable indexing, add the `index_columns` table property and specify +a comma-delimited list of Trino column names you wish to index (we use the +`string` serializer here to help with this example -- you +should be using the default `lexicoder` serializer). + +```sql +CREATE TABLE example_schema.scientists ( + recordkey VARCHAR, + name VARCHAR, + age BIGINT, + birthday DATE +) +WITH ( + serializer = 'string', + index_columns='name,age,birthday' +); +``` + +After creating the table, we see there are an additional two Accumulo +tables to store the index and metrics. + +```text +root@default> tables +accumulo.metadata +accumulo.root +example_schema.scientists +example_schema.scientists_idx +example_schema.scientists_idx_metrics +trace +``` + +After inserting data, we can look at the index table and see there are +indexed values for the name, age, and birthday columns. The connector +queries this index table + +```sql +INSERT INTO example_schema.scientists VALUES +('row1', 'Grace Hopper', 109, DATE '1906-12-09'), +('row2', 'Alan Turing', 103, DATE '1912-06-23'); +``` + +```text +root@default> scan -t example_schema.scientists_idx +-21011 metadata_date:row2 [] +-23034 metadata_date:row1 [] +103 metadata_age:row2 [] +109 metadata_age:row1 [] +Alan Turing metadata_name:row2 [] +Grace Hopper metadata_name:row1 [] +``` + +When issuing a query with a `WHERE` clause against indexed columns, +the connector searches the index table for all row IDs that contain the +value within the predicate. 
These row IDs are bundled into a Trino +split as single-value `Range` objects, the number of row IDs per split +is controlled by the value of `accumulo.index_rows_per_split`, and +passed to a Trino worker to be configured in the `BatchScanner` which +scans the data table. + +```sql +SELECT * FROM example_schema.scientists WHERE age = 109; +``` + +```text + recordkey | name | age | birthday +-----------+--------------+-----+------------ + row1 | Grace Hopper | 109 | 1906-12-09 +(1 row) +``` + +(accumulo-loading-data)= +## Loading data + +The Accumulo connector supports loading data via INSERT statements, however +this method tends to be low-throughput and should not be relied on when +throughput is a concern. + +(accumulo-external-tables)= +## External tables + +By default, the tables created using SQL statements via Trino are +*internal* tables, that is both the Trino table metadata and the +Accumulo tables are managed by Trino. When you create an internal +table, the Accumulo table is created as well. You receive an error +if the Accumulo table already exists. When an internal table is dropped +via Trino, the Accumulo table, and any index tables, are dropped as +well. + +To change this behavior, set the `external` property to `true` when +issuing the `CREATE` statement. This makes the table an *external* +table, and a `DROP TABLE` command **only** deletes the metadata +associated with the table. If the Accumulo tables do not already exist, +they are created by the connector. + +Creating an external table *will* set any configured locality groups as well +as the iterators on the index and metrics tables, if the table is indexed. +In short, the only difference between an external table and an internal table, +is that the connector deletes the Accumulo tables when a `DROP TABLE` command +is issued. + +External tables can be a bit more difficult to work with, as the data is stored +in an expected format. If the data is not stored correctly, then you're +gonna have a bad time. Users must provide a `column_mapping` property +when creating the table. This creates the mapping of Trino column name +to the column family/qualifier for the cell of the table. The value of the +cell is stored in the `Value` of the Accumulo key/value pair. By default, +this value is expected to be serialized using Accumulo's *lexicoder* API. +If you are storing values as strings, you can specify a different serializer +using the `serializer` property of the table. See the section on +[Table Properties](accumulo-table-properties) for more information. + +Next, we create the Trino external table. + +```sql +CREATE TABLE external_table ( + a VARCHAR, + b BIGINT, + c DATE +) +WITH ( + column_mapping = 'a:md:a,b:md:b,c:md:c', + external = true, + index_columns = 'b,c', + locality_groups = 'foo:b,c' +); +``` + +After creating the table, usage of the table continues as usual: + +```sql +INSERT INTO external_table VALUES +('1', 1, DATE '2015-03-06'), +('2', 2, DATE '2015-03-07'); +``` + +```sql +SELECT * FROM external_table; +``` + +```text + a | b | c +---+---+------------ + 1 | 1 | 2015-03-06 + 2 | 2 | 2015-03-06 +(2 rows) +``` + +```sql +DROP TABLE external_table; +``` + +After dropping the table, the table still exists in Accumulo because it is *external*. + +```text +root@default> tables +accumulo.metadata +accumulo.root +external_table +external_table_idx +external_table_idx_metrics +trace +``` + +If we wanted to add a new column to the table, we can create the table again and specify a new column. 
+Any existing rows in the table have a value of NULL. This command re-configures the Accumulo +tables, setting the locality groups and iterator configuration. + +```sql +CREATE TABLE external_table ( + a VARCHAR, + b BIGINT, + c DATE, + d INTEGER +) +WITH ( + column_mapping = 'a:md:a,b:md:b,c:md:c,d:md:d', + external = true, + index_columns = 'b,c,d', + locality_groups = 'foo:b,c,d' +); + +SELECT * FROM external_table; +``` + +```sql + a | b | c | d +---+---+------------+------ + 1 | 1 | 2015-03-06 | NULL + 2 | 2 | 2015-03-07 | NULL +(2 rows) +``` + +(accumulo-table-properties)= +## Table properties + +Table property usage example: + +```sql +CREATE TABLE example_schema.scientists ( + recordkey VARCHAR, + name VARCHAR, + age BIGINT, + birthday DATE +) +WITH ( + column_mapping = 'name:metadata:name,age:metadata:age,birthday:metadata:date', + index_columns = 'name,age' +); +``` + +| Property name | Default value | Description | +| ----------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `column_mapping` | (generated) | Comma-delimited list of column metadata: `col_name:col_family:col_qualifier,[...]`. Required for external tables. Not setting this property results in auto-generated column names. | +| `index_columns` | (none) | A comma-delimited list of Trino columns that are indexed in this table's corresponding index table | +| `external` | `false` | If true, Trino will only do metadata operations for the table. Otherwise, Trino will create and drop Accumulo tables where appropriate. | +| `locality_groups` | (none) | List of locality groups to set on the Accumulo table. Only valid on internal tables. String format is locality group name, colon, comma delimited list of column families in the group. Groups are delimited by pipes. Example: `group1:famA,famB,famC\|group2:famD,famE,famF\|etc...` | +| `row_id` | (first column) | Trino column name that maps to the Accumulo row ID. | +| `serializer` | `default` | Serializer for Accumulo data encodings. Can either be `default`, `string`, `lexicoder` or a Java class name. Default is `default`, i.e. the value from `AccumuloRowSerializer.getDefault()`, i.e. `lexicoder`. | +| `scan_auths` | (user auths) | Scan-time authorizations set on the batch scanner. | + +## Session properties + +You can change the default value of a session property by using {doc}`/sql/set-session`. +Note that session properties are prefixed with the catalog name: + +``` +SET SESSION example.column_filter_optimizations_enabled = false; +``` + +| Property name | Default value | Description | +| ------------------------------------------ | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `optimize_locality_enabled` | `true` | Set to true to enable data locality for non-indexed scans | +| `optimize_split_ranges_enabled` | `true` | Set to true to split non-indexed queries by tablet splits. Should generally be true. 
| +| `optimize_index_enabled` | `true` | Set to true to enable usage of the secondary index on query | +| `index_rows_per_split` | `10000` | The number of Accumulo row IDs that are packed into a single Trino split | +| `index_threshold` | `0.2` | The ratio between number of rows to be scanned based on the index over the total number of rows. If the ratio is below this threshold, the index will be used. | +| `index_lowest_cardinality_threshold` | `0.01` | The threshold where the column with the lowest cardinality will be used instead of computing an intersection of ranges in the index. Secondary index must be enabled | +| `index_metrics_enabled` | `true` | Set to true to enable usage of the metrics table to optimize usage of the index | +| `scan_username` | (config) | User to impersonate when scanning the tables. This property trumps the `scan_auths` table property | +| `index_short_circuit_cardinality_fetch` | `true` | Short circuit the retrieval of index metrics once any column is less than the lowest cardinality threshold | +| `index_cardinality_cache_polling_duration` | `10ms` | Sets the cardinality cache polling duration for short circuit retrieval of index metrics | + +## Adding columns + +Adding a new column to an existing table cannot be done today via +`ALTER TABLE [table] ADD COLUMN [name] [type]` because of the additional +metadata required for the columns to work; the column family, qualifier, +and if the column is indexed. + +## Serializers + +The Trino connector for Accumulo has a pluggable serializer framework +for handling I/O between Trino and Accumulo. This enables end-users the +ability to programmatically serialized and deserialize their special data +formats within Accumulo, while abstracting away the complexity of the +connector itself. + +There are two types of serializers currently available; a `string` +serializer that treats values as Java `String`, and a `lexicoder` +serializer that leverages Accumulo's Lexicoder API to store values. The +default serializer is the `lexicoder` serializer, as this serializer +does not require expensive conversion operations back and forth between +`String` objects and the Trino types -- the cell's value is encoded as a +byte array. + +Additionally, the `lexicoder` serializer does proper lexigraphical ordering of +numerical types like `BIGINT` or `TIMESTAMP`. This is essential for the connector +to properly leverage the secondary index when querying for data. + +You can change the default the serializer by specifying the +`serializer` table property, using either `default` (which is +`lexicoder`), `string` or `lexicoder` for the built-in types, or +you could provide your own implementation by extending +`AccumuloRowSerializer`, adding it to the Trino `CLASSPATH`, and +specifying the fully-qualified Java class name in the connector configuration. 
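To make the ordering property of the lexicoder encoding concrete, the following standalone sketch encodes two `BIGINT` values with Accumulo's `LongLexicoder` and confirms that the encoded byte arrays compare in the same order as the numbers, which is the property the secondary index relies on. This is illustrative only: it assumes the Accumulo client library is on the classpath, the class name is made up for the example, and the output merely resembles the `\x08`-prefixed byte sequences in the shell scan after the `default` serializer example below.

```java
import java.util.Arrays;

import org.apache.accumulo.core.client.lexicoder.LongLexicoder;

public class LexicoderOrderingCheck
{
    public static void main(String[] args)
    {
        LongLexicoder lexicoder = new LongLexicoder();

        // Encode two BIGINT ages the way a lexicoder-based serializer stores them
        byte[] alanAge = lexicoder.encode(103L);
        byte[] graceAge = lexicoder.encode(109L);

        // The encoded byte arrays sort (as unsigned bytes) in the same order as the
        // numeric values, which is what makes index range scans on numbers possible
        System.out.println(Arrays.compareUnsigned(alanAge, graceAge) < 0); // true: 103 < 109

        // Values round-trip back through decode
        System.out.println(lexicoder.decode(graceAge)); // 109
    }
}
```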
+ +```sql +CREATE TABLE example_schema.scientists ( + recordkey VARCHAR, + name VARCHAR, + age BIGINT, + birthday DATE +) +WITH ( + column_mapping = 'name:metadata:name,age:metadata:age,birthday:metadata:date', + serializer = 'default' +); +``` + +```sql +INSERT INTO example_schema.scientists VALUES +('row1', 'Grace Hopper', 109, DATE '1906-12-09' ), +('row2', 'Alan Turing', 103, DATE '1912-06-23' ); +``` + +```text +root@default> scan -t example_schema.scientists +row1 metadata:age [] \x08\x80\x00\x00\x00\x00\x00\x00m +row1 metadata:date [] \x08\x7F\xFF\xFF\xFF\xFF\xFF\xA6\x06 +row1 metadata:name [] Grace Hopper +row2 metadata:age [] \x08\x80\x00\x00\x00\x00\x00\x00g +row2 metadata:date [] \x08\x7F\xFF\xFF\xFF\xFF\xFF\xAD\xED +row2 metadata:name [] Alan Turing +``` + +```sql +CREATE TABLE example_schema.stringy_scientists ( + recordkey VARCHAR, + name VARCHAR, + age BIGINT, + birthday DATE +) +WITH ( + column_mapping = 'name:metadata:name,age:metadata:age,birthday:metadata:date', + serializer = 'string' +); +``` + +```sql +INSERT INTO example_schema.stringy_scientists VALUES +('row1', 'Grace Hopper', 109, DATE '1906-12-09' ), +('row2', 'Alan Turing', 103, DATE '1912-06-23' ); +``` + +```text +root@default> scan -t example_schema.stringy_scientists +row1 metadata:age [] 109 +row1 metadata:date [] -23034 +row1 metadata:name [] Grace Hopper +row2 metadata:age [] 103 +row2 metadata:date [] -21011 +row2 metadata:name [] Alan Turing +``` + +```sql +CREATE TABLE example_schema.custom_scientists ( + recordkey VARCHAR, + name VARCHAR, + age BIGINT, + birthday DATE +) +WITH ( + column_mapping = 'name:metadata:name,age:metadata:age,birthday:metadata:date', + serializer = 'my.serializer.package.MySerializer' +); +``` + +## Metadata management + +Metadata for the Trino/Accumulo tables is stored in ZooKeeper. You can, +and should, issue SQL statements in Trino to create and drop tables. +This is the easiest method of creating the metadata required to make the +connector work. It is best to not mess with the metadata, but here are +the details of how it is stored. + +A root node in ZooKeeper holds all the mappings, and the format is as +follows: + +```text +/metadata-root/schema/table +``` + +Where `metadata-root` is the value of `zookeeper.metadata.root` in +the config file (default is `/trino-accumulo`), `schema` is the +Trino schema (which is identical to the Accumulo namespace name), and +`table` is the Trino table name (again, identical to Accumulo name). +The data of the `table` ZooKeeper node is a serialized +`AccumuloTable` Java object (which resides in the connector code). +This table contains the schema (namespace) name, table name, column +definitions, the serializer to use for the table, and any additional +table properties. + +If you have a need to programmatically manipulate the ZooKeeper metadata +for Accumulo, take a look at +`io.trino.plugin.accumulo.metadata.ZooKeeperMetadataManager` for some +Java code to simplify the process. + +## Converting table from internal to external + +If your table is *internal*, you can convert it to an external table by deleting +the corresponding znode in ZooKeeper, effectively making the table no longer exist as +far as Trino is concerned. Then, create the table again using the same DDL, but adding the +`external = true` table property. + +For example: + +1\. We're starting with an internal table `foo.bar` that was created with the below DDL. 
+If you have not previously defined a table property for `column_mapping` (like this example), +be sure to describe the table **before** deleting the metadata. We need the column mappings +when creating the external table. + +```sql +CREATE TABLE foo.bar (a VARCHAR, b BIGINT, c DATE) +WITH ( + index_columns = 'b,c' +); +``` + +```sql +DESCRIBE foo.bar; +``` + +```text + Column | Type | Extra | Comment +--------+---------+-------+------------------------------------- + a | varchar | | Accumulo row ID + b | bigint | | Accumulo column b:b. Indexed: true + c | date | | Accumulo column c:c. Indexed: true +``` + +2\. Using the ZooKeeper CLI, delete the corresponding znode. Note this uses the default ZooKeeper +metadata root of `/trino-accumulo` + +```text +$ zkCli.sh +[zk: localhost:2181(CONNECTED) 1] delete /trino-accumulo/foo/bar +``` + +3\. Re-create the table using the same DDL as before, but adding the `external=true` property. +Note that if you had not previously defined the column_mapping, you need to add the property +to the new DDL (external tables require this property to be set). The column mappings are in +the output of the `DESCRIBE` statement. + +```sql +CREATE TABLE foo.bar ( + a VARCHAR, + b BIGINT, + c DATE +) +WITH ( + column_mapping = 'a:a:a,b:b:b,c:c:c', + index_columns = 'b,c', + external = true +); +``` + +(accumulo-type-mapping)= + +## Type mapping + +Because Trino and Accumulo each support types that the other does not, this +connector modifies some types when reading or writing data. Data types may not +map the same way in both directions between Trino and the data source. Refer to +the following sections for type mapping in each direction. + +### Accumulo type to Trino type mapping + +The connector maps Accumulo types to the corresponding Trino types following +this table: + +```{eval-rst} +.. list-table:: Accumulo type to Trino type mapping + :widths: 30, 20, 50 + :header-rows: 1 + + * - Accumulo type + - Trino type + - Notes + * - ``BOOLEAN`` + - ``BOOLEAN`` + - + * - ``TINYINT`` + - ``TINYINT`` + - + * - ``SMALLINT`` + - ``SMALLINT`` + - + * - ``INTEGER`` + - ``INTEGER`` + - + * - ``BIGINT`` + - ``BIGINT`` + - + * - ``REAL`` + - ``REAL`` + - + * - ``DOUBLE`` + - ``DOUBLE`` + - + * - ``VARCHAR(n)`` + - ``VARCHAR(n)`` + - + * - ``VARBINARY`` + - ``VARBINARY`` + - + * - ``DATE`` + - ``DATE`` + - + * - ``TIME(n)`` + - ``TIME(n)`` + - + * - ``TIMESTAMP(n)`` + - ``TIMESTAMP(n)`` + - +``` + +No other types are supported + +### Trino type to Accumulo type mapping + +The connector maps Trino types to the corresponding Trino type to Accumulo type +mapping types following this table: + +```{eval-rst} +.. list-table:: Trino type to Accumulo type mapping + :widths: 30, 20, 50 + :header-rows: 1 + + * - Trino type + - Accumulo type + - Notes + * - ``BOOLEAN`` + - ``BOOLEAN`` + - + * - ``TINYINT`` + - ``TINYINT`` + - Trino only supports writing values belonging to ``[0, 127]`` + * - ``SMALLINT`` + - ``SMALLINT`` + - + * - ``INTEGER`` + - ``INTEGER`` + - + * - ``BIGINT`` + - ``BIGINT`` + - + * - ``REAL`` + - ``REAL`` + - + * - ``DOUBLE`` + - ``DOUBLE`` + - + * - ``VARCHAR(n)`` + - ``VARCHAR(n)`` + - + * - ``VARBINARY`` + - ``VARBINARY`` + - + * - ``DATE`` + - ``DATE`` + - + * - ``TIME(n)`` + - ``TIME(n)`` + - + * - ``TIMESTAMP(n)`` + - ``TIMESTAMP(n)`` + - +``` + +No other types are supported + +(accumulo-sql-support)= + +## SQL support + +The connector provides read and write access to data and metadata in +the Accumulo database. 
In addition to the {ref}`globally available +` and {ref}`read operation ` +statements, the connector supports the following features: + +- {doc}`/sql/insert` +- {doc}`/sql/create-table` +- {doc}`/sql/create-table-as` +- {doc}`/sql/drop-table` +- {doc}`/sql/create-schema` +- {doc}`/sql/drop-schema` diff --git a/docs/src/main/sphinx/connector/accumulo.rst b/docs/src/main/sphinx/connector/accumulo.rst deleted file mode 100644 index 7f3d27bf6d71..000000000000 --- a/docs/src/main/sphinx/connector/accumulo.rst +++ /dev/null @@ -1,814 +0,0 @@ -Accumulo connector -================== - -.. raw:: html - - - -The Accumulo connector supports reading and writing data from -`Apache Accumulo `_. -Please read this page thoroughly to understand the capabilities and features of the connector. - -Installing the iterator dependency ----------------------------------- - -The Accumulo connector uses custom Accumulo iterators in -order to push various information in SQL predicate clauses to Accumulo for -server-side filtering, known as *predicate pushdown*. In order -for the server-side iterators to work, you need to add the ``trino-accumulo-iterators`` -JAR file to Accumulo's ``lib/ext`` directory on each TabletServer node. - -.. code-block:: bash - - # For each TabletServer node: - scp $TRINO_HOME/plugins/accumulo/trino-accumulo-iterators-*.jar [tabletserver_address]:$ACCUMULO_HOME/lib/ext - - # TabletServer should pick up new JAR files in ext directory, but may require restart - -Requirements ------------- - -To connect to Accumulo, you need: - -* Accumulo versions 1.x starting with 1.7.4. Versions 2.x are not supported. -* Network access from the Trino coordinator and workers to the Accumulo - Zookeeper server. Port 2181 is the default port. - -Connector configuration ------------------------ - -Create ``etc/catalog/example.properties`` to mount the ``accumulo`` connector as -the ``example`` catalog, with the following connector properties as appropriate -for your setup: - -.. code-block:: text - - connector.name=accumulo - accumulo.instance=xxx - accumulo.zookeepers=xxx - accumulo.username=username - accumulo.password=password - -Replace the ``accumulo.xxx`` properties as required. - -Configuration variables ------------------------ - -================================================ ====================== ========== ===================================================================================== -Property name Default value Required Description -================================================ ====================== ========== ===================================================================================== -``accumulo.instance`` (none) Yes Name of the Accumulo instance -``accumulo.zookeepers`` (none) Yes ZooKeeper connect string -``accumulo.username`` (none) Yes Accumulo user for Trino -``accumulo.password`` (none) Yes Accumulo password for user -``accumulo.zookeeper.metadata.root`` ``/trino-accumulo`` No Root znode for storing metadata. Only relevant if using default Metadata Manager -``accumulo.cardinality.cache.size`` ``100000`` No Sets the size of the index cardinality cache -``accumulo.cardinality.cache.expire.duration`` ``5m`` No Sets the expiration duration of the cardinality cache. -================================================ ====================== ========== ===================================================================================== - -Usage ------ - -Simply begin using SQL to create a new table in Accumulo to begin -working with data. 
By default, the first column of the table definition -is set to the Accumulo row ID. This should be the primary key of your -table, and keep in mind that any ``INSERT`` statements containing the same -row ID is effectively an UPDATE as far as Accumulo is concerned, as any -previous data in the cell is overwritten. The row ID can be -any valid Trino datatype. If the first column is not your primary key, you -can set the row ID column using the ``row_id`` table property within the ``WITH`` -clause of your table definition. - -Simply issue a ``CREATE TABLE`` statement to create a new Trino/Accumulo table:: - - CREATE TABLE example_schema.scientists ( - recordkey VARCHAR, - name VARCHAR, - age BIGINT, - birthday DATE - ); - -.. code-block:: sql - - DESCRIBE example_schema.scientists; - -.. code-block:: text - - Column | Type | Extra | Comment - -----------+---------+-------+--------------------------------------------------- - recordkey | varchar | | Accumulo row ID - name | varchar | | Accumulo column name:name. Indexed: false - age | bigint | | Accumulo column age:age. Indexed: false - birthday | date | | Accumulo column birthday:birthday. Indexed: false - -This command creates a new Accumulo table with the ``recordkey`` column -as the Accumulo row ID. The name, age, and birthday columns are mapped to -auto-generated column family and qualifier values (which, in practice, -are both identical to the Trino column name). - -When creating a table using SQL, you can optionally specify a -``column_mapping`` table property. The value of this property is a -comma-delimited list of triples, Trino column **:** Accumulo column -family **:** accumulo column qualifier, with one triple for every -non-row ID column. This sets the mapping of the Trino column name to -the corresponding Accumulo column family and column qualifier. - -If you don't specify the ``column_mapping`` table property, then the -connector auto-generates column names (respecting any configured locality groups). -Auto-generation of column names is only available for internal tables, so if your -table is external you must specify the column_mapping property. - -For a full list of table properties, see `Table Properties <#table-properties>`__. - -For example: - -.. code-block:: sql - - CREATE TABLE example_schema.scientists ( - recordkey VARCHAR, - name VARCHAR, - age BIGINT, - birthday DATE - ) - WITH ( - column_mapping = 'name:metadata:name,age:metadata:age,birthday:metadata:date' - ); - -.. code-block:: sql - - DESCRIBE example_schema.scientists; - -.. code-block:: text - - Column | Type | Extra | Comment - -----------+---------+-------+----------------------------------------------- - recordkey | varchar | | Accumulo row ID - name | varchar | | Accumulo column metadata:name. Indexed: false - age | bigint | | Accumulo column metadata:age. Indexed: false - birthday | date | | Accumulo column metadata:date. Indexed: false - -You can then issue ``INSERT`` statements to put data into Accumulo. - -.. note:: - - While issuing ``INSERT`` statements is convenient, - this method of loading data into Accumulo is low-throughput. You want - to use the Accumulo APIs to write ``Mutations`` directly to the tables. - See the section on `Loading Data <#loading-data>`__ for more details. - -.. code-block:: sql - - INSERT INTO example_schema.scientists VALUES - ('row1', 'Grace Hopper', 109, DATE '1906-12-09' ), - ('row2', 'Alan Turing', 103, DATE '1912-06-23' ); - -.. code-block:: sql - - SELECT * FROM example_schema.scientists; - -.. 
code-block:: text - - recordkey | name | age | birthday - -----------+--------------+-----+------------ - row1 | Grace Hopper | 109 | 1906-12-09 - row2 | Alan Turing | 103 | 1912-06-23 - (2 rows) - -As you'd expect, rows inserted into Accumulo via the shell or -programmatically will also show up when queried. (The Accumulo shell -thinks "-5321" is an option and not a number... so we'll just make TBL a -little younger.) - -.. code-block:: bash - - $ accumulo shell -u root -p secret - root@default> table example_schema.scientists - root@default example_schema.scientists> insert row3 metadata name "Tim Berners-Lee" - root@default example_schema.scientists> insert row3 metadata age 60 - root@default example_schema.scientists> insert row3 metadata date 5321 - -.. code-block:: sql - - SELECT * FROM example_schema.scientists; - -.. code-block:: text - - recordkey | name | age | birthday - -----------+-----------------+-----+------------ - row1 | Grace Hopper | 109 | 1906-12-09 - row2 | Alan Turing | 103 | 1912-06-23 - row3 | Tim Berners-Lee | 60 | 1984-07-27 - (3 rows) - -You can also drop tables using ``DROP TABLE``. This command drops both -metadata and the tables. See the below section on `External -Tables <#external-tables>`__ for more details on internal and external -tables. - -.. code-block:: sql - - DROP TABLE example_schema.scientists; - -Indexing columns ----------------- - -Internally, the connector creates an Accumulo ``Range`` and packs it in -a split. This split gets passed to a Trino Worker to read the data from -the ``Range`` via a ``BatchScanner``. When issuing a query that results -in a full table scan, each Trino Worker gets a single ``Range`` that -maps to a single tablet of the table. When issuing a query with a -predicate (i.e. ``WHERE x = 10`` clause), Trino passes the values -within the predicate (``10``) to the connector so it can use this -information to scan less data. When the Accumulo row ID is used as part -of the predicate clause, this narrows down the ``Range`` lookup to quickly -retrieve a subset of data from Accumulo. - -But what about the other columns? If you're frequently querying on -non-row ID columns, you should consider using the **indexing** -feature built into the Accumulo connector. This feature can drastically -reduce query runtime when selecting a handful of values from the table, -and the heavy lifting is done for you when loading data via Trino -``INSERT`` statements. Keep in mind writing data to Accumulo via -``INSERT`` does not have high throughput. - -To enable indexing, add the ``index_columns`` table property and specify -a comma-delimited list of Trino column names you wish to index (we use the -``string`` serializer here to help with this example -- you -should be using the default ``lexicoder`` serializer). - -.. code-block:: sql - - CREATE TABLE example_schema.scientists ( - recordkey VARCHAR, - name VARCHAR, - age BIGINT, - birthday DATE - ) - WITH ( - serializer = 'string', - index_columns='name,age,birthday' - ); - -After creating the table, we see there are an additional two Accumulo -tables to store the index and metrics. - -.. code-block:: text - - root@default> tables - accumulo.metadata - accumulo.root - example_schema.scientists - example_schema.scientists_idx - example_schema.scientists_idx_metrics - trace - -After inserting data, we can look at the index table and see there are -indexed values for the name, age, and birthday columns. The connector -queries this index table - -.. 
code-block:: sql - - INSERT INTO example_schema.scientists VALUES - ('row1', 'Grace Hopper', 109, DATE '1906-12-09'), - ('row2', 'Alan Turing', 103, DATE '1912-06-23'); - -.. code-block:: text - - root@default> scan -t example_schema.scientists_idx - -21011 metadata_date:row2 [] - -23034 metadata_date:row1 [] - 103 metadata_age:row2 [] - 109 metadata_age:row1 [] - Alan Turing metadata_name:row2 [] - Grace Hopper metadata_name:row1 [] - -When issuing a query with a ``WHERE`` clause against indexed columns, -the connector searches the index table for all row IDs that contain the -value within the predicate. These row IDs are bundled into a Trino -split as single-value ``Range`` objects, the number of row IDs per split -is controlled by the value of ``accumulo.index_rows_per_split``, and -passed to a Trino worker to be configured in the ``BatchScanner`` which -scans the data table. - -.. code-block:: sql - - SELECT * FROM example_schema.scientists WHERE age = 109; - -.. code-block:: text - - recordkey | name | age | birthday - -----------+--------------+-----+------------ - row1 | Grace Hopper | 109 | 1906-12-09 - (1 row) - -Loading data ------------- - -The Accumulo connector supports loading data via INSERT statements, however -this method tends to be low-throughput and should not be relied on when -throughput is a concern. - -External tables ---------------- - -By default, the tables created using SQL statements via Trino are -*internal* tables, that is both the Trino table metadata and the -Accumulo tables are managed by Trino. When you create an internal -table, the Accumulo table is created as well. You receive an error -if the Accumulo table already exists. When an internal table is dropped -via Trino, the Accumulo table, and any index tables, are dropped as -well. - -To change this behavior, set the ``external`` property to ``true`` when -issuing the ``CREATE`` statement. This makes the table an *external* -table, and a ``DROP TABLE`` command **only** deletes the metadata -associated with the table. If the Accumulo tables do not already exist, -they are created by the connector. - -Creating an external table *will* set any configured locality groups as well -as the iterators on the index and metrics tables, if the table is indexed. -In short, the only difference between an external table and an internal table, -is that the connector deletes the Accumulo tables when a ``DROP TABLE`` command -is issued. - -External tables can be a bit more difficult to work with, as the data is stored -in an expected format. If the data is not stored correctly, then you're -gonna have a bad time. Users must provide a ``column_mapping`` property -when creating the table. This creates the mapping of Trino column name -to the column family/qualifier for the cell of the table. The value of the -cell is stored in the ``Value`` of the Accumulo key/value pair. By default, -this value is expected to be serialized using Accumulo's *lexicoder* API. -If you are storing values as strings, you can specify a different serializer -using the ``serializer`` property of the table. See the section on -`Table Properties <#table-properties>`__ for more information. - -Next, we create the Trino external table. - -.. code-block:: sql - - CREATE TABLE external_table ( - a VARCHAR, - b BIGINT, - c DATE - ) - WITH ( - column_mapping = 'a:md:a,b:md:b,c:md:c', - external = true, - index_columns = 'b,c', - locality_groups = 'foo:b,c' - ); - -After creating the table, usage of the table continues as usual: - -.. 
code-block:: sql - - INSERT INTO external_table VALUES - ('1', 1, DATE '2015-03-06'), - ('2', 2, DATE '2015-03-07'); - -.. code-block:: sql - - SELECT * FROM external_table; - -.. code-block:: text - - a | b | c - ---+---+------------ - 1 | 1 | 2015-03-06 - 2 | 2 | 2015-03-06 - (2 rows) - -.. code-block:: sql - - DROP TABLE external_table; - -After dropping the table, the table still exists in Accumulo because it is *external*. - -.. code-block:: text - - root@default> tables - accumulo.metadata - accumulo.root - external_table - external_table_idx - external_table_idx_metrics - trace - -If we wanted to add a new column to the table, we can create the table again and specify a new column. -Any existing rows in the table have a value of NULL. This command re-configures the Accumulo -tables, setting the locality groups and iterator configuration. - -.. code-block:: sql - - CREATE TABLE external_table ( - a VARCHAR, - b BIGINT, - c DATE, - d INTEGER - ) - WITH ( - column_mapping = 'a:md:a,b:md:b,c:md:c,d:md:d', - external = true, - index_columns = 'b,c,d', - locality_groups = 'foo:b,c,d' - ); - - SELECT * FROM external_table; - -.. code-block:: sql - - a | b | c | d - ---+---+------------+------ - 1 | 1 | 2015-03-06 | NULL - 2 | 2 | 2015-03-07 | NULL - (2 rows) - -Table properties ----------------- - -Table property usage example: - -.. code-block:: sql - - CREATE TABLE example_schema.scientists ( - recordkey VARCHAR, - name VARCHAR, - age BIGINT, - birthday DATE - ) - WITH ( - column_mapping = 'name:metadata:name,age:metadata:age,birthday:metadata:date', - index_columns = 'name,age' - ); - -==================== ================ ====================================================================================================== -Property name Default value Description -==================== ================ ====================================================================================================== -``column_mapping`` (generated) Comma-delimited list of column metadata: ``col_name:col_family:col_qualifier,[...]``. - Required for external tables. Not setting this property results in auto-generated column names. -``index_columns`` (none) A comma-delimited list of Trino columns that are indexed in this table's corresponding index table -``external`` ``false`` If true, Trino will only do metadata operations for the table. - Otherwise, Trino will create and drop Accumulo tables where appropriate. -``locality_groups`` (none) List of locality groups to set on the Accumulo table. Only valid on internal tables. - String format is locality group name, colon, comma delimited list of column families in the group. - Groups are delimited by pipes. Example: ``group1:famA,famB,famC|group2:famD,famE,famF|etc...`` -``row_id`` (first column) Trino column name that maps to the Accumulo row ID. -``serializer`` ``default`` Serializer for Accumulo data encodings. Can either be ``default``, ``string``, ``lexicoder`` - or a Java class name. Default is ``default``, - i.e. the value from ``AccumuloRowSerializer.getDefault()``, i.e. ``lexicoder``. -``scan_auths`` (user auths) Scan-time authorizations set on the batch scanner. -==================== ================ ====================================================================================================== - -Session properties ------------------- - -You can change the default value of a session property by using :doc:`/sql/set-session`. 
-Note that session properties are prefixed with the catalog name:: - - SET SESSION example.column_filter_optimizations_enabled = false; - -============================================= ============= ======================================================================================================= -Property name Default value Description -============================================= ============= ======================================================================================================= -``optimize_locality_enabled`` ``true`` Set to true to enable data locality for non-indexed scans -``optimize_split_ranges_enabled`` ``true`` Set to true to split non-indexed queries by tablet splits. Should generally be true. -``optimize_index_enabled`` ``true`` Set to true to enable usage of the secondary index on query -``index_rows_per_split`` ``10000`` The number of Accumulo row IDs that are packed into a single Trino split -``index_threshold`` ``0.2`` The ratio between number of rows to be scanned based on the index over the total number of rows - If the ratio is below this threshold, the index will be used. -``index_lowest_cardinality_threshold`` ``0.01`` The threshold where the column with the lowest cardinality will be used instead of computing an - intersection of ranges in the index. Secondary index must be enabled -``index_metrics_enabled`` ``true`` Set to true to enable usage of the metrics table to optimize usage of the index -``scan_username`` (config) User to impersonate when scanning the tables. This property trumps the ``scan_auths`` table property -``index_short_circuit_cardinality_fetch`` ``true`` Short circuit the retrieval of index metrics once any column is less than the lowest cardinality threshold -``index_cardinality_cache_polling_duration`` ``10ms`` Sets the cardinality cache polling duration for short circuit retrieval of index metrics -============================================= ============= ======================================================================================================= - -Adding columns --------------- - -Adding a new column to an existing table cannot be done today via -``ALTER TABLE [table] ADD COLUMN [name] [type]`` because of the additional -metadata required for the columns to work; the column family, qualifier, -and if the column is indexed. - -Serializers ------------ - -The Trino connector for Accumulo has a pluggable serializer framework -for handling I/O between Trino and Accumulo. This enables end-users the -ability to programmatically serialized and deserialize their special data -formats within Accumulo, while abstracting away the complexity of the -connector itself. - -There are two types of serializers currently available; a ``string`` -serializer that treats values as Java ``String``, and a ``lexicoder`` -serializer that leverages Accumulo's Lexicoder API to store values. The -default serializer is the ``lexicoder`` serializer, as this serializer -does not require expensive conversion operations back and forth between -``String`` objects and the Trino types -- the cell's value is encoded as a -byte array. - -Additionally, the ``lexicoder`` serializer does proper lexigraphical ordering of -numerical types like ``BIGINT`` or ``TIMESTAMP``. This is essential for the connector -to properly leverage the secondary index when querying for data. 
- -You can change the default the serializer by specifying the -``serializer`` table property, using either ``default`` (which is -``lexicoder``), ``string`` or ``lexicoder`` for the built-in types, or -you could provide your own implementation by extending -``AccumuloRowSerializer``, adding it to the Trino ``CLASSPATH``, and -specifying the fully-qualified Java class name in the connector configuration. - -.. code-block:: sql - - CREATE TABLE example_schema.scientists ( - recordkey VARCHAR, - name VARCHAR, - age BIGINT, - birthday DATE - ) - WITH ( - column_mapping = 'name:metadata:name,age:metadata:age,birthday:metadata:date', - serializer = 'default' - ); - -.. code-block:: sql - - INSERT INTO example_schema.scientists VALUES - ('row1', 'Grace Hopper', 109, DATE '1906-12-09' ), - ('row2', 'Alan Turing', 103, DATE '1912-06-23' ); - -.. code-block:: text - - root@default> scan -t example_schema.scientists - row1 metadata:age [] \x08\x80\x00\x00\x00\x00\x00\x00m - row1 metadata:date [] \x08\x7F\xFF\xFF\xFF\xFF\xFF\xA6\x06 - row1 metadata:name [] Grace Hopper - row2 metadata:age [] \x08\x80\x00\x00\x00\x00\x00\x00g - row2 metadata:date [] \x08\x7F\xFF\xFF\xFF\xFF\xFF\xAD\xED - row2 metadata:name [] Alan Turing - -.. code-block:: sql - - CREATE TABLE example_schema.stringy_scientists ( - recordkey VARCHAR, - name VARCHAR, - age BIGINT, - birthday DATE - ) - WITH ( - column_mapping = 'name:metadata:name,age:metadata:age,birthday:metadata:date', - serializer = 'string' - ); - -.. code-block:: sql - - INSERT INTO example_schema.stringy_scientists VALUES - ('row1', 'Grace Hopper', 109, DATE '1906-12-09' ), - ('row2', 'Alan Turing', 103, DATE '1912-06-23' ); - -.. code-block:: text - - root@default> scan -t example_schema.stringy_scientists - row1 metadata:age [] 109 - row1 metadata:date [] -23034 - row1 metadata:name [] Grace Hopper - row2 metadata:age [] 103 - row2 metadata:date [] -21011 - row2 metadata:name [] Alan Turing - -.. code-block:: sql - - CREATE TABLE example_schema.custom_scientists ( - recordkey VARCHAR, - name VARCHAR, - age BIGINT, - birthday DATE - ) - WITH ( - column_mapping = 'name:metadata:name,age:metadata:age,birthday:metadata:date', - serializer = 'my.serializer.package.MySerializer' - ); - -Metadata management -------------------- - -Metadata for the Trino/Accumulo tables is stored in ZooKeeper. You can, -and should, issue SQL statements in Trino to create and drop tables. -This is the easiest method of creating the metadata required to make the -connector work. It is best to not mess with the metadata, but here are -the details of how it is stored. - -A root node in ZooKeeper holds all the mappings, and the format is as -follows: - -.. code-block:: text - - /metadata-root/schema/table - -Where ``metadata-root`` is the value of ``zookeeper.metadata.root`` in -the config file (default is ``/trino-accumulo``), ``schema`` is the -Trino schema (which is identical to the Accumulo namespace name), and -``table`` is the Trino table name (again, identical to Accumulo name). -The data of the ``table`` ZooKeeper node is a serialized -``AccumuloTable`` Java object (which resides in the connector code). -This table contains the schema (namespace) name, table name, column -definitions, the serializer to use for the table, and any additional -table properties. - -If you have a need to programmatically manipulate the ZooKeeper metadata -for Accumulo, take a look at -``io.trino.plugin.accumulo.metadata.ZooKeeperMetadataManager`` for some -Java code to simplify the process. 
- -Converting table from internal to external ------------------------------------------- - -If your table is *internal*, you can convert it to an external table by deleting -the corresponding znode in ZooKeeper, effectively making the table no longer exist as -far as Trino is concerned. Then, create the table again using the same DDL, but adding the -``external = true`` table property. - -For example: - -1. We're starting with an internal table ``foo.bar`` that was created with the below DDL. -If you have not previously defined a table property for ``column_mapping`` (like this example), -be sure to describe the table **before** deleting the metadata. We need the column mappings -when creating the external table. - -.. code-block:: sql - - CREATE TABLE foo.bar (a VARCHAR, b BIGINT, c DATE) - WITH ( - index_columns = 'b,c' - ); - -.. code-block:: sql - - DESCRIBE foo.bar; - -.. code-block:: text - - Column | Type | Extra | Comment - --------+---------+-------+------------------------------------- - a | varchar | | Accumulo row ID - b | bigint | | Accumulo column b:b. Indexed: true - c | date | | Accumulo column c:c. Indexed: true - -2. Using the ZooKeeper CLI, delete the corresponding znode. Note this uses the default ZooKeeper -metadata root of ``/trino-accumulo`` - -.. code-block:: text - - $ zkCli.sh - [zk: localhost:2181(CONNECTED) 1] delete /trino-accumulo/foo/bar - -3. Re-create the table using the same DDL as before, but adding the ``external=true`` property. -Note that if you had not previously defined the column_mapping, you need to add the property -to the new DDL (external tables require this property to be set). The column mappings are in -the output of the ``DESCRIBE`` statement. - -.. code-block:: sql - - CREATE TABLE foo.bar ( - a VARCHAR, - b BIGINT, - c DATE - ) - WITH ( - column_mapping = 'a:a:a,b:b:b,c:c:c', - index_columns = 'b,c', - external = true - ); - -.. _accumulo-type-mapping: - -Type mapping ------------- - -Because Trino and Accumulo each support types that the other does not, this -connector modifies some types when reading or writing data. Data types may not -map the same way in both directions between Trino and the data source. Refer to -the following sections for type mapping in each direction. - -Accumulo type to Trino type mapping -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The connector maps Accumulo types to the corresponding Trino types following -this table: - -.. list-table:: Accumulo type to Trino type mapping - :widths: 30, 20, 50 - :header-rows: 1 - - * - Accumulo type - - Trino type - - Notes - * - ``BOOLEAN`` - - ``BOOLEAN`` - - - * - ``TINYINT`` - - ``TINYINT`` - - - * - ``SMALLINT`` - - ``SMALLINT`` - - - * - ``INTEGER`` - - ``INTEGER`` - - - * - ``BIGINT`` - - ``BIGINT`` - - - * - ``REAL`` - - ``REAL`` - - - * - ``DOUBLE`` - - ``DOUBLE`` - - - * - ``VARCHAR(n)`` - - ``VARCHAR(n)`` - - - * - ``VARBINARY`` - - ``VARBINARY`` - - - * - ``DATE`` - - ``DATE`` - - - * - ``TIME(n)`` - - ``TIME(n)`` - - - * - ``TIMESTAMP(n)`` - - ``TIMESTAMP(n)`` - - - -No other types are supported - -Trino type to Accumulo type mapping -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The connector maps Trino types to the corresponding Trino type to Accumulo type -mapping types following this table: - -.. 
list-table:: Trino type to Accumulo type mapping - :widths: 30, 20, 50 - :header-rows: 1 - - * - Trino type - - Accumulo type - - Notes - * - ``BOOLEAN`` - - ``BOOLEAN`` - - - * - ``TINYINT`` - - ``TINYINT`` - - Trino only supports writing values belonging to ``[0, 127]`` - * - ``SMALLINT`` - - ``SMALLINT`` - - - * - ``INTEGER`` - - ``INTEGER`` - - - * - ``BIGINT`` - - ``BIGINT`` - - - * - ``REAL`` - - ``REAL`` - - - * - ``DOUBLE`` - - ``DOUBLE`` - - - * - ``VARCHAR(n)`` - - ``VARCHAR(n)`` - - - * - ``VARBINARY`` - - ``VARBINARY`` - - - * - ``DATE`` - - ``DATE`` - - - * - ``TIME(n)`` - - ``TIME(n)`` - - - * - ``TIMESTAMP(n)`` - - ``TIMESTAMP(n)`` - - - -No other types are supported - -.. _accumulo-sql-support: - -SQL support ------------ - -The connector provides read and write access to data and metadata in -the Accumulo database. In addition to the :ref:`globally available -` and :ref:`read operation ` -statements, the connector supports the following features: - -* :doc:`/sql/insert` -* :doc:`/sql/create-table` -* :doc:`/sql/create-table-as` -* :doc:`/sql/drop-table` -* :doc:`/sql/create-schema` -* :doc:`/sql/drop-schema` diff --git a/docs/src/main/sphinx/connector/atop.rst b/docs/src/main/sphinx/connector/atop.md similarity index 73% rename from docs/src/main/sphinx/connector/atop.rst rename to docs/src/main/sphinx/connector/atop.md index 0c457d34dd45..64eca21db683 100644 --- a/docs/src/main/sphinx/connector/atop.rst +++ b/docs/src/main/sphinx/connector/atop.md @@ -1,34 +1,30 @@ -============== -Atop connector -============== +# Atop connector -The Atop connector supports reading disk utilization statistics from the `Atop `_ +The Atop connector supports reading disk utilization statistics from the [Atop](https://www.atoptool.nl/) (Advanced System and Process Monitor) Linux server performance analysis tool. -Requirements ------------- +## Requirements In order to use this connector, the host on which the Trino worker is running -needs to have the ``atop`` tool installed locally. +needs to have the `atop` tool installed locally. -Connector configuration ------------------------ +## Connector configuration The connector can read disk utilization statistics on the Trino cluster. Create a catalog properties file that specifies the Atop connector by -setting the ``connector.name`` to ``atop``. +setting the `connector.name` to `atop`. -For example, create the file ``etc/catalog/example.properties`` with the +For example, create the file `etc/catalog/example.properties` with the following connector properties as appropriate for your setup: -.. code-block:: text +```text +connector.name=atop +atop.executable-path=/usr/bin/atop +``` - connector.name=atop - atop.executable-path=/usr/bin/atop - -Configuration properties ------------------------- +## Configuration properties +```{eval-rst} .. list-table:: :widths: 42, 18, 5, 35 :header-rows: 1 @@ -62,27 +58,29 @@ Configuration properties - Yes - The time zone identifier in which the atop data is collected. Generally the timezone of the host. Sample time zone identifiers: ``Europe/Vienna``, ``+0100``, ``UTC``. +``` -Usage ------ - -The Atop connector provides a ``default`` schema. - -The tables exposed by this connector can be retrieved by running ``SHOW TABLES``:: +## Usage - SHOW TABLES FROM example.default; +The Atop connector provides a `default` schema. -.. 
code-block:: text +The tables exposed by this connector can be retrieved by running `SHOW TABLES`: - Table - --------- - disks - reboots - (2 rows) +``` +SHOW TABLES FROM example.default; +``` +```text + Table +--------- + disks + reboots +(2 rows) +``` -The ``disks`` table offers disk utilization statistics recorded on the Trino node. +The `disks` table offers disk utilization statistics recorded on the Trino node. +```{eval-rst} .. list-table:: Disks columns :widths: 30, 30, 40 :header-rows: 1 @@ -120,9 +118,11 @@ The ``disks`` table offers disk utilization statistics recorded on the Trino nod * - ``sectors_written`` - ``BIGINT`` - Number of sectors transferred for write +``` -The ``reboots`` table offers information about the system reboots performed on the Trino node. +The `reboots` table offers information about the system reboots performed on the Trino node. +```{eval-rst} .. list-table:: Reboots columns :widths: 30, 30, 40 :header-rows: 1 @@ -137,10 +137,10 @@ The ``reboots`` table offers information about the system reboots performed on t - ``TIMESTAMP(3) WITH TIME ZONE`` - The boot/reboot timestamp +``` -SQL support ------------ +## SQL support -The connector provides :ref:`globally available ` and -:ref:`read operation ` statements to access system and process monitor +The connector provides {ref}`globally available ` and +{ref}`read operation ` statements to access system and process monitor information on your Trino nodes. diff --git a/docs/src/main/sphinx/connector/googlesheets.md b/docs/src/main/sphinx/connector/googlesheets.md new file mode 100644 index 000000000000..8615d6d58637 --- /dev/null +++ b/docs/src/main/sphinx/connector/googlesheets.md @@ -0,0 +1,175 @@ +# Google Sheets connector + +```{raw} html + +``` + +The Google Sheets connector allows reading and writing [Google Sheets](https://www.google.com/sheets/about/) spreadsheets as tables in Trino. + +## Configuration + +Create `etc/catalog/example.properties` to mount the Google Sheets connector +as the `example` catalog, with the following contents: + +```text +connector.name=gsheets +gsheets.credentials-path=/path/to/google-sheets-credentials.json +gsheets.metadata-sheet-id=exampleId +``` + +## Configuration properties + +The following configuration properties are available: + +| Property name | Description | +| ----------------------------- | ---------------------------------------------------------------- | +| `gsheets.credentials-path` | Path to the Google API JSON key file | +| `gsheets.credentials-key` | The base64 encoded credentials key | +| `gsheets.metadata-sheet-id` | Sheet ID of the spreadsheet, that contains the table mapping | +| `gsheets.max-data-cache-size` | Maximum number of spreadsheets to cache, defaults to `1000` | +| `gsheets.data-cache-ttl` | How long to cache spreadsheet data or metadata, defaults to `5m` | +| `gsheets.connection-timeout` | Timeout when connection to Google Sheets API, defaults to `20s` | +| `gsheets.read-timeout` | Timeout when reading from Google Sheets API, defaults to `20s` | +| `gsheets.write-timeout` | Timeout when writing to Google Sheets API, defaults to `20s` | + +## Credentials + +The connector requires credentials in order to access the Google Sheets API. + +1. Open the [Google Sheets API](https://console.developers.google.com/apis/library/sheets.googleapis.com) + page and click the *Enable* button. This takes you to the API manager page. +2. Select a project using the drop down menu at the top of the page. + Create a new project, if you do not already have one. 
+3. Choose *Credentials* in the left panel. +4. Click *Manage service accounts*, then create a service account for the connector. + On the *Create key* step, create and download a key in JSON format. + +The key file needs to be available on the Trino coordinator and workers. +Set the `gsheets.credentials-path` configuration property to point to this file. +The exact name of the file does not matter -- it can be named anything. + +Alternatively, set the `gsheets.credentials-key` configuration property. +It should contain the contents of the JSON file, encoded using base64. + +## Metadata sheet + +The metadata sheet is used to map table names to sheet IDs. +Create a new metadata sheet. The first row must be a header row +containing the following columns in this order: + +- Table Name +- Sheet ID +- Owner (optional) +- Notes (optional) + +See this [example sheet](https://docs.google.com/spreadsheets/d/1Es4HhWALUQjoa-bQh4a8B5HROz7dpGMfq_HbfoaW5LM) +as a reference. + +The metadata sheet must be shared with the service account user, +the one for which the key credentials file was created. Click the *Share* +button to share the sheet with the email address of the service account. + +Set the `gsheets.metadata-sheet-id` configuration property to the ID of this sheet. + +## Querying sheets + +The service account user must have access to the sheet in order for Trino +to query it. Click the *Share* button to share the sheet with the email +address of the service account. + +The sheet needs to be mapped to a Trino table name. Specify a table name +(column A) and the sheet ID (column B) in the metadata sheet. To refer +to a specific range in the sheet, add the range after the sheet ID, separated +with `#`. If a range is not provided, the connector loads only 10,000 rows by default from +the first tab in the sheet. + +The first row of the provided sheet range is used as the header and will determine the column +names of the Trino table. +For more details on sheet range syntax see the [google sheets docs](https://developers.google.com/sheets/api/guides/concepts). + +## Writing to sheets + +The same way sheets can be queried, they can also be written by appending data to existing sheets. +In this case the service account user must also have **Editor** permissions on the sheet. + +After data is written to a table, the table contents are removed from the cache +described in [API usage limits](gsheets-api-usage). If the table is accessed +immediately after the write, querying the Google Sheets API may not reflect the +change yet. In that case the old version of the table is read and cached for the +configured amount of time, and it might take some time for the written changes +to propagate properly. + +Keep in mind that the Google Sheets API has [usage limits](https://developers.google.com/sheets/api/limits), that limit the speed of inserting data. +If you run into timeouts you can increase timeout times to avoid `503: The service is currently unavailable` errors. + +(gsheets-api-usage)= +## API usage limits + +The Google Sheets API has [usage limits](https://developers.google.com/sheets/api/limits), +that may impact the usage of this connector. Increasing the cache duration and/or size +may prevent the limit from being reached. Running queries on the `information_schema.columns` +table without a schema and table name filter may lead to hitting the limit, as this requires +fetching the sheet data for every table, unless it is already cached. 
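For example, instead of listing every column in the catalog, restrict the query
to a single table so that only one sheet has to be fetched. The schema and
table names below are placeholders for entries in your own metadata sheet:

```
SELECT column_name, data_type
FROM example.information_schema.columns
WHERE table_schema = 'default'
  AND table_name = 'my_sheet_table';
```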
+ +## Type mapping + +Because Trino and Google Sheets each support types that the other does not, this +connector {ref}`modifies some types ` when reading data. + +### Google Sheets type to Trino type mapping + +The connector maps Google Sheets types to the corresponding Trino types +following this table: + +```{eval-rst} +.. list-table:: Google Sheets type to Trino type mapping + :widths: 30, 20 + :header-rows: 1 + + * - Google Sheets type + - Trino type + * - ``TEXT`` + - ``VARCHAR`` +``` + +No other types are supported. + +(google-sheets-sql-support)= + +## SQL support + +In addition to the {ref}`globally available ` and {ref}`read operation ` statements, +this connector supports the following features: + +- {doc}`/sql/insert` + +## Table functions + +The connector provides specific {doc}`/functions/table` to access Google Sheets. + +(google-sheets-sheet-function)= + +### `sheet(id, range) -> table` + +The `sheet` function allows you to query a Google Sheet directly without +specifying it as a named table in the metadata sheet. + +For example, for a catalog named 'example': + +``` +SELECT * +FROM + TABLE(example.system.sheet( + id => 'googleSheetIdHere')); +``` + +A sheet range or named range can be provided as an optional `range` argument. +The default sheet range is `$1:$10000` if one is not provided: + +``` +SELECT * +FROM + TABLE(example.system.sheet( + id => 'googleSheetIdHere', + range => 'TabName!A1:B4')); +``` diff --git a/docs/src/main/sphinx/connector/googlesheets.rst b/docs/src/main/sphinx/connector/googlesheets.rst deleted file mode 100644 index a8d55d758dbe..000000000000 --- a/docs/src/main/sphinx/connector/googlesheets.rst +++ /dev/null @@ -1,185 +0,0 @@ -======================= -Google Sheets connector -======================= - -.. raw:: html - - - -The Google Sheets connector allows reading and writing `Google Sheets `_ spreadsheets as tables in Trino. - -Configuration -------------- - -Create ``etc/catalog/example.properties`` to mount the Google Sheets connector -as the ``example`` catalog, with the following contents: - -.. code-block:: text - - connector.name=gsheets - gsheets.credentials-path=/path/to/google-sheets-credentials.json - gsheets.metadata-sheet-id=exampleId - -Configuration properties ------------------------- - -The following configuration properties are available: - -=================================== ===================================================================== -Property name Description -=================================== ===================================================================== -``gsheets.credentials-path`` Path to the Google API JSON key file -``gsheets.credentials-key`` The base64 encoded credentials key -``gsheets.metadata-sheet-id`` Sheet ID of the spreadsheet, that contains the table mapping -``gsheets.max-data-cache-size`` Maximum number of spreadsheets to cache, defaults to ``1000`` -``gsheets.data-cache-ttl`` How long to cache spreadsheet data or metadata, defaults to ``5m`` -``gsheets.connection-timeout`` Timeout when connection to Google Sheets API, defaults to ``20s`` -``gsheets.read-timeout`` Timeout when reading from Google Sheets API, defaults to ``20s`` -``gsheets.write-timeout`` Timeout when writing to Google Sheets API, defaults to ``20s`` -=================================== ===================================================================== - -Credentials ------------ - -The connector requires credentials in order to access the Google Sheets API. - -1. 
Open the `Google Sheets API `_ - page and click the *Enable* button. This takes you to the API manager page. - -2. Select a project using the drop down menu at the top of the page. - Create a new project, if you do not already have one. - -3. Choose *Credentials* in the left panel. - -4. Click *Manage service accounts*, then create a service account for the connector. - On the *Create key* step, create and download a key in JSON format. - -The key file needs to be available on the Trino coordinator and workers. -Set the ``gsheets.credentials-path`` configuration property to point to this file. -The exact name of the file does not matter -- it can be named anything. - -Alternatively, set the ``gsheets.credentials-key`` configuration property. -It should contain the contents of the JSON file, encoded using base64. - -Metadata sheet --------------- - -The metadata sheet is used to map table names to sheet IDs. -Create a new metadata sheet. The first row must be a header row -containing the following columns in this order: - -* Table Name -* Sheet ID -* Owner (optional) -* Notes (optional) - -See this `example sheet `_ -as a reference. - -The metadata sheet must be shared with the service account user, -the one for which the key credentials file was created. Click the *Share* -button to share the sheet with the email address of the service account. - -Set the ``gsheets.metadata-sheet-id`` configuration property to the ID of this sheet. - -Querying sheets ---------------- - -The service account user must have access to the sheet in order for Trino -to query it. Click the *Share* button to share the sheet with the email -address of the service account. - -The sheet needs to be mapped to a Trino table name. Specify a table name -(column A) and the sheet ID (column B) in the metadata sheet. To refer -to a specific range in the sheet, add the range after the sheet ID, separated -with ``#``. If a range is not provided, the connector loads only 10,000 rows by default from -the first tab in the sheet. - -The first row of the provided sheet range is used as the header and will determine the column -names of the Trino table. -For more details on sheet range syntax see the `google sheets docs `_. - -Writing to sheets ------------------ - -The same way sheets can be queried, they can also be written by appending data to existing sheets. -In this case the service account user must also have **Editor** permissions on the sheet. - -After data is written to a table, the table contents are removed from the cache described in `API usage limits`_. -If the table is accessed immediately after the write, querying the Google Sheets API may not reflect the change yet. -In that case the old version of the table is read and cached for the configured amount of time, -and it might take some time for the written changes to propagate properly. - -Keep in mind that the Google Sheets API has `usage limits `_, that limit the speed of inserting data. -If you run into timeouts you can increase timeout times to avoid ``503: The service is currently unavailable`` errors. - -API usage limits ----------------- - -The Google Sheets API has `usage limits `_, -that may impact the usage of this connector. Increasing the cache duration and/or size -may prevent the limit from being reached. Running queries on the ``information_schema.columns`` -table without a schema and table name filter may lead to hitting the limit, as this requires -fetching the sheet data for every table, unless it is already cached. 
- -Type mapping ------------- - -Because Trino and Google Sheets each support types that the other does not, this -connector :ref:`modifies some types ` when reading data. - -Google Sheets type to Trino type mapping -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The connector maps Google Sheets types to the corresponding Trino types -following this table: - -.. list-table:: Google Sheets type to Trino type mapping - :widths: 30, 20 - :header-rows: 1 - - * - Google Sheets type - - Trino type - * - ``TEXT`` - - ``VARCHAR`` - -No other types are supported. - -.. _google-sheets-sql-support: - -SQL support ------------ - -In addition to the :ref:`globally available ` and :ref:`read operation ` statements, -this connector supports the following features: - -* :doc:`/sql/insert` - -Table functions ---------------- - -The connector provides specific :doc:`/functions/table` to access Google Sheets. - -.. _google-sheets-sheet-function: - -``sheet(id, range) -> table`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ``sheet`` function allows you to query a Google Sheet directly without -specifying it as a named table in the metadata sheet. - -For example, for a catalog named 'example':: - - SELECT * - FROM - TABLE(example.system.sheet( - id => 'googleSheetIdHere')); - -A sheet range or named range can be provided as an optional ``range`` argument. -The default sheet range is ``$1:$10000`` if one is not provided:: - - SELECT * - FROM - TABLE(example.system.sheet( - id => 'googleSheetIdHere', - range => 'TabName!A1:B4')); diff --git a/docs/src/main/sphinx/connector/jmx.md b/docs/src/main/sphinx/connector/jmx.md new file mode 100644 index 000000000000..ab6cf6e83baf --- /dev/null +++ b/docs/src/main/sphinx/connector/jmx.md @@ -0,0 +1,137 @@ +# JMX connector + +The JMX connector provides the ability to query Java Management Extensions (JMX) +information from all +nodes in a Trino cluster. This is very useful for monitoring or debugging. +JMX provides information about the Java +Virtual Machine and all of the software running inside it. Trino itself +is heavily instrumented via JMX. + +This connector can be configured so that chosen JMX information is +periodically dumped and stored in memory for later access. + +## Configuration + +To configure the JMX connector, create a catalog properties file +`etc/catalog/example.properties` with the following contents: + +```text +connector.name=jmx +``` + +To enable periodical dumps, define the following properties: + +```text +connector.name=jmx +jmx.dump-tables=java.lang:type=Runtime,trino.execution.scheduler:name=NodeScheduler +jmx.dump-period=10s +jmx.max-entries=86400 +``` + +`dump-tables` is a comma separated list of Managed Beans (MBean). It specifies +which MBeans are sampled and stored in memory every `dump-period`. You can +configure the maximum number of history entries with `max-entries` and it +defaults to `86400`. The time between dumps can be configured using +`dump-period` and it defaults to `10s`. 
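Because a new sample is stored every `dump-period`, `max-entries` effectively
bounds the retention window: the defaults of `10s` and `86400` keep roughly ten
days of history. As a sketch, a configuration that retains about one hour of
history at a five-second sampling interval could look like this:

```text
connector.name=jmx
jmx.dump-tables=java.lang:type=Runtime
jmx.dump-period=5s
jmx.max-entries=720
```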
+ +Commas in MBean names must be escaped using double backslashes (`\\`) in the +following manner: + +```text +connector.name=jmx +jmx.dump-tables=trino.memory:name=general\\,type=memorypool,trino.memory:name=reserved\\,type=memorypool +``` + +Double backslashes are required because a single backslash (`\`) is used to +split the value across multiple lines in the following manner: + +```text +connector.name=jmx +jmx.dump-tables=trino.memory:name=general\\,type=memorypool,\ + trino.memory:name=reserved\\,type=memorypool +``` + +## Querying JMX + +The JMX connector provides two schemas. + +The first one is `current` that contains every MBean from every node in the Trino +cluster. You can see all of the available MBeans by running `SHOW TABLES`: + +``` +SHOW TABLES FROM example.current; +``` + +MBean names map to non-standard table names, and must be quoted with +double quotes when referencing them in a query. For example, the +following query shows the JVM version of every node: + +``` +SELECT node, vmname, vmversion +FROM example.current."java.lang:type=runtime"; +``` + +```text + node | vmname | vmversion +--------------------------------------+-----------------------------------+----------- + ddc4df17-0b8e-4843-bb14-1b8af1a7451a | Java HotSpot(TM) 64-Bit Server VM | 24.60-b09 +(1 row) +``` + +The following query shows the open and maximum file descriptor counts +for each node: + +``` +SELECT openfiledescriptorcount, maxfiledescriptorcount +FROM example.current."java.lang:type=operatingsystem"; +``` + +```text + openfiledescriptorcount | maxfiledescriptorcount +-------------------------+------------------------ + 329 | 10240 +(1 row) +``` + +The wildcard character `*` may be used with table names in the `current` schema. +This allows matching several MBean objects within a single query. The following query +returns information from the different Trino memory pools on each node: + +``` +SELECT freebytes, node, object_name +FROM example.current."trino.memory:*type=memorypool*"; +``` + +```text + freebytes | node | object_name +------------+---------+---------------------------------------------------------- + 214748364 | example | trino.memory:type=MemoryPool,name=reserved + 1073741825 | example | trino.memory:type=MemoryPool,name=general + 858993459 | example | trino.memory:type=MemoryPool,name=system +(3 rows) +``` + +The `history` schema contains the list of tables configured in the connector properties file. +The tables have the same columns as those in the current schema, but with an additional +timestamp column that stores the time at which the snapshot was taken: + +``` +SELECT "timestamp", "uptime" FROM example.history."java.lang:type=runtime"; +``` + +```text + timestamp | uptime +-------------------------+-------- + 2016-01-28 10:18:50.000 | 11420 + 2016-01-28 10:19:00.000 | 21422 + 2016-01-28 10:19:10.000 | 31412 +(3 rows) +``` + +(jmx-sql-support)= + +## SQL support + +The connector provides {ref}`globally available ` and +{ref}`read operation ` statements to access JMX information +on your Trino nodes. diff --git a/docs/src/main/sphinx/connector/jmx.rst b/docs/src/main/sphinx/connector/jmx.rst deleted file mode 100644 index 780d7810ec29..000000000000 --- a/docs/src/main/sphinx/connector/jmx.rst +++ /dev/null @@ -1,132 +0,0 @@ -============= -JMX connector -============= - -The JMX connector provides the ability to query Java Management Extensions (JMX) -information from all -nodes in a Trino cluster. This is very useful for monitoring or debugging. 
-JMX provides information about the Java -Virtual Machine and all of the software running inside it. Trino itself -is heavily instrumented via JMX. - -This connector can be configured so that chosen JMX information is -periodically dumped and stored in memory for later access. - -Configuration -------------- - -To configure the JMX connector, create a catalog properties file -``etc/catalog/example.properties`` with the following contents: - -.. code-block:: text - - connector.name=jmx - -To enable periodical dumps, define the following properties: - -.. code-block:: text - - connector.name=jmx - jmx.dump-tables=java.lang:type=Runtime,trino.execution.scheduler:name=NodeScheduler - jmx.dump-period=10s - jmx.max-entries=86400 - -``dump-tables`` is a comma separated list of Managed Beans (MBean). It specifies -which MBeans are sampled and stored in memory every ``dump-period``. You can -configure the maximum number of history entries with ``max-entries`` and it -defaults to ``86400``. The time between dumps can be configured using -``dump-period`` and it defaults to ``10s``. - -Commas in MBean names must be escaped using double backslashes (``\\``) in the -following manner: - -.. code-block:: text - - connector.name=jmx - jmx.dump-tables=trino.memory:name=general\\,type=memorypool,trino.memory:name=reserved\\,type=memorypool - -Double backslashes are required because a single backslash (``\``) is used to -split the value across multiple lines in the following manner: - -.. code-block:: text - - connector.name=jmx - jmx.dump-tables=trino.memory:name=general\\,type=memorypool,\ - trino.memory:name=reserved\\,type=memorypool - -Querying JMX ------------- - -The JMX connector provides two schemas. - -The first one is ``current`` that contains every MBean from every node in the Trino -cluster. You can see all of the available MBeans by running ``SHOW TABLES``:: - - SHOW TABLES FROM example.current; - -MBean names map to non-standard table names, and must be quoted with -double quotes when referencing them in a query. For example, the -following query shows the JVM version of every node:: - - SELECT node, vmname, vmversion - FROM example.current."java.lang:type=runtime"; - -.. code-block:: text - - node | vmname | vmversion - --------------------------------------+-----------------------------------+----------- - ddc4df17-0b8e-4843-bb14-1b8af1a7451a | Java HotSpot(TM) 64-Bit Server VM | 24.60-b09 - (1 row) - -The following query shows the open and maximum file descriptor counts -for each node:: - - SELECT openfiledescriptorcount, maxfiledescriptorcount - FROM example.current."java.lang:type=operatingsystem"; - -.. code-block:: text - - openfiledescriptorcount | maxfiledescriptorcount - -------------------------+------------------------ - 329 | 10240 - (1 row) - -The wildcard character ``*`` may be used with table names in the ``current`` schema. -This allows matching several MBean objects within a single query. The following query -returns information from the different Trino memory pools on each node:: - - SELECT freebytes, node, object_name - FROM example.current."trino.memory:*type=memorypool*"; - -.. 
code-block:: text - - freebytes | node | object_name - ------------+---------+---------------------------------------------------------- - 214748364 | example | trino.memory:type=MemoryPool,name=reserved - 1073741825 | example | trino.memory:type=MemoryPool,name=general - 858993459 | example | trino.memory:type=MemoryPool,name=system - (3 rows) - -The ``history`` schema contains the list of tables configured in the connector properties file. -The tables have the same columns as those in the current schema, but with an additional -timestamp column that stores the time at which the snapshot was taken:: - - SELECT "timestamp", "uptime" FROM example.history."java.lang:type=runtime"; - -.. code-block:: text - - timestamp | uptime - -------------------------+-------- - 2016-01-28 10:18:50.000 | 11420 - 2016-01-28 10:19:00.000 | 21422 - 2016-01-28 10:19:10.000 | 31412 - (3 rows) - -.. _jmx-sql-support: - -SQL support ------------ - -The connector provides :ref:`globally available ` and -:ref:`read operation ` statements to access JMX information -on your Trino nodes. diff --git a/docs/src/main/sphinx/connector/localfile.md b/docs/src/main/sphinx/connector/localfile.md new file mode 100644 index 000000000000..6c3e7663ad7a --- /dev/null +++ b/docs/src/main/sphinx/connector/localfile.md @@ -0,0 +1,34 @@ +# Local file connector + +The local file connector allows querying the HTTP request log files stored on +the local file system of each worker. + +## Configuration + +To configure the local file connector, create a catalog properties file under +`etc/catalog` named, for example, `example.properties` with the following +contents: + +```text +connector.name=localfile +``` + +## Configuration properties + +| Property name | Description | +| -------------------------------------- | ------------------------------------------------------------------------------------------ | +| `trino-logs.http-request-log.location` | Directory or file where HTTP request logs are written | +| `trino-logs.http-request-log.pattern` | If the log location is a directory, this glob is used to match file names in the directory | + +## Local file connector schemas and tables + +The local file connector provides a single schema named `logs`. +You can see all the available tables by running `SHOW TABLES`: + +``` +SHOW TABLES FROM example.logs; +``` + +### `http_request_log` + +This table contains the HTTP request logs from each node on the cluster. diff --git a/docs/src/main/sphinx/connector/localfile.rst b/docs/src/main/sphinx/connector/localfile.rst deleted file mode 100644 index 736146b11723..000000000000 --- a/docs/src/main/sphinx/connector/localfile.rst +++ /dev/null @@ -1,40 +0,0 @@ -==================== -Local file connector -==================== - -The local file connector allows querying the HTTP request log files stored on -the local file system of each worker. - -Configuration -------------- - -To configure the local file connector, create a catalog properties file under -``etc/catalog`` named, for example, ``example.properties`` with the following -contents: - -.. 
code-block:: text - - connector.name=localfile - -Configuration properties ------------------------- - -========================================= ============================================================== -Property name Description -========================================= ============================================================== -``trino-logs.http-request-log.location`` Directory or file where HTTP request logs are written -``trino-logs.http-request-log.pattern`` If the log location is a directory, this glob is used - to match file names in the directory -========================================= ============================================================== - -Local file connector schemas and tables ---------------------------------------- - -The local file connector provides a single schema named ``logs``. -You can see all the available tables by running ``SHOW TABLES``:: - - SHOW TABLES FROM example.logs; - -``http_request_log`` -^^^^^^^^^^^^^^^^^^^^ -This table contains the HTTP request logs from each node on the cluster. diff --git a/docs/src/main/sphinx/connector/memory.md b/docs/src/main/sphinx/connector/memory.md new file mode 100644 index 000000000000..1a4fcdb62f98 --- /dev/null +++ b/docs/src/main/sphinx/connector/memory.md @@ -0,0 +1,104 @@ +# Memory connector + +The Memory connector stores all data and metadata in RAM on workers +and both are discarded when Trino restarts. + +## Configuration + +To configure the Memory connector, create a catalog properties file +`etc/catalog/example.properties` with the following contents: + +```text +connector.name=memory +memory.max-data-per-node=128MB +``` + +`memory.max-data-per-node` defines memory limit for pages stored in this +connector per each node (default value is 128MB). + +## Examples + +Create a table using the Memory connector: + +``` +CREATE TABLE example.default.nation AS +SELECT * from tpch.tiny.nation; +``` + +Insert data into a table in the Memory connector: + +``` +INSERT INTO example.default.nation +SELECT * FROM tpch.tiny.nation; +``` + +Select from the Memory connector: + +``` +SELECT * FROM example.default.nation; +``` + +Drop table: + +``` +DROP TABLE example.default.nation; +``` + +(memory-type-mapping)= + +## Type mapping + +Trino supports all data types used within the Memory schemas so no mapping is +required. + +(memory-sql-support)= + +## SQL support + +The connector provides read and write access to temporary data and metadata +stored in memory. In addition to the {ref}`globally available +` and {ref}`read operation ` +statements, the connector supports the following features: + +- {doc}`/sql/insert` +- {doc}`/sql/create-table` +- {doc}`/sql/create-table-as` +- {doc}`/sql/drop-table` +- {doc}`/sql/create-schema` +- {doc}`/sql/drop-schema` +- {doc}`/sql/comment` + +### DROP TABLE + +Upon execution of a `DROP TABLE` operation, memory is not released +immediately. It is instead released after the next write operation to the +catalog. + +(memory-dynamic-filtering)= + +## Dynamic filtering + +The Memory connector supports the {doc}`dynamic filtering ` optimization. +Dynamic filters are pushed into local table scan on worker nodes for broadcast joins. + +### Delayed execution for dynamic filters + +For the Memory connector, a table scan is delayed until the collection of dynamic filters. +This can be disabled by using the configuration property `memory.enable-lazy-dynamic-filtering` +in the catalog file. 
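As a sketch, a catalog file that turns off this delayed execution could look
like the following. The explicit `false` value assumes the feature is enabled
by default, as implied above:

```text
connector.name=memory
memory.max-data-per-node=128MB
memory.enable-lazy-dynamic-filtering=false
```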
+ +## Limitations + +- When one worker fails/restarts, all data that was stored in its + memory is lost. To prevent silent data loss the + connector throws an error on any read access to such + corrupted table. +- When a query fails for any reason during writing to memory table, + the table enters an undefined state. The table should be dropped + and recreated manually. Reading attempts from the table may fail, + or may return partial data. +- When the coordinator fails/restarts, all metadata about tables is + lost. The tables remain on the workers, but become inaccessible. +- This connector does not work properly with multiple + coordinators, since each coordinator has different + metadata. diff --git a/docs/src/main/sphinx/connector/memory.rst b/docs/src/main/sphinx/connector/memory.rst deleted file mode 100644 index 97b819a1543c..000000000000 --- a/docs/src/main/sphinx/connector/memory.rst +++ /dev/null @@ -1,106 +0,0 @@ -================ -Memory connector -================ - -The Memory connector stores all data and metadata in RAM on workers -and both are discarded when Trino restarts. - -Configuration -------------- - -To configure the Memory connector, create a catalog properties file -``etc/catalog/example.properties`` with the following contents: - -.. code-block:: text - - connector.name=memory - memory.max-data-per-node=128MB - -``memory.max-data-per-node`` defines memory limit for pages stored in this -connector per each node (default value is 128MB). - -Examples --------- - -Create a table using the Memory connector:: - - CREATE TABLE example.default.nation AS - SELECT * from tpch.tiny.nation; - -Insert data into a table in the Memory connector:: - - INSERT INTO example.default.nation - SELECT * FROM tpch.tiny.nation; - -Select from the Memory connector:: - - SELECT * FROM example.default.nation; - -Drop table:: - - DROP TABLE example.default.nation; - -.. _memory-type-mapping: - -Type mapping ------------- - -Trino supports all data types used within the Memory schemas so no mapping is -required. - -.. _memory-sql-support: - -SQL support ------------ - -The connector provides read and write access to temporary data and metadata -stored in memory. In addition to the :ref:`globally available -` and :ref:`read operation ` -statements, the connector supports the following features: - -* :doc:`/sql/insert` -* :doc:`/sql/create-table` -* :doc:`/sql/create-table-as` -* :doc:`/sql/drop-table` -* :doc:`/sql/create-schema` -* :doc:`/sql/drop-schema` -* :doc:`/sql/comment` - -DROP TABLE -^^^^^^^^^^ - -Upon execution of a ``DROP TABLE`` operation, memory is not released -immediately. It is instead released after the next write operation to the -catalog. - -.. _memory-dynamic-filtering: - -Dynamic filtering ------------------ - -The Memory connector supports the :doc:`dynamic filtering ` optimization. -Dynamic filters are pushed into local table scan on worker nodes for broadcast joins. - -Delayed execution for dynamic filters -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For the Memory connector, a table scan is delayed until the collection of dynamic filters. -This can be disabled by using the configuration property ``memory.enable-lazy-dynamic-filtering`` -in the catalog file. - -Limitations ------------ - -* When one worker fails/restarts, all data that was stored in its - memory is lost. To prevent silent data loss the - connector throws an error on any read access to such - corrupted table. 
-* When a query fails for any reason during writing to memory table, - the table enters an undefined state. The table should be dropped - and recreated manually. Reading attempts from the table may fail, - or may return partial data. -* When the coordinator fails/restarts, all metadata about tables is - lost. The tables remain on the workers, but become inaccessible. -* This connector does not work properly with multiple - coordinators, since each coordinator has different - metadata. diff --git a/docs/src/main/sphinx/connector/mongodb.md b/docs/src/main/sphinx/connector/mongodb.md new file mode 100644 index 000000000000..f4967322312c --- /dev/null +++ b/docs/src/main/sphinx/connector/mongodb.md @@ -0,0 +1,506 @@ +# MongoDB connector + +```{raw} html + +``` + +The `mongodb` connector allows the use of [MongoDB](https://www.mongodb.com/) collections as tables in Trino. + +## Requirements + +To connect to MongoDB, you need: + +- MongoDB 4.2 or higher. +- Network access from the Trino coordinator and workers to MongoDB. + Port 27017 is the default port. +- Write access to the {ref}`schema information collection ` + in MongoDB. + +## Configuration + +To configure the MongoDB connector, create a catalog properties file +`etc/catalog/example.properties` with the following contents, +replacing the properties as appropriate: + +```text +connector.name=mongodb +mongodb.connection-url=mongodb://user:pass@sample.host:27017/ +``` + +### Multiple MongoDB clusters + +You can have as many catalogs as you need, so if you have additional +MongoDB clusters, simply add another properties file to `etc/catalog` +with a different name, making sure it ends in `.properties`). For +example, if you name the property file `sales.properties`, Trino +will create a catalog named `sales` using the configured connector. + +## Configuration properties + +The following configuration properties are available: + +| Property name | Description | +| ---------------------------------------- | -------------------------------------------------------------------------- | +| `mongodb.connection-url` | The connection url that the driver uses to connect to a MongoDB deployment | +| `mongodb.schema-collection` | A collection which contains schema information | +| `mongodb.case-insensitive-name-matching` | Match database and collection names case insensitively | +| `mongodb.min-connections-per-host` | The minimum size of the connection pool per host | +| `mongodb.connections-per-host` | The maximum size of the connection pool per host | +| `mongodb.max-wait-time` | The maximum wait time | +| `mongodb.max-connection-idle-time` | The maximum idle time of a pooled connection | +| `mongodb.connection-timeout` | The socket connect timeout | +| `mongodb.socket-timeout` | The socket timeout | +| `mongodb.tls.enabled` | Use TLS/SSL for connections to mongod/mongos | +| `mongodb.tls.keystore-path` | Path to the or JKS key store | +| `mongodb.tls.truststore-path` | Path to the or JKS trust store | +| `mongodb.tls.keystore-password` | Password for the key store | +| `mongodb.tls.truststore-password` | Password for the trust store | +| `mongodb.read-preference` | The read preference | +| `mongodb.write-concern` | The write concern | +| `mongodb.required-replica-set` | The required replica set name | +| `mongodb.cursor-batch-size` | The number of elements to return in a batch | + +### `mongodb.connection-url` + +A connection string containing the protocol, credential, and host info for use +inconnection to your MongoDB deployment. 
+ +For example, the connection string may use the format +`mongodb://:@:/?` or +`mongodb+srv://:@/?`, depending on the protocol +used. The user/pass credentials must be for a user with write access to the +{ref}`schema information collection `. + +See the [MongoDB Connection URI](https://docs.mongodb.com/drivers/java/sync/current/fundamentals/connection/#connection-uri) for more information. + +This property is required; there is no default. A connection URL must be +provided to connect to a MongoDB deployment. + +### `mongodb.schema-collection` + +As MongoDB is a document database, there is no fixed schema information in the system. So a special collection in each MongoDB database should define the schema of all tables. Please refer the {ref}`table-definition-label` section for the details. + +At startup, the connector tries to guess the data type of fields based on the {ref}`type mapping `. + +The initial guess can be incorrect for your specific collection. In that case, you need to modify it manually. Please refer the {ref}`table-definition-label` section for the details. + +Creating new tables using `CREATE TABLE` and `CREATE TABLE AS SELECT` automatically create an entry for you. + +This property is optional; the default is `_schema`. + +### `mongodb.case-insensitive-name-matching` + +Match database and collection names case insensitively. + +This property is optional; the default is `false`. + +### `mongodb.min-connections-per-host` + +The minimum number of connections per host for this MongoClient instance. Those connections are kept in a pool when idle, and the pool ensures over time that it contains at least this minimum number. + +This property is optional; the default is `0`. + +### `mongodb.connections-per-host` + +The maximum number of connections allowed per host for this MongoClient instance. Those connections are kept in a pool when idle. Once the pool is exhausted, any operation requiring a connection blocks waiting for an available connection. + +This property is optional; the default is `100`. + +### `mongodb.max-wait-time` + +The maximum wait time in milliseconds, that a thread may wait for a connection to become available. +A value of `0` means that it does not wait. A negative value means to wait indefinitely for a connection to become available. + +This property is optional; the default is `120000`. + +### `mongodb.max-connection-idle-time` + +The maximum idle time of a pooled connection in milliseconds. A value of `0` indicates no limit to the idle time. +A pooled connection that has exceeded its idle time will be closed and replaced when necessary by a new connection. + +This property is optional; the default is `0`. + +### `mongodb.connection-timeout` + +The connection timeout in milliseconds. A value of `0` means no timeout. It is used solely when establishing a new connection. + +This property is optional; the default is `10000`. + +### `mongodb.socket-timeout` + +The socket timeout in milliseconds. It is used for I/O socket read and write operations. + +This property is optional; the default is `0` and means no timeout. + +### `mongodb.tls.enabled` + +This flag enables TLS connections to MongoDB servers. + +This property is optional; the default is `false`. + +### `mongodb.tls.keystore-path` + +The path to the {doc}`PEM ` or +{doc}`JKS ` key store. + +This property is optional. + +### `mongodb.tls.truststore-path` + +The path to {doc}`PEM ` or +{doc}`JKS ` trust store. + +This property is optional. 
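For example, a catalog that connects over TLS can combine the properties above
as follows; the host, credentials, trust store path, and password are
placeholders for your own values:

```text
connector.name=mongodb
mongodb.connection-url=mongodb://user:pass@sample.host:27017/
mongodb.tls.enabled=true
mongodb.tls.truststore-path=/etc/trino/mongodb-truststore.jks
mongodb.tls.truststore-password=secret
```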
+ +### `mongodb.tls.keystore-password` + +The key password for the key store specified by `mongodb.tls.keystore-path`. + +This property is optional. + +### `mongodb.tls.truststore-password` + +The key password for the trust store specified by `mongodb.tls.truststore-path`. + +This property is optional. + +### `mongodb.read-preference` + +The read preference to use for queries, map-reduce, aggregation, and count. +The available values are `PRIMARY`, `PRIMARY_PREFERRED`, `SECONDARY`, `SECONDARY_PREFERRED` and `NEAREST`. + +This property is optional; the default is `PRIMARY`. + +### `mongodb.write-concern` + +The write concern to use. The available values are +`ACKNOWLEDGED`, `JOURNALED`, `MAJORITY` and `UNACKNOWLEDGED`. + +This property is optional; the default is `ACKNOWLEDGED`. + +### `mongodb.required-replica-set` + +The required replica set name. With this option set, the MongoClient instance performs the following actions: + +``` +#. Connect in replica set mode, and discover all members of the set based on the given servers +#. Make sure that the set name reported by all members matches the required set name. +#. Refuse to service any requests, if authenticated user is not part of a replica set with the required name. +``` + +This property is optional; no default value. + +### `mongodb.cursor-batch-size` + +Limits the number of elements returned in one batch. A cursor typically fetches a batch of result objects and stores them locally. +If batchSize is 0, Driver's default are used. +If batchSize is positive, it represents the size of each batch of objects retrieved. It can be adjusted to optimize performance and limit data transfer. +If batchSize is negative, it limits the number of objects returned, that fit within the max batch size limit (usually 4MB), and the cursor is closed. For example if batchSize is -10, then the server returns a maximum of 10 documents, and as many as can fit in 4MB, then closes the cursor. + +:::{note} +Do not use a batch size of `1`. +::: + +This property is optional; the default is `0`. + +(table-definition-label)= + +## Table definition + +MongoDB maintains table definitions on the special collection where `mongodb.schema-collection` configuration value specifies. + +:::{note} +There's no way for the plugin to detect a collection is deleted. +You need to delete the entry by `db.getCollection("_schema").remove( { table: deleted_table_name })` in the Mongo Shell. +Or drop a collection by running `DROP TABLE table_name` using Trino. +::: + +A schema collection consists of a MongoDB document for a table. + +```text +{ + "table": ..., + "fields": [ + { "name" : ..., + "type" : "varchar|bigint|boolean|double|date|array(bigint)|...", + "hidden" : false }, + ... + ] + } +} +``` + +The connector quotes the fields for a row type when auto-generating the schema. +However, if the schema is being fixed manually in the collection then +the fields need to be explicitly quoted. `row("UpperCase" varchar)` + +| Field | Required | Type | Description | +| -------- | -------- | ------ | ------------------------------------------------------------------------------------------- | +| `table` | required | string | Trino table name | +| `fields` | required | array | A list of field definitions. Each field definition creates a new column in the Trino table. | + +Each field definition: + +```text +{ + "name": ..., + "type": ..., + "hidden": ... 
+} +``` + +| Field | Required | Type | Description | +| -------- | -------- | ------- | ---------------------------------------------------------------------------------- | +| `name` | required | string | Name of the column in the Trino table. | +| `type` | required | string | Trino type of the column. | +| `hidden` | optional | boolean | Hides the column from `DESCRIBE ` and `SELECT *`. Defaults to `false`. | + +There is no limit on field descriptions for either key or message. + +## ObjectId + +MongoDB collection has the special field `_id`. The connector tries to follow the same rules for this special field, so there will be hidden field `_id`. + +```sql +CREATE TABLE IF NOT EXISTS orders ( + orderkey BIGINT, + orderstatus VARCHAR, + totalprice DOUBLE, + orderdate DATE +); + +INSERT INTO orders VALUES(1, 'bad', 50.0, current_date); +INSERT INTO orders VALUES(2, 'good', 100.0, current_date); +SELECT _id, * FROM orders; +``` + +```text + _id | orderkey | orderstatus | totalprice | orderdate +-------------------------------------+----------+-------------+------------+------------ + 55 b1 51 63 38 64 d6 43 8c 61 a9 ce | 1 | bad | 50.0 | 2015-07-23 + 55 b1 51 67 38 64 d6 43 8c 61 a9 cf | 2 | good | 100.0 | 2015-07-23 +(2 rows) +``` + +```sql +SELECT _id, * FROM orders WHERE _id = ObjectId('55b151633864d6438c61a9ce'); +``` + +```text + _id | orderkey | orderstatus | totalprice | orderdate +-------------------------------------+----------+-------------+------------+------------ + 55 b1 51 63 38 64 d6 43 8c 61 a9 ce | 1 | bad | 50.0 | 2015-07-23 +(1 row) +``` + +You can render the `_id` field to readable values with a cast to `VARCHAR`: + +```sql +SELECT CAST(_id AS VARCHAR), * FROM orders WHERE _id = ObjectId('55b151633864d6438c61a9ce'); +``` + +```text + _id | orderkey | orderstatus | totalprice | orderdate +---------------------------+----------+-------------+------------+------------ + 55b151633864d6438c61a9ce | 1 | bad | 50.0 | 2015-07-23 +(1 row) +``` + +### ObjectId timestamp functions + +The first four bytes of each [ObjectId](https://docs.mongodb.com/manual/reference/method/ObjectId) represent +an embedded timestamp of its creation time. Trino provides a couple of functions to take advantage of this MongoDB feature. + +```{eval-rst} +.. function:: objectid_timestamp(ObjectId) -> timestamp + + Extracts the TIMESTAMP WITH TIME ZONE from a given ObjectId:: + + SELECT objectid_timestamp(ObjectId('507f191e810c19729de860ea')); + -- 2012-10-17 20:46:22.000 UTC +``` + +```{eval-rst} +.. function:: timestamp_objectid(timestamp) -> ObjectId + + Creates an ObjectId from a TIMESTAMP WITH TIME ZONE:: + + SELECT timestamp_objectid(TIMESTAMP '2021-08-07 17:51:36 +00:00'); + -- 61 0e c8 28 00 00 00 00 00 00 00 00 +``` + +In MongoDB, you can filter all the documents created after `2021-08-07 17:51:36` +with a query like this: + +```text +db.collection.find({"_id": {"$gt": ObjectId("610ec8280000000000000000")}}) +``` + +In Trino, the same can be achieved with this query: + +```sql +SELECT * +FROM collection +WHERE _id > timestamp_objectid(TIMESTAMP '2021-08-07 17:51:36 +00:00'); +``` + +(mongodb-type-mapping)= + +## Type mapping + +Because Trino and MongoDB each support types that the other does not, this +connector {ref}`modifies some types ` when reading or +writing data. Data types may not map the same way in both directions between +Trino and the data source. Refer to the following sections for type mapping in +each direction. 
+ +### MongoDB to Trino type mapping + +The connector maps MongoDB types to the corresponding Trino types following +this table: + +```{eval-rst} +.. list-table:: MongoDB to Trino type mapping + :widths: 30, 20, 50 + :header-rows: 1 + + * - MongoDB type + - Trino type + - Notes + * - ``Boolean`` + - ``BOOLEAN`` + - + * - ``Int32`` + - ``BIGINT`` + - + * - ``Int64`` + - ``BIGINT`` + - + * - ``Double`` + - ``DOUBLE`` + - + * - ``Decimal128`` + - ``DECIMAL(p, s)`` + - + * - ``Date`` + - ``TIMESTAMP(3)`` + - + * - ``String`` + - ``VARCHAR`` + - + * - ``Binary`` + - ``VARBINARY`` + - + * - ``ObjectId`` + - ``ObjectId`` + - + * - ``Object`` + - ``ROW`` + - + * - ``Array`` + - ``ARRAY`` + - Map to ``ROW`` if the element type is not unique. + * - ``DBRef`` + - ``ROW`` + - +``` + +No other types are supported. + +### Trino to MongoDB type mapping + +The connector maps Trino types to the corresponding MongoDB types following +this table: + +```{eval-rst} +.. list-table:: Trino to MongoDB type mapping + :widths: 30, 20 + :header-rows: 1 + + * - Trino type + - MongoDB type + * - ``BOOLEAN`` + - ``Boolean`` + * - ``BIGINT`` + - ``Int64`` + * - ``DOUBLE`` + - ``Double`` + * - ``DECIMAL(p, s)`` + - ``Decimal128`` + * - ``TIMESTAMP(3)`` + - ``Date`` + * - ``VARCHAR`` + - ``String`` + * - ``VARBINARY`` + - ``Binary`` + * - ``ObjectId`` + - ``ObjectId`` + * - ``ROW`` + - ``Object`` + * - ``ARRAY`` + - ``Array`` +``` + +No other types are supported. + +(mongodb-sql-support)= + +## SQL support + +The connector provides read and write access to data and metadata in +MongoDB. In addition to the {ref}`globally available +` and {ref}`read operation ` +statements, the connector supports the following features: + +- {doc}`/sql/insert` +- {doc}`/sql/delete` +- {doc}`/sql/create-table` +- {doc}`/sql/create-table-as` +- {doc}`/sql/drop-table` +- {doc}`/sql/alter-table` +- {doc}`/sql/create-schema` +- {doc}`/sql/drop-schema` +- {doc}`/sql/comment` + +### ALTER TABLE + +The connector supports `ALTER TABLE RENAME TO`, `ALTER TABLE ADD COLUMN` +and `ALTER TABLE DROP COLUMN` operations. +Other uses of `ALTER TABLE` are not supported. + +(mongodb-fte-support)= + +## Fault-tolerant execution support + +The connector supports {doc}`/admin/fault-tolerant-execution` of query +processing. Read and write operations are both supported with any retry policy. + +## Table functions + +The connector provides specific {doc}`table functions ` to +access MongoDB. + +(mongodb-query-function)= + +### `query(database, collection, filter) -> table` + +The `query` function allows you to query the underlying MongoDB directly. It +requires syntax native to MongoDB, because the full query is pushed down and +processed by MongoDB. This can be useful for accessing native features which are +not available in Trino or for improving query performance in situations where +running a query natively may be faster. + +For example, get all rows where `regionkey` field is 0: + +``` +SELECT + * +FROM + TABLE( + example.system.query( + database => 'tpch', + collection => 'region', + filter => '{ regionkey: 0 }' + ) + ); +``` diff --git a/docs/src/main/sphinx/connector/mongodb.rst b/docs/src/main/sphinx/connector/mongodb.rst deleted file mode 100644 index 6d8ebd2f3ca4..000000000000 --- a/docs/src/main/sphinx/connector/mongodb.rst +++ /dev/null @@ -1,534 +0,0 @@ -================= -MongoDB connector -================= - -.. raw:: html - - - -The ``mongodb`` connector allows the use of `MongoDB `_ collections as tables in Trino. 
- - -Requirements ------------- - -To connect to MongoDB, you need: - -* MongoDB 4.2 or higher. -* Network access from the Trino coordinator and workers to MongoDB. - Port 27017 is the default port. -* Write access to the :ref:`schema information collection ` - in MongoDB. - -Configuration -------------- - -To configure the MongoDB connector, create a catalog properties file -``etc/catalog/example.properties`` with the following contents, -replacing the properties as appropriate: - -.. code-block:: text - - connector.name=mongodb - mongodb.connection-url=mongodb://user:pass@sample.host:27017/ - -Multiple MongoDB clusters -^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can have as many catalogs as you need, so if you have additional -MongoDB clusters, simply add another properties file to ``etc/catalog`` -with a different name, making sure it ends in ``.properties``). For -example, if you name the property file ``sales.properties``, Trino -will create a catalog named ``sales`` using the configured connector. - -Configuration properties ------------------------- - -The following configuration properties are available: - -========================================== ============================================================== -Property name Description -========================================== ============================================================== -``mongodb.connection-url`` The connection url that the driver uses to connect to a MongoDB deployment -``mongodb.schema-collection`` A collection which contains schema information -``mongodb.case-insensitive-name-matching`` Match database and collection names case insensitively -``mongodb.min-connections-per-host`` The minimum size of the connection pool per host -``mongodb.connections-per-host`` The maximum size of the connection pool per host -``mongodb.max-wait-time`` The maximum wait time -``mongodb.max-connection-idle-time`` The maximum idle time of a pooled connection -``mongodb.connection-timeout`` The socket connect timeout -``mongodb.socket-timeout`` The socket timeout -``mongodb.tls.enabled`` Use TLS/SSL for connections to mongod/mongos -``mongodb.tls.keystore-path`` Path to the or JKS key store -``mongodb.tls.truststore-path`` Path to the or JKS trust store -``mongodb.tls.keystore-password`` Password for the key store -``mongodb.tls.truststore-password`` Password for the trust store -``mongodb.read-preference`` The read preference -``mongodb.write-concern`` The write concern -``mongodb.required-replica-set`` The required replica set name -``mongodb.cursor-batch-size`` The number of elements to return in a batch -========================================== ============================================================== - -``mongodb.connection-url`` -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A connection string containing the protocol, credential, and host info for use -inconnection to your MongoDB deployment. - -For example, the connection string may use the format -``mongodb://:@:/?`` or -``mongodb+srv://:@/?``, depending on the protocol -used. The user/pass credentials must be for a user with write access to the -:ref:`schema information collection `. - -See the `MongoDB Connection URI `_ for more information. - -This property is required; there is no default. A connection URL must be -provided to connect to a MongoDB deployment. - -``mongodb.schema-collection`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -As MongoDB is a document database, there is no fixed schema information in the system. 
So a special collection in each MongoDB database should define the schema of all tables. Please refer the :ref:`table-definition-label` section for the details. - -At startup, the connector tries to guess the data type of fields based on the :ref:`type mapping `. - -The initial guess can be incorrect for your specific collection. In that case, you need to modify it manually. Please refer the :ref:`table-definition-label` section for the details. - -Creating new tables using ``CREATE TABLE`` and ``CREATE TABLE AS SELECT`` automatically create an entry for you. - -This property is optional; the default is ``_schema``. - -``mongodb.case-insensitive-name-matching`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Match database and collection names case insensitively. - -This property is optional; the default is ``false``. - -``mongodb.min-connections-per-host`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The minimum number of connections per host for this MongoClient instance. Those connections are kept in a pool when idle, and the pool ensures over time that it contains at least this minimum number. - -This property is optional; the default is ``0``. - -``mongodb.connections-per-host`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The maximum number of connections allowed per host for this MongoClient instance. Those connections are kept in a pool when idle. Once the pool is exhausted, any operation requiring a connection blocks waiting for an available connection. - -This property is optional; the default is ``100``. - -``mongodb.max-wait-time`` -^^^^^^^^^^^^^^^^^^^^^^^^^ - -The maximum wait time in milliseconds, that a thread may wait for a connection to become available. -A value of ``0`` means that it does not wait. A negative value means to wait indefinitely for a connection to become available. - -This property is optional; the default is ``120000``. - -``mongodb.max-connection-idle-time`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The maximum idle time of a pooled connection in milliseconds. A value of ``0`` indicates no limit to the idle time. -A pooled connection that has exceeded its idle time will be closed and replaced when necessary by a new connection. - -This property is optional; the default is ``0``. - -``mongodb.connection-timeout`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The connection timeout in milliseconds. A value of ``0`` means no timeout. It is used solely when establishing a new connection. - -This property is optional; the default is ``10000``. - -``mongodb.socket-timeout`` -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The socket timeout in milliseconds. It is used for I/O socket read and write operations. - -This property is optional; the default is ``0`` and means no timeout. - -``mongodb.tls.enabled`` -^^^^^^^^^^^^^^^^^^^^^^^^ - -This flag enables TLS connections to MongoDB servers. - -This property is optional; the default is ``false``. - -``mongodb.tls.keystore-path`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The path to the :doc:`PEM ` or -:doc:`JKS ` key store. - -This property is optional. - -``mongodb.tls.truststore-path`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The path to :doc:`PEM ` or -:doc:`JKS ` trust store. - -This property is optional. - -``mongodb.tls.keystore-password`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The key password for the key store specified by ``mongodb.tls.keystore-path``. - -This property is optional. - -``mongodb.tls.truststore-password`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The key password for the trust store specified by ``mongodb.tls.truststore-path``. 
- -This property is optional. - -``mongodb.read-preference`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The read preference to use for queries, map-reduce, aggregation, and count. -The available values are ``PRIMARY``, ``PRIMARY_PREFERRED``, ``SECONDARY``, ``SECONDARY_PREFERRED`` and ``NEAREST``. - -This property is optional; the default is ``PRIMARY``. - -``mongodb.write-concern`` -^^^^^^^^^^^^^^^^^^^^^^^^^ - -The write concern to use. The available values are -``ACKNOWLEDGED``, ``JOURNALED``, ``MAJORITY`` and ``UNACKNOWLEDGED``. - -This property is optional; the default is ``ACKNOWLEDGED``. - -``mongodb.required-replica-set`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The required replica set name. With this option set, the MongoClient instance performs the following actions:: - -#. Connect in replica set mode, and discover all members of the set based on the given servers -#. Make sure that the set name reported by all members matches the required set name. -#. Refuse to service any requests, if authenticated user is not part of a replica set with the required name. - -This property is optional; no default value. - -``mongodb.cursor-batch-size`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Limits the number of elements returned in one batch. A cursor typically fetches a batch of result objects and stores them locally. -If batchSize is 0, Driver's default are used. -If batchSize is positive, it represents the size of each batch of objects retrieved. It can be adjusted to optimize performance and limit data transfer. -If batchSize is negative, it limits the number of objects returned, that fit within the max batch size limit (usually 4MB), and the cursor is closed. For example if batchSize is -10, then the server returns a maximum of 10 documents, and as many as can fit in 4MB, then closes the cursor. - -.. note:: Do not use a batch size of ``1``. - -This property is optional; the default is ``0``. - -.. _table-definition-label: - -Table definition ----------------- - -MongoDB maintains table definitions on the special collection where ``mongodb.schema-collection`` configuration value specifies. - -.. note:: - - There's no way for the plugin to detect a collection is deleted. - You need to delete the entry by ``db.getCollection("_schema").remove( { table: deleted_table_name })`` in the Mongo Shell. - Or drop a collection by running ``DROP TABLE table_name`` using Trino. - -A schema collection consists of a MongoDB document for a table. - -.. code-block:: text - - { - "table": ..., - "fields": [ - { "name" : ..., - "type" : "varchar|bigint|boolean|double|date|array(bigint)|...", - "hidden" : false }, - ... - ] - } - } - -The connector quotes the fields for a row type when auto-generating the schema. -However, if the schema is being fixed manually in the collection then -the fields need to be explicitly quoted. ``row("UpperCase" varchar)`` - -=============== ========= ============== ============================= -Field Required Type Description -=============== ========= ============== ============================= -``table`` required string Trino table name -``fields`` required array A list of field definitions. Each field definition creates a new column in the Trino table. -=============== ========= ============== ============================= - -Each field definition: - -.. code-block:: text - - { - "name": ..., - "type": ..., - "hidden": ... 
- } - -=============== ========= ========= ============================= -Field Required Type Description -=============== ========= ========= ============================= -``name`` required string Name of the column in the Trino table. -``type`` required string Trino type of the column. -``hidden`` optional boolean Hides the column from ``DESCRIBE
`` and ``SELECT *``. Defaults to ``false``. -=============== ========= ========= ============================= - -There is no limit on field descriptions for either key or message. - -ObjectId --------- - -MongoDB collection has the special field ``_id``. The connector tries to follow the same rules for this special field, so there will be hidden field ``_id``. - -.. code-block:: sql - - CREATE TABLE IF NOT EXISTS orders ( - orderkey BIGINT, - orderstatus VARCHAR, - totalprice DOUBLE, - orderdate DATE - ); - - INSERT INTO orders VALUES(1, 'bad', 50.0, current_date); - INSERT INTO orders VALUES(2, 'good', 100.0, current_date); - SELECT _id, * FROM orders; - -.. code-block:: text - - _id | orderkey | orderstatus | totalprice | orderdate - -------------------------------------+----------+-------------+------------+------------ - 55 b1 51 63 38 64 d6 43 8c 61 a9 ce | 1 | bad | 50.0 | 2015-07-23 - 55 b1 51 67 38 64 d6 43 8c 61 a9 cf | 2 | good | 100.0 | 2015-07-23 - (2 rows) - -.. code-block:: sql - - SELECT _id, * FROM orders WHERE _id = ObjectId('55b151633864d6438c61a9ce'); - -.. code-block:: text - - _id | orderkey | orderstatus | totalprice | orderdate - -------------------------------------+----------+-------------+------------+------------ - 55 b1 51 63 38 64 d6 43 8c 61 a9 ce | 1 | bad | 50.0 | 2015-07-23 - (1 row) - -You can render the ``_id`` field to readable values with a cast to ``VARCHAR``: - -.. code-block:: sql - - SELECT CAST(_id AS VARCHAR), * FROM orders WHERE _id = ObjectId('55b151633864d6438c61a9ce'); - -.. code-block:: text - - _id | orderkey | orderstatus | totalprice | orderdate - ---------------------------+----------+-------------+------------+------------ - 55b151633864d6438c61a9ce | 1 | bad | 50.0 | 2015-07-23 - (1 row) - -ObjectId timestamp functions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The first four bytes of each `ObjectId `_ represent -an embedded timestamp of its creation time. Trino provides a couple of functions to take advantage of this MongoDB feature. - -.. function:: objectid_timestamp(ObjectId) -> timestamp - - Extracts the TIMESTAMP WITH TIME ZONE from a given ObjectId:: - - SELECT objectid_timestamp(ObjectId('507f191e810c19729de860ea')); - -- 2012-10-17 20:46:22.000 UTC - -.. function:: timestamp_objectid(timestamp) -> ObjectId - - Creates an ObjectId from a TIMESTAMP WITH TIME ZONE:: - - SELECT timestamp_objectid(TIMESTAMP '2021-08-07 17:51:36 +00:00'); - -- 61 0e c8 28 00 00 00 00 00 00 00 00 - -In MongoDB, you can filter all the documents created after ``2021-08-07 17:51:36`` -with a query like this: - -.. code-block:: text - - db.collection.find({"_id": {"$gt": ObjectId("610ec8280000000000000000")}}) - -In Trino, the same can be achieved with this query: - -.. code-block:: sql - - SELECT * - FROM collection - WHERE _id > timestamp_objectid(TIMESTAMP '2021-08-07 17:51:36 +00:00'); - -.. _mongodb-type-mapping: - -Type mapping ------------- - -Because Trino and MongoDB each support types that the other does not, this -connector :ref:`modifies some types ` when reading or -writing data. Data types may not map the same way in both directions between -Trino and the data source. Refer to the following sections for type mapping in -each direction. - -MongoDB to Trino type mapping -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The connector maps MongoDB types to the corresponding Trino types following -this table: - -.. 
list-table:: MongoDB to Trino type mapping - :widths: 30, 20, 50 - :header-rows: 1 - - * - MongoDB type - - Trino type - - Notes - * - ``Boolean`` - - ``BOOLEAN`` - - - * - ``Int32`` - - ``BIGINT`` - - - * - ``Int64`` - - ``BIGINT`` - - - * - ``Double`` - - ``DOUBLE`` - - - * - ``Decimal128`` - - ``DECIMAL(p, s)`` - - - * - ``Date`` - - ``TIMESTAMP(3)`` - - - * - ``String`` - - ``VARCHAR`` - - - * - ``Binary`` - - ``VARBINARY`` - - - * - ``ObjectId`` - - ``ObjectId`` - - - * - ``Object`` - - ``ROW`` - - - * - ``Array`` - - ``ARRAY`` - - Map to ``ROW`` if the element type is not unique. - * - ``DBRef`` - - ``ROW`` - - - -No other types are supported. - -Trino to MongoDB type mapping -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The connector maps Trino types to the corresponding MongoDB types following -this table: - -.. list-table:: Trino to MongoDB type mapping - :widths: 30, 20 - :header-rows: 1 - - * - Trino type - - MongoDB type - * - ``BOOLEAN`` - - ``Boolean`` - * - ``BIGINT`` - - ``Int64`` - * - ``DOUBLE`` - - ``Double`` - * - ``DECIMAL(p, s)`` - - ``Decimal128`` - * - ``TIMESTAMP(3)`` - - ``Date`` - * - ``VARCHAR`` - - ``String`` - * - ``VARBINARY`` - - ``Binary`` - * - ``ObjectId`` - - ``ObjectId`` - * - ``ROW`` - - ``Object`` - * - ``ARRAY`` - - ``Array`` - -No other types are supported. - -.. _mongodb-sql-support: - -SQL support ------------ - -The connector provides read and write access to data and metadata in -MongoDB. In addition to the :ref:`globally available -` and :ref:`read operation ` -statements, the connector supports the following features: - -* :doc:`/sql/insert` -* :doc:`/sql/delete` -* :doc:`/sql/create-table` -* :doc:`/sql/create-table-as` -* :doc:`/sql/drop-table` -* :doc:`/sql/alter-table` -* :doc:`/sql/create-schema` -* :doc:`/sql/drop-schema` -* :doc:`/sql/comment` - -ALTER TABLE -^^^^^^^^^^^ - -The connector supports ``ALTER TABLE RENAME TO``, ``ALTER TABLE ADD COLUMN`` -and ``ALTER TABLE DROP COLUMN`` operations. -Other uses of ``ALTER TABLE`` are not supported. - -.. _mongodb-fte-support: - -Fault-tolerant execution support --------------------------------- - -The connector supports :doc:`/admin/fault-tolerant-execution` of query -processing. Read and write operations are both supported with any retry policy. - -Table functions ---------------- - -The connector provides specific :doc:`table functions ` to -access MongoDB. - -.. _mongodb-query-function: - -``query(database, collection, filter) -> table`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ``query`` function allows you to query the underlying MongoDB directly. It -requires syntax native to MongoDB, because the full query is pushed down and -processed by MongoDB. This can be useful for accessing native features which are -not available in Trino or for improving query performance in situations where -running a query natively may be faster. - -For example, get all rows where ``regionkey`` field is 0:: - - SELECT - * - FROM - TABLE( - example.system.query( - database => 'tpch', - collection => 'region', - filter => '{ regionkey: 0 }' - ) - ); diff --git a/docs/src/main/sphinx/connector/prometheus.md b/docs/src/main/sphinx/connector/prometheus.md new file mode 100644 index 000000000000..357c68236eb1 --- /dev/null +++ b/docs/src/main/sphinx/connector/prometheus.md @@ -0,0 +1,132 @@ +# Prometheus connector + +```{raw} html + +``` + +The Prometheus connector allows reading +[Prometheus](https://prometheus.io/) +metrics as tables in Trino. 
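+
+For example, assuming a catalog named `example` that uses this connector, as in
+the configuration shown below, each Prometheus metric such as `up` is exposed as
+a table:
+
+```sql
+SELECT * FROM example.default.up;
+```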
+
+The mechanism for querying Prometheus is to use the Prometheus HTTP API. Specifically, all queries are resolved to Prometheus Instant queries
+with a form like: http://localhost:9090/api/v1/query?query=up[21d]&time=1568229904.000.
+In this case the `up` metric is taken from the Trino query table name, `21d` is the duration of the query. The Prometheus `time` value
+corresponds to the `TIMESTAMP` field. Trino queries are translated from their use of the `TIMESTAMP` field to a duration and time value
+as needed. Trino splits are generated by dividing the query range into chunks of roughly equal size.
+
+## Requirements
+
+To query Prometheus, you need:
+
+- Network access from the Trino coordinator and workers to the Prometheus
+  server. The default port is 9090.
+- Prometheus version 2.15.1 or later.
+
+## Configuration
+
+Create `etc/catalog/example.properties` to mount the Prometheus connector as
+the `example` catalog, replacing the properties as appropriate:
+
+```text
+connector.name=prometheus
+prometheus.uri=http://localhost:9090
+prometheus.query.chunk.size.duration=1d
+prometheus.max.query.range.duration=21d
+prometheus.cache.ttl=30s
+prometheus.bearer.token.file=/path/to/bearer/token/file
+prometheus.read-timeout=10s
+```
+
+## Configuration properties
+
+The following configuration properties are available:
+
+| Property name                               | Description                                                                                   |
+| ------------------------------------------- | --------------------------------------------------------------------------------------------- |
+| `prometheus.uri`                            | Where to find Prometheus coordinator host                                                     |
+| `prometheus.query.chunk.size.duration`      | The duration of each query to Prometheus                                                      |
+| `prometheus.max.query.range.duration`       | Width of overall query to Prometheus, will be divided into query-chunk-size-duration queries  |
+| `prometheus.cache.ttl`                      | How long values from this config file are cached                                              |
+| `prometheus.auth.user`                      | Username for basic authentication                                                             |
+| `prometheus.auth.password`                  | Password for basic authentication                                                             |
+| `prometheus.bearer.token.file`              | File holding bearer token if needed for access to Prometheus                                  |
+| `prometheus.read-timeout`                   | How much time a query to Prometheus has before timing out                                     |
+| `prometheus.case-insensitive-name-matching` | Match Prometheus metric names case insensitively. Defaults to `false`                         |
+
+## Not exhausting your available Trino heap
+
+The `prometheus.query.chunk.size.duration` and `prometheus.max.query.range.duration` properties protect Trino from
+too much data coming back from Prometheus. The `prometheus.max.query.range.duration` is the item of
+particular interest.
+
+On a Prometheus instance that has been running for a while and depending
+on data retention settings, `21d` might be far too much. Perhaps `1h` might be a more reasonable setting.
+In the case of `1h` it might then be useful to set `prometheus.query.chunk.size.duration` to `10m`, dividing the
+query window into 6 queries, each of which can be handled in a Trino split.
+
+Primarily, query issuers can limit the amount of data returned by Prometheus by taking
+advantage of `WHERE` clause limits on `TIMESTAMP`, setting an upper bound and lower bound that define
+a relatively small window. For example:
+
+```sql
+SELECT * FROM example.default.up WHERE TIMESTAMP > (NOW() - INTERVAL '10' second);
+```
+
+If the query does not include a `WHERE` clause limit, these config
+settings are meant to protect against an unlimited query.
+
+## Bearer token authentication
+
+Prometheus can be set up to require an Authorization header with every query.
The value in +`prometheus.bearer.token.file` allows for a bearer token to be read from the configured file. This file +is optional and not required unless your Prometheus setup requires it. + +(prometheus-type-mapping)= + +## Type mapping + +Because Trino and Prometheus each support types that the other does not, this +connector {ref}`modifies some types ` when reading data. + +The connector returns fixed columns that have a defined mapping to Trino types +according to the following table: + +```{eval-rst} +.. list-table:: Prometheus column to Trino type mapping + :widths: 50, 50 + :header-rows: 1 + + * - Prometheus column + - Trino type + * - ``labels`` + - ``MAP(VARCHAR,VARCHAR)`` + * - ``TIMESTAMP`` + - ``TIMESTAMP(3) WITH TIMEZONE`` + * - ``value`` + - ``DOUBLE`` +``` + +No other types are supported. + +The following example query result shows how the Prometheus `up` metric is +represented in Trino: + +```sql +SELECT * FROM example.default.up; +``` + +```text + labels | timestamp | value +--------------------------------------------------------+--------------------------------+------- +{instance=localhost:9090, job=prometheus, __name__=up} | 2022-09-01 06:18:54.481 +09:00 | 1.0 +{instance=localhost:9090, job=prometheus, __name__=up} | 2022-09-01 06:19:09.446 +09:00 | 1.0 +(2 rows) +``` + +(prometheus-sql-support)= + +## SQL support + +The connector provides {ref}`globally available ` and +{ref}`read operation ` statements to access data and +metadata in Prometheus. diff --git a/docs/src/main/sphinx/connector/prometheus.rst b/docs/src/main/sphinx/connector/prometheus.rst deleted file mode 100644 index e7850372f796..000000000000 --- a/docs/src/main/sphinx/connector/prometheus.rst +++ /dev/null @@ -1,142 +0,0 @@ -==================== -Prometheus connector -==================== - -.. raw:: html - - - -The Prometheus connector allows reading -`Prometheus `_ -metrics as tables in Trino. - -The mechanism for querying Prometheus is to use the Prometheus HTTP API. Specifically, all queries are resolved to Prometheus Instant queries -with a form like: http://localhost:9090/api/v1/query?query=up[21d]&time=1568229904.000. -In this case the ``up`` metric is taken from the Trino query table name, ``21d`` is the duration of the query. The Prometheus ``time`` value -corresponds to the ``TIMESTAMP`` field. Trino queries are translated from their use of the ``TIMESTAMP`` field to a duration and time value -as needed. Trino splits are generated by dividing the query range into attempted equal chunks. - -Requirements ------------- - -To query Prometheus, you need: - -* Network access from the Trino coordinator and workers to the Prometheus - server. The default port is 9090. -* Prometheus version 2.15.1 or later. - -Configuration -------------- - -Create ``etc/catalog/example.properties`` to mount the Prometheus connector as -the ``example`` catalog, replacing the properties as appropriate: - -.. 
code-block:: text - - connector.name=prometheus - prometheus.uri=http://localhost:9090 - prometheus.query.chunk.size.duration=1d - prometheus.max.query.range.duration=21d - prometheus.cache.ttl=30s - prometheus.bearer.token.file=/path/to/bearer/token/file - prometheus.read-timeout=10s - -Configuration properties ------------------------- - -The following configuration properties are available: - -============================================= ============================================================================================ -Property name Description -============================================= ============================================================================================ -``prometheus.uri`` Where to find Prometheus coordinator host -``prometheus.query.chunk.size.duration`` The duration of each query to Prometheus -``prometheus.max.query.range.duration`` Width of overall query to Prometheus, will be divided into query-chunk-size-duration queries -``prometheus.cache.ttl`` How long values from this config file are cached -``prometheus.auth.user`` Username for basic authentication -``prometheus.auth.password`` Password for basic authentication -``prometheus.bearer.token.file`` File holding bearer token if needed for access to Prometheus -``prometheus.read-timeout`` How much time a query to Prometheus has before timing out -``prometheus.case-insensitive-name-matching`` Match Prometheus metric names case insensitively. Defaults to ``false`` -============================================= ============================================================================================ - -Not exhausting your Trino available heap ------------------------------------------ - -The ``prometheus.query.chunk.size.duration`` and ``prometheus.max.query.range.duration`` are values to protect Trino from -too much data coming back from Prometheus. The ``prometheus.max.query.range.duration`` is the item of -particular interest. - -On a Prometheus instance that has been running for awhile and depending -on data retention settings, ``21d`` might be far too much. Perhaps ``1h`` might be a more reasonable setting. -In the case of ``1h`` it might be then useful to set ``prometheus.query.chunk.size.duration`` to ``10m``, dividing the -query window into 6 queries each of which can be handled in a Trino split. - -Primarily query issuers can limit the amount of data returned by Prometheus by taking -advantage of ``WHERE`` clause limits on ``TIMESTAMP``, setting an upper bound and lower bound that define -a relatively small window. For example: - -.. code-block:: sql - - SELECT * FROM example.default.up WHERE TIMESTAMP > (NOW() - INTERVAL '10' second); - -If the query does not include a WHERE clause limit, these config -settings are meant to protect against an unlimited query. - - -Bearer token authentication ---------------------------- - -Prometheus can be setup to require a Authorization header with every query. The value in -``prometheus.bearer.token.file`` allows for a bearer token to be read from the configured file. This file -is optional and not required unless your Prometheus setup requires it. - -.. _prometheus-type-mapping: - -Type mapping ------------- - -Because Trino and Prometheus each support types that the other does not, this -connector :ref:`modifies some types ` when reading data. - -The connector returns fixed columns that have a defined mapping to Trino types -according to the following table: - -.. 
list-table:: Prometheus column to Trino type mapping - :widths: 50, 50 - :header-rows: 1 - - * - Prometheus column - - Trino type - * - ``labels`` - - ``MAP(VARCHAR,VARCHAR)`` - * - ``TIMESTAMP`` - - ``TIMESTAMP(3) WITH TIMEZONE`` - * - ``value`` - - ``DOUBLE`` - -No other types are supported. - -The following example query result shows how the Prometheus ``up`` metric is -represented in Trino: - -.. code-block:: sql - - SELECT * FROM example.default.up; - -.. code-block:: text - - labels | timestamp | value - --------------------------------------------------------+--------------------------------+------- - {instance=localhost:9090, job=prometheus, __name__=up} | 2022-09-01 06:18:54.481 +09:00 | 1.0 - {instance=localhost:9090, job=prometheus, __name__=up} | 2022-09-01 06:19:09.446 +09:00 | 1.0 - (2 rows) - -.. _prometheus-sql-support: - -SQL support ------------ - -The connector provides :ref:`globally available ` and -:ref:`read operation ` statements to access data and -metadata in Prometheus. diff --git a/docs/src/main/sphinx/connector/system.rst b/docs/src/main/sphinx/connector/system.md similarity index 66% rename from docs/src/main/sphinx/connector/system.rst rename to docs/src/main/sphinx/connector/system.md index edf0c4493154..a4f034bbe515 100644 --- a/docs/src/main/sphinx/connector/system.rst +++ b/docs/src/main/sphinx/connector/system.md @@ -1,63 +1,63 @@ -================ -System connector -================ +# System connector The System connector provides information and metrics about the currently running Trino cluster. It makes this available via normal SQL queries. -Configuration -------------- +## Configuration The System connector doesn't need to be configured: it is automatically -available via a catalog named ``system``. +available via a catalog named `system`. -Using the System connector --------------------------- +## Using the System connector -List the available system schemas:: +List the available system schemas: - SHOW SCHEMAS FROM system; +``` +SHOW SCHEMAS FROM system; +``` -List the tables in one of the schemas:: +List the tables in one of the schemas: - SHOW TABLES FROM system.runtime; +``` +SHOW TABLES FROM system.runtime; +``` -Query one of the tables:: +Query one of the tables: - SELECT * FROM system.runtime.nodes; +``` +SELECT * FROM system.runtime.nodes; +``` -Kill a running query:: +Kill a running query: - CALL system.runtime.kill_query(query_id => '20151207_215727_00146_tx3nr', message => 'Using too many resources'); +``` +CALL system.runtime.kill_query(query_id => '20151207_215727_00146_tx3nr', message => 'Using too many resources'); +``` -System connector tables ------------------------ +## System connector tables -``metadata.catalogs`` -^^^^^^^^^^^^^^^^^^^^^ +### `metadata.catalogs` The catalogs table contains the list of available catalogs. -``metadata.schema_properties`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### `metadata.schema_properties` The schema properties table contains the list of available properties that can be set when creating a new schema. -``metadata.table_properties`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### `metadata.table_properties` The table properties table contains the list of available properties that can be set when creating a new table. -.. 
_system-metadata-materialized-views: +(system-metadata-materialized-views)= -``metadata.materialized_views`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### `metadata.materialized_views` The materialized views table contains the following information about all -:ref:`materialized views `: +{ref}`materialized views `: +```{eval-rst} .. list-table:: Metadata for materialized views :widths: 30, 70 :header-rows: 1 @@ -89,39 +89,35 @@ The materialized views table contains the following information about all - User supplied text about the materialized view. * - ``definition`` - SQL query that defines the data provided by the materialized view. +``` -``metadata.materialized_view_properties`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### `metadata.materialized_view_properties` The materialized view properties table contains the list of available properties that can be set when creating a new materialized view. -``metadata.table_comments`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### `metadata.table_comments` The table comments table contains the list of table comment. -``runtime.nodes`` -^^^^^^^^^^^^^^^^^ +### `runtime.nodes` The nodes table contains the list of visible nodes in the Trino cluster along with their status. -.. _optimizer-rule-stats: +(optimizer-rule-stats)= -``runtime.optimizer_rule_stats`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### `runtime.optimizer_rule_stats` -The ``optimizer_rule_stats`` table contains the statistics for optimizer +The `optimizer_rule_stats` table contains the statistics for optimizer rule invocations during the query planning phase. The statistics are aggregated over all queries since the server start-up. The table contains information about invocation frequency, failure rates and performance for optimizer rules. For example, you can look at the multiplication of columns -``invocations`` and ``average_time`` to get an idea about which rules +`invocations` and `average_time` to get an idea about which rules generally impact query planning times the most. -``runtime.queries`` -^^^^^^^^^^^^^^^^^^^ +### `runtime.queries` The queries table contains information about currently and recently running queries on the Trino cluster. From this table you can find out @@ -129,41 +125,38 @@ the original query SQL text, the identity of the user who ran the query, and performance information about the query, including how long the query was queued and analyzed. -``runtime.tasks`` -^^^^^^^^^^^^^^^^^ +### `runtime.tasks` The tasks table contains information about the tasks involved in a Trino query, including where they were executed, and how many rows and bytes each task processed. -``runtime.transactions`` -^^^^^^^^^^^^^^^^^^^^^^^^ +### `runtime.transactions` The transactions table contains the list of currently open transactions and related metadata. This includes information such as the create time, idle time, initialization parameters, and accessed catalogs. -System connector procedures ---------------------------- +## System connector procedures +```{eval-rst} .. function:: runtime.kill_query(query_id, message) Kill the query identified by ``query_id``. The query failure message includes the specified ``message``. ``message`` is optional. +``` -.. _system-type-mapping: +(system-type-mapping)= -Type mapping ------------- +## Type mapping Trino supports all data types used within the System schemas so no mapping is required. -.. 
_system-sql-support: +(system-sql-support)= -SQL support ------------ +## SQL support -The connector provides :ref:`globally available ` and -:ref:`read operation ` statements to access Trino system +The connector provides {ref}`globally available ` and +{ref}`read operation ` statements to access Trino system data and metadata. diff --git a/docs/src/main/sphinx/connector/thrift.md b/docs/src/main/sphinx/connector/thrift.md new file mode 100644 index 000000000000..db9a5dbd58da --- /dev/null +++ b/docs/src/main/sphinx/connector/thrift.md @@ -0,0 +1,108 @@ +# Thrift connector + +The Thrift connector makes it possible to integrate with external storage systems +without a custom Trino connector implementation by using +[Apache Thrift](https://thrift.apache.org/) on these servers. It is therefore +generic and can provide access to any backend, as long as it exposes the expected +API by using Thrift. + +In order to use the Thrift connector with an external system, you need to implement +the `TrinoThriftService` interface, found below. Next, you configure the Thrift connector +to point to a set of machines, called Thrift servers, that implement the interface. +As part of the interface implementation, the Thrift servers provide metadata, +splits and data. The connector randomly chooses a server to talk to from the available +instances for metadata calls, or for data calls unless the splits include a list of addresses. +All requests are assumed to be idempotent and can be retried freely among any server. + +## Requirements + +To connect to your custom servers with the Thrift protocol, you need: + +- Network access from the Trino coordinator and workers to the Thrift servers. +- A {ref}`trino-thrift-service` for your system. + +## Configuration + +To configure the Thrift connector, create a catalog properties file +`etc/catalog/example.properties` with the following content, replacing the +properties as appropriate: + +```text +connector.name=trino_thrift +trino.thrift.client.addresses=host:port,host:port +``` + +### Multiple Thrift systems + +You can have as many catalogs as you need, so if you have additional +Thrift systems to connect to, simply add another properties file to `etc/catalog` +with a different name, making sure it ends in `.properties`. 
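+
+For example, a hypothetical second catalog file `etc/catalog/sales.properties`
+could point to a different set of Thrift servers; the file name and addresses
+here are placeholders only:
+
+```text
+connector.name=trino_thrift
+trino.thrift.client.addresses=192.0.2.10:7777,192.0.2.11:7777
+```
+
+Trino then exposes this second Thrift system as a catalog named `sales`.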
+ +## Configuration properties + +The following configuration properties are available: + +| Property name | Description | +| ------------------------------------------ | -------------------------------------------------------- | +| `trino.thrift.client.addresses` | Location of Thrift servers | +| `trino-thrift.max-response-size` | Maximum size of data returned from Thrift server | +| `trino-thrift.metadata-refresh-threads` | Number of refresh threads for metadata cache | +| `trino.thrift.client.max-retries` | Maximum number of retries for failed Thrift requests | +| `trino.thrift.client.max-backoff-delay` | Maximum interval between retry attempts | +| `trino.thrift.client.min-backoff-delay` | Minimum interval between retry attempts | +| `trino.thrift.client.max-retry-time` | Maximum duration across all attempts of a Thrift request | +| `trino.thrift.client.backoff-scale-factor` | Scale factor for exponential back off | +| `trino.thrift.client.connect-timeout` | Connect timeout | +| `trino.thrift.client.request-timeout` | Request timeout | +| `trino.thrift.client.socks-proxy` | SOCKS proxy address | +| `trino.thrift.client.max-frame-size` | Maximum size of a raw Thrift response | +| `trino.thrift.client.transport` | Thrift transport type (`UNFRAMED`, `FRAMED`, `HEADER`) | +| `trino.thrift.client.protocol` | Thrift protocol type (`BINARY`, `COMPACT`, `FB_COMPACT`) | + +### `trino.thrift.client.addresses` + +Comma-separated list of thrift servers in the form of `host:port`. For example: + +```text +trino.thrift.client.addresses=192.0.2.3:7777,192.0.2.4:7779 +``` + +This property is required; there is no default. + +### `trino-thrift.max-response-size` + +Maximum size of a data response that the connector accepts. This value is sent +by the connector to the Thrift server when requesting data, allowing it to size +the response appropriately. + +This property is optional; the default is `16MB`. + +### `trino-thrift.metadata-refresh-threads` + +Number of refresh threads for metadata cache. + +This property is optional; the default is `1`. + +(trino-thrift-service)= + +## TrinoThriftService implementation + +The following IDL describes the `TrinoThriftService` that must be implemented: + +```{literalinclude} /include/TrinoThriftService.thrift +:language: thrift +``` + +(thrift-type-mapping)= + +## Type mapping + +The Thrift service defines data type support and mappings to Trino data types. + +(thrift-sql-support)= + +## SQL support + +The connector provides {ref}`globally available ` and +{ref}`read operation ` statements to access data and +metadata in your Thrift service. diff --git a/docs/src/main/sphinx/connector/thrift.rst b/docs/src/main/sphinx/connector/thrift.rst deleted file mode 100644 index cb86822b2328..000000000000 --- a/docs/src/main/sphinx/connector/thrift.rst +++ /dev/null @@ -1,121 +0,0 @@ -================ -Thrift connector -================ - -The Thrift connector makes it possible to integrate with external storage systems -without a custom Trino connector implementation by using -`Apache Thrift `_ on these servers. It is therefore -generic and can provide access to any backend, as long as it exposes the expected -API by using Thrift. - -In order to use the Thrift connector with an external system, you need to implement -the ``TrinoThriftService`` interface, found below. Next, you configure the Thrift connector -to point to a set of machines, called Thrift servers, that implement the interface. 
-As part of the interface implementation, the Thrift servers provide metadata, -splits and data. The connector randomly chooses a server to talk to from the available -instances for metadata calls, or for data calls unless the splits include a list of addresses. -All requests are assumed to be idempotent and can be retried freely among any server. - -Requirements ------------- - -To connect to your custom servers with the Thrift protocol, you need: - -* Network access from the Trino coordinator and workers to the Thrift servers. -* A :ref:`trino-thrift-service` for your system. - -Configuration -------------- - -To configure the Thrift connector, create a catalog properties file -``etc/catalog/example.properties`` with the following content, replacing the -properties as appropriate: - -.. code-block:: text - - connector.name=trino_thrift - trino.thrift.client.addresses=host:port,host:port - -Multiple Thrift systems -^^^^^^^^^^^^^^^^^^^^^^^ - -You can have as many catalogs as you need, so if you have additional -Thrift systems to connect to, simply add another properties file to ``etc/catalog`` -with a different name, making sure it ends in ``.properties``. - -Configuration properties ------------------------- - -The following configuration properties are available: - -============================================= ============================================================== -Property name Description -============================================= ============================================================== -``trino.thrift.client.addresses`` Location of Thrift servers -``trino-thrift.max-response-size`` Maximum size of data returned from Thrift server -``trino-thrift.metadata-refresh-threads`` Number of refresh threads for metadata cache -``trino.thrift.client.max-retries`` Maximum number of retries for failed Thrift requests -``trino.thrift.client.max-backoff-delay`` Maximum interval between retry attempts -``trino.thrift.client.min-backoff-delay`` Minimum interval between retry attempts -``trino.thrift.client.max-retry-time`` Maximum duration across all attempts of a Thrift request -``trino.thrift.client.backoff-scale-factor`` Scale factor for exponential back off -``trino.thrift.client.connect-timeout`` Connect timeout -``trino.thrift.client.request-timeout`` Request timeout -``trino.thrift.client.socks-proxy`` SOCKS proxy address -``trino.thrift.client.max-frame-size`` Maximum size of a raw Thrift response -``trino.thrift.client.transport`` Thrift transport type (``UNFRAMED``, ``FRAMED``, ``HEADER``) -``trino.thrift.client.protocol`` Thrift protocol type (``BINARY``, ``COMPACT``, ``FB_COMPACT``) -============================================= ============================================================== - -``trino.thrift.client.addresses`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Comma-separated list of thrift servers in the form of ``host:port``. For example: - -.. code-block:: text - - trino.thrift.client.addresses=192.0.2.3:7777,192.0.2.4:7779 - -This property is required; there is no default. - -``trino-thrift.max-response-size`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Maximum size of a data response that the connector accepts. This value is sent -by the connector to the Thrift server when requesting data, allowing it to size -the response appropriately. - -This property is optional; the default is ``16MB``. - -``trino-thrift.metadata-refresh-threads`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Number of refresh threads for metadata cache. 
- -This property is optional; the default is ``1``. - -.. _trino-thrift-service: - -TrinoThriftService implementation ---------------------------------- - -The following IDL describes the ``TrinoThriftService`` that must be implemented: - -.. literalinclude:: /include/TrinoThriftService.thrift - :language: thrift - -.. _thrift-type-mapping: - -Type mapping ------------- - -The Thrift service defines data type support and mappings to Trino data types. - -.. _thrift-sql-support: - -SQL support ------------ - -The connector provides :ref:`globally available ` and -:ref:`read operation ` statements to access data and -metadata in your Thrift service. diff --git a/docs/src/main/sphinx/connector/tpcds.md b/docs/src/main/sphinx/connector/tpcds.md new file mode 100644 index 000000000000..ac1db6560ecf --- /dev/null +++ b/docs/src/main/sphinx/connector/tpcds.md @@ -0,0 +1,72 @@ +# TPCDS connector + +The TPCDS connector provides a set of schemas to support the +[TPC Benchmark™ DS (TPC-DS)](http://www.tpc.org/tpcds/). TPC-DS is a database +benchmark used to measure the performance of complex decision support databases. + +This connector can be used to test the capabilities and query +syntax of Trino without configuring access to an external data +source. When you query a TPCDS schema, the connector generates the +data on the fly using a deterministic algorithm. + +## Configuration + +To configure the TPCDS connector, create a catalog properties file +`etc/catalog/example.properties` with the following contents: + +```text +connector.name=tpcds +``` + +## TPCDS schemas + +The TPCDS connector supplies several schemas: + +``` +SHOW SCHEMAS FROM example; +``` + +```text + Schema +-------------------- + information_schema + sf1 + sf10 + sf100 + sf1000 + sf10000 + sf100000 + sf300 + sf3000 + sf30000 + tiny +(11 rows) +``` + +Ignore the standard schema `information_schema`, which exists in every +catalog, and is not directly provided by the TPCDS connector. + +Every TPCDS schema provides the same set of tables. Some tables are +identical in all schemas. The *scale factor* of the tables in a particular +schema is determined from the schema name. For example, the schema +`sf1` corresponds to scale factor `1` and the schema `sf300` +corresponds to scale factor `300`. Every unit in the scale factor +corresponds to a gigabyte of data. For example, for scale factor `300`, +a total of `300` gigabytes are generated. The `tiny` schema is an +alias for scale factor `0.01`, which is a very small data set useful for +testing. + +(tpcds-type-mapping)= + +## Type mapping + +Trino supports all data types used within the TPCDS schemas so no mapping is +required. + +(tpcds-sql-support)= + +## SQL support + +The connector provides {ref}`globally available ` and +{ref}`read operation ` statements to access data and +metadata in the TPC-DS dataset. diff --git a/docs/src/main/sphinx/connector/tpcds.rst b/docs/src/main/sphinx/connector/tpcds.rst deleted file mode 100644 index 155d4b2427af..000000000000 --- a/docs/src/main/sphinx/connector/tpcds.rst +++ /dev/null @@ -1,76 +0,0 @@ -=============== -TPCDS connector -=============== - -The TPCDS connector provides a set of schemas to support the -`TPC Benchmark™ DS (TPC-DS) `_. TPC-DS is a database -benchmark used to measure the performance of complex decision support databases. - -This connector can be used to test the capabilities and query -syntax of Trino without configuring access to an external data -source. 
When you query a TPCDS schema, the connector generates the -data on the fly using a deterministic algorithm. - -Configuration -------------- - -To configure the TPCDS connector, create a catalog properties file -``etc/catalog/example.properties`` with the following contents: - -.. code-block:: text - - connector.name=tpcds - -TPCDS schemas -------------- - -The TPCDS connector supplies several schemas:: - - SHOW SCHEMAS FROM example; - -.. code-block:: text - - Schema - -------------------- - information_schema - sf1 - sf10 - sf100 - sf1000 - sf10000 - sf100000 - sf300 - sf3000 - sf30000 - tiny - (11 rows) - -Ignore the standard schema ``information_schema``, which exists in every -catalog, and is not directly provided by the TPCDS connector. - -Every TPCDS schema provides the same set of tables. Some tables are -identical in all schemas. The *scale factor* of the tables in a particular -schema is determined from the schema name. For example, the schema -``sf1`` corresponds to scale factor ``1`` and the schema ``sf300`` -corresponds to scale factor ``300``. Every unit in the scale factor -corresponds to a gigabyte of data. For example, for scale factor ``300``, -a total of ``300`` gigabytes are generated. The ``tiny`` schema is an -alias for scale factor ``0.01``, which is a very small data set useful for -testing. - -.. _tpcds-type-mapping: - -Type mapping ------------- - -Trino supports all data types used within the TPCDS schemas so no mapping is -required. - -.. _tpcds-sql-support: - -SQL support ------------ - -The connector provides :ref:`globally available ` and -:ref:`read operation ` statements to access data and -metadata in the TPC-DS dataset. diff --git a/docs/src/main/sphinx/connector/tpch.md b/docs/src/main/sphinx/connector/tpch.md new file mode 100644 index 000000000000..2382c256347a --- /dev/null +++ b/docs/src/main/sphinx/connector/tpch.md @@ -0,0 +1,70 @@ +# TPCH connector + +The TPCH connector provides a set of schemas to support the +[TPC Benchmark™ H (TPC-H)](http://www.tpc.org/tpch/). TPC-H is a database +benchmark used to measure the performance of highly-complex decision support databases. + +This connector can be used to test the capabilities and query +syntax of Trino without configuring access to an external data +source. When you query a TPCH schema, the connector generates the +data on the fly using a deterministic algorithm. + +## Configuration + +To configure the TPCH connector, create a catalog properties file +`etc/catalog/example.properties` with the following contents: + +```text +connector.name=tpch +``` + +## TPCH schemas + +The TPCH connector supplies several schemas: + +``` +SHOW SCHEMAS FROM example; +``` + +```text + Schema +-------------------- + information_schema + sf1 + sf100 + sf1000 + sf10000 + sf100000 + sf300 + sf3000 + sf30000 + tiny +(11 rows) +``` + +Ignore the standard schema `information_schema`, which exists in every +catalog, and is not directly provided by the TPCH connector. + +Every TPCH schema provides the same set of tables. Some tables are +identical in all schemas. Other tables vary based on the *scale factor*, +which is determined based on the schema name. For example, the schema +`sf1` corresponds to scale factor `1` and the schema `sf300` +corresponds to scale factor `300`. The TPCH connector provides an +infinite number of schemas for any scale factor, not just the few common +ones listed by `SHOW SCHEMAS`. The `tiny` schema is an alias for scale +factor `0.01`, which is a very small data set useful for testing. 
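+
+For example, the following query runs entirely against data generated on the
+fly; the catalog name `example` matches the catalog file shown above, and
+`orders` is one of the standard TPC-H tables:
+
+```sql
+SELECT count(*) FROM example.tiny.orders;
+```
+
+Switching the schema from `tiny` to `sf1` or higher runs the same query at a
+larger scale factor.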
+ +(tpch-type-mapping)= + +## Type mapping + +Trino supports all data types used within the TPCH schemas so no mapping +is required. + +(tpch-sql-support)= + +## SQL support + +The connector provides {ref}`globally available ` and +{ref}`read operation ` statements to access data and +metadata in the TPC-H dataset. diff --git a/docs/src/main/sphinx/connector/tpch.rst b/docs/src/main/sphinx/connector/tpch.rst deleted file mode 100644 index 7bec8f9926ff..000000000000 --- a/docs/src/main/sphinx/connector/tpch.rst +++ /dev/null @@ -1,74 +0,0 @@ -============== -TPCH connector -============== - -The TPCH connector provides a set of schemas to support the -`TPC Benchmark™ H (TPC-H) `_. TPC-H is a database -benchmark used to measure the performance of highly-complex decision support databases. - -This connector can be used to test the capabilities and query -syntax of Trino without configuring access to an external data -source. When you query a TPCH schema, the connector generates the -data on the fly using a deterministic algorithm. - -Configuration -------------- - -To configure the TPCH connector, create a catalog properties file -``etc/catalog/example.properties`` with the following contents: - -.. code-block:: text - - connector.name=tpch - -TPCH schemas ------------- - -The TPCH connector supplies several schemas:: - - SHOW SCHEMAS FROM example; - -.. code-block:: text - - Schema - -------------------- - information_schema - sf1 - sf100 - sf1000 - sf10000 - sf100000 - sf300 - sf3000 - sf30000 - tiny - (11 rows) - -Ignore the standard schema ``information_schema``, which exists in every -catalog, and is not directly provided by the TPCH connector. - -Every TPCH schema provides the same set of tables. Some tables are -identical in all schemas. Other tables vary based on the *scale factor*, -which is determined based on the schema name. For example, the schema -``sf1`` corresponds to scale factor ``1`` and the schema ``sf300`` -corresponds to scale factor ``300``. The TPCH connector provides an -infinite number of schemas for any scale factor, not just the few common -ones listed by ``SHOW SCHEMAS``. The ``tiny`` schema is an alias for scale -factor ``0.01``, which is a very small data set useful for testing. - -.. _tpch-type-mapping: - -Type mapping ------------- - -Trino supports all data types used within the TPCH schemas so no mapping -is required. - -.. _tpch-sql-support: - -SQL support ------------ - -The connector provides :ref:`globally available ` and -:ref:`read operation ` statements to access data and -metadata in the TPC-H dataset.