Skip to content

Commit

Permalink
Apache DataSketches plugin for Trino
Browse files Browse the repository at this point in the history
  • Loading branch information
ShashwatArghode committed Sep 1, 2021
1 parent 46b37aa commit b772682
Show file tree
Hide file tree
Showing 15 changed files with 855 additions and 0 deletions.
6 changes: 6 additions & 0 deletions core/trino-server/src/main/provisio/presto.xml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@
</artifact>
</artifactSet>

<!-- Unpacks the trino-datasketches plugin zip into the server's plugin/datasketches directory. -->
<artifactSet to="plugin/datasketches">
<artifact id="${project.groupId}:trino-datasketches:zip:${project.version}">
<unpack />
</artifact>
</artifactSet>

<artifactSet to="plugin/example-http">
<artifact id="${project.groupId}:trino-example-http:zip:${project.version}">
<unpack />
Expand Down
1 change: 1 addition & 0 deletions docs/src/main/sphinx/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ and the :doc:`SQL statement and syntax reference</sql>`.
Conditional <functions/conditional>
Conversion <functions/conversion>
Date and time <functions/datetime>
DataSketches <functions/datasketches>
Decimal <functions/decimal>
Geospatial <functions/geospatial>
HyperLogLog <functions/hyperloglog>
Expand Down
39 changes: 39 additions & 0 deletions docs/src/main/sphinx/functions/datasketches.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
======================
DataSketches Functions
======================
DataSketches is a high-performance library of stochastic streaming
algorithms commonly called "sketches" in the data sciences. Sketches are
small, stateful programs that process massive data as a stream and can
provide approximate answers, with mathematical guarantees, to
computationally difficult queries orders-of-magnitude faster than
traditional, exact methods.
The DataSketches functions allow querying the fast and memory-efficient `Apache
DataSketches <https://datasketches.apache.org/docs/Community/Research.html>`_
from Trino. Support for the `Theta Sketch Framework <https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html>`_
is added, specifically the :func:`theta_sketch_union` and :func:`theta_sketch_estimate` functions.
These functions are used in ``count distinct`` queries using sketches.
Sketches can be created in Hive or Pig using their respective sketch APIs.

DataSketches functions
----------------------

.. function:: theta_sketch_union(sketches) -> sketch

Returns a single sketch which is a merged collection of sketches.

.. function:: theta_sketch_estimate(sketch) -> double

Returns the estimated number of distinct values represented by the sketch.

Example in Trino for using DataSketches
---------------------------------------
Query::

sql
SELECT
o_orderdate as date,
theta_sketch_estimate(theta_sketch_union(o_custkey_sketch)) AS unique_user_count,
SUM(o_totalprice) AS user_spent
FROM tpch.sf100000.orders WHERE o_orderdate >= date_add('day', -90, current_date)
GROUP BY o_orderdate;

8 changes: 8 additions & 0 deletions docs/src/main/sphinx/functions/list-by-topic.rst
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ For more details, see :doc:`conversion`
* :func:`try_cast`
* :func:`typeof`

DataSketches
------------

For more details, see :doc:`datasketches`

* :func:`theta_sketch_estimate`
* :func:`theta_sketch_union`

Date and time
-------------

Expand Down
2 changes: 2 additions & 0 deletions docs/src/main/sphinx/functions/list.rst
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,8 @@ T
- :func:`tan`
- :func:`tanh`
- :func:`tdigest_agg`
- :func:`theta_sketch_estimate`
- :func:`theta_sketch_union`
- :func:`timestamp_objectid`
- :func:`timezone_hour`
- :func:`timezone_minute`
Expand Down
101 changes: 101 additions & 0 deletions plugin/trino-datasketches/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
<?xml version="1.0"?>
<!-- Maven build descriptor for the trino-datasketches plugin module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- Inherits from the root POM two directories up. -->
<parent>
<groupId>io.trino</groupId>
<artifactId>trino-root</artifactId>
<version>362-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>trino-datasketches</artifactId>
<name>trino-datasketches</name>
<!-- Packaged as a Trino plugin rather than a plain jar. -->
<packaging>trino-plugin</packaging>
<url>http://datasketches.apache.org/</url>

<properties>
<air.main.basedir>${project.parent.basedir}</air.main.basedir>
</properties>

<!-- Dependencies without an explicit <version> are presumably versioned by the parent POM; confirm there. -->
<dependencies>
<!-- Default-scope dependencies -->
<dependency>
<groupId>io.trino</groupId>
<artifactId>trino-array</artifactId>
</dependency>

<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>

<!-- Apache DataSketches libraries; versions are pinned in this module. -->
<dependency>
<groupId>org.apache.datasketches</groupId>
<artifactId>datasketches-java</artifactId>
<version>2.0.0</version>
</dependency>

<dependency>
<groupId>org.apache.datasketches</groupId>
<artifactId>datasketches-memory</artifactId>
<version>1.3.0</version>
</dependency>

<!-- Trino SPI -->
<!-- The following three are "provided" scope: needed to compile, not bundled by this module. -->
<dependency>
<groupId>io.trino</groupId>
<artifactId>trino-spi</artifactId>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>io.airlift</groupId>
<artifactId>slice</artifactId>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>org.openjdk.jol</groupId>
<artifactId>jol-core</artifactId>
<scope>provided</scope>
</dependency>

<!-- Test Dependencies -->
<dependency>
<groupId>io.trino</groupId>
<artifactId>trino-hive</artifactId>
<scope>test</scope>
</dependency>

<!-- Same artifact as above, but its test-jar (distinct from the main jar). -->
<dependency>
<groupId>io.trino</groupId>
<artifactId>trino-hive</artifactId>
<type>test-jar</type>
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.trino</groupId>
<artifactId>trino-main</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.trino</groupId>
<artifactId>trino-testing</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.trino.hadoop</groupId>
<artifactId>hadoop-apache</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.datasketches.state;

import io.airlift.slice.Slice;
import io.trino.spi.function.AccumulatorState;
import io.trino.spi.function.AccumulatorStateMetadata;

/**
 * Accumulator state used while aggregating sketches.
 *
 * <p>The state exposes three properties, each with a getter and a setter: the
 * sketch itself as a {@link Slice}, a nominal-entries value, and a seed. State
 * instances are created by {@link SketchStateFactory} and serialized by
 * {@link SketchStateSerializer}, as declared in the annotation below.
 */
@AccumulatorStateMetadata(stateFactoryClass = SketchStateFactory.class, stateSerializerClass = SketchStateSerializer.class)
public interface SketchState
        extends AccumulatorState
{
    // --- sketch payload ---

    /**
     * @return the sketch currently held by this state
     */
    Slice getSketch();

    /**
     * @param value the sketch to store in this state
     */
    void setSketch(Slice value);

    // --- sketch parameters ---

    /**
     * @return the nominal-entries value held by this state
     */
    int getNominalEntries();

    /**
     * @param value the nominal-entries value to store
     */
    void setNominalEntries(int value);

    /**
     * @return the seed held by this state
     */
    long getSeed();

    /**
     * @param value the seed to store
     */
    void setSeed(long value);

    // --- combination ---

    /**
     * Merges another state with this one. How the two are combined is up to
     * the implementation, which is not visible from this interface.
     *
     * @param state the other state to merge
     */
    void merge(SketchState state);
}
Loading

0 comments on commit b772682

Please sign in to comment.