From 5a970ecdedd65bbd36ca7c87916bd54199b79e5f Mon Sep 17 00:00:00 2001 From: Bobby Iliev Date: Fri, 17 Mar 2023 18:46:56 +0200 Subject: [PATCH] Add docker compose example demo (#83) --- README.md | 6 +- examples/README.md | 10 +++ examples/docker-compose/README.md | 88 +++++++++++++++++++ examples/docker-compose/docker-compose.yaml | 60 +++++++++++++ examples/docker-compose/schemas/schema.json | 9 ++ .../{ecommerce.md => ecommerce/README.md} | 12 +-- examples/{ => ecommerce}/blog.json | 0 examples/{ => ecommerce}/ecommerce.json | 0 8 files changed, 176 insertions(+), 9 deletions(-) create mode 100644 examples/README.md create mode 100644 examples/docker-compose/README.md create mode 100644 examples/docker-compose/docker-compose.yaml create mode 100644 examples/docker-compose/schemas/schema.json rename examples/{ecommerce.md => ecommerce/README.md} (97%) rename examples/{ => ecommerce}/blog.json (100%) rename examples/{ => ecommerce}/ecommerce.json (100%) diff --git a/README.md b/README.md index 24cb118..35a455e 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The benefits of using this datagen tool are: - You can specify what values are generated using the expansive [FakerJS API](https://fakerjs.dev/api/) to craft data that more faithfully imitates your use case. This allows you to more easily apply business logic downstream. - This is a relatively simple CLI tool compared to other Kafka data generators that require Kafka Connect. - When using the `avro` output format, datagen connects to Schema Registry. This allows you to take advantage of the [benefits](https://www.confluent.io/blog/schema-registry-kafka-stream-processing-yes-virginia-you-really-need-one/) of using Schema Registry. -- Often when you generate random data, your downstream join results won't make sense because it's unlikely a randomly generated field in one dataset will match a randomly generated field in another. With this datagen tool, you can specify relationships between your datasets so that related columns will match up, resulting in meaningful joins downstream. Jump to the [end-to-end ecommerce tutorial](./examples/ecommerce.md) for a full example. +- Often when you generate random data, your downstream join results won't make sense because it's unlikely a randomly generated field in one dataset will match a randomly generated field in another. With this datagen tool, you can specify relationships between your datasets so that related columns will match up, resulting in meaningful joins downstream. Jump to the [end-to-end ecommerce tutorial](./examples/ecommerce) for a full example. > :construction: Specifying relationships between datasets currently requires using JSON for the input schema. @@ -212,7 +212,7 @@ Here is the general syntax for a JSON input schema: ] ``` -Go to the [end-to-end ecommerce tutorial](./examples/ecommerce.md) to walk through an example that uses a JSON input schema with relational data. +Go to the [end-to-end ecommerce tutorial](./examples/ecommerce) to walk through an example that uses a JSON input schema with relational data. 
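For a concrete sense of the relational idea, here is a minimal sketch of a two-dataset JSON input schema. It follows the `_meta`/`iteration.index` shape used by `schemas/schema.json` later in this patch; the topic and field names are illustrative, `internet.email` is assumed to be a valid FakerJS method string, and the exact relationship syntax is documented in the ecommerce tutorial.

```json
[
  {
    "_meta": { "topic": "users", "key": "id" },
    "id": "iteration.index",
    "email": "internet.email"
  },
  {
    "_meta": { "topic": "purchases", "key": "id" },
    "id": "iteration.index",
    "user_id": "iteration.index"
  }
]
```

Because `users.id` and `purchases.user_id` both derive from `iteration.index`, the two topics share key values, so a downstream join on `user_id = id` yields matches rather than the empty results that purely random fields would produce.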
 ### SQL Schema
 
@@ -253,4 +253,4 @@ Here is an example Avro input schema from `tests/schema.avsc` that will produce
         { "name": "ownerIds", "type": { "type": "array", "items": "string" } }
     ]
 }
-```
\ No newline at end of file
+```
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..957ecda
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,10 @@
+## Datagen end-to-end tutorials
+
+This directory contains end-to-end tutorials for the `datagen` tool.
+
+| Tutorial | Description |
+| -------- | ----------- |
+| [ecommerce](ecommerce) | A tutorial that uses `datagen` to generate data for an ecommerce website. |
+| [docker-compose](docker-compose) | A `docker-compose` setup for `datagen`. |
+
+To request a new tutorial, please [open an issue](https://github.com/MaterializeInc/datagen/issues/new?assignees=&labels=feature%2C+enhancement&template=feature_request.md&title=Feature%3A+).
diff --git a/examples/docker-compose/README.md b/examples/docker-compose/README.md
new file mode 100644
index 0000000..7b6a44f
--- /dev/null
+++ b/examples/docker-compose/README.md
@@ -0,0 +1,88 @@
+# Datagen Docker Compose Demo
+
+In this demo, we will show you how to use `docker-compose` to run multiple `datagen` instances and produce 30GB of data to a Kafka cluster.
+
+## Overview
+
+The [`docker-compose.yaml`](docker-compose.yaml) file defines the following services:
+
+- `redpanda`: A single-node, Kafka API-compatible Redpanda instance.
+- 3 `datagen` instances that produce data to Redpanda simultaneously.
+
+### Datagen instances overview
+
+Each `datagen` instance produces 10GB of random data to Redpanda using an auto-incrementing key, thanks to the `iteration.index` identifier in the [`schemas/schema.json`](schemas/schema.json) file. This allows you to simulate an upsert source with a total of 30GB of data but only 10GB of unique data.
+
+Example of the `datagen` instance configuration:
+
+```yaml
+  datagen1:
+    image: materialize/datagen:latest
+    container_name: datagen1
+    depends_on:
+      - redpanda
+    environment:
+      KAFKA_BROKERS: redpanda:9092
+    volumes:
+      - ./schemas:/schemas
+    entrypoint:
+      datagen -s /schemas/schema.json -f json -n 10024 --record-size 1048576 -d
+```
+
+Rundown of the `datagen` instance configuration:
+
+- `image`: The `datagen` Docker image.
+- `container_name`: The name of the container, which must be unique for each instance.
+- `depends_on`: Makes the `datagen` instance wait for the `redpanda` service.
+- `environment`: The `KAFKA_BROKERS` environment variable points the instance at the Kafka/Redpanda brokers. If you are using a Kafka cluster with SASL authentication, you can also set the `SASL_USERNAME`, `SASL_PASSWORD`, and `SASL_MECHANISM` environment variables.
+- `volumes`: Mounts the local `schemas` directory, which contains the `schema.json` file, at `/schemas` in the container.
+- `entrypoint`: The `datagen` invocation: `-s` specifies the schema file, `-f` the output format, `-n` the number of records to generate, `--record-size` the size of each record in bytes, and `-d` enables debug logging.
+
+## Prerequisites
+
+- [Docker](https://docs.docker.com/get-docker/)
+- [Docker Compose](https://docs.docker.com/compose/install/)
+
+## Running the demo
+
+1. Clone the `datagen` repository:
+
+    ```bash
+    git clone https://github.com/MaterializeInc/datagen.git
+
+    cd datagen/examples/docker-compose
+    ```
+
+1. Start the demo:
+
+    ```bash
+    docker-compose up -d
+    ```
+
+    The demo will take a few minutes to start up. You should see output like the following:
+
+    ```bash
+    Creating network "docker-compose_default" with the default driver
+    Creating docker-compose_redpanda_1 ... done
+    Creating datagen1 ... done
+    Creating datagen2 ... done
+    Creating datagen3 ... done
+    ```
+
+1. Verify that the demo is running:
+
+    ```bash
+    docker-compose ps -a
+    ```
+
+1. Stop the demo:
+
+    ```bash
+    docker-compose down -v
+    ```
+
+## Useful links
+
+- [Materialize documentation](https://materialize.com/docs/)
+- [Materialize community Slack](https://materialize.com/s/chat)
+- [Materialize Blog](https://materialize.com/blog/)
diff --git a/examples/docker-compose/docker-compose.yaml b/examples/docker-compose/docker-compose.yaml
new file mode 100644
index 0000000..76a64ef
--- /dev/null
+++ b/examples/docker-compose/docker-compose.yaml
@@ -0,0 +1,60 @@
+---
+version: "3.9"
+
+services:
+
+  redpanda:
+    image: docker.vectorized.io/vectorized/redpanda:v21.11.2
+    command:
+      - redpanda start
+      - --overprovisioned
+      - --smp 1
+      - --memory 1G
+      - --reserve-memory 0M
+      - --node-id 0
+      - --check=false
+      - --kafka-addr 0.0.0.0:9092
+      - --advertise-kafka-addr ${EXTERNAL_IP:-redpanda}:9092
+      - --pandaproxy-addr 0.0.0.0:8082
+      - --advertise-pandaproxy-addr ${EXTERNAL_IP:-redpanda}:8082
+      - --set redpanda.enable_transactions=true
+      - --set redpanda.enable_idempotence=true
+    ports:
+      - 9092:9092
+      - 8081:8081
+      - 8082:8082
+    healthcheck: {test: curl -f localhost:9644/v1/status/ready, interval: 1s, start_period: 30s}
+
+  datagen1:
+    image: materialize/datagen:latest
+    container_name: datagen1
+    depends_on:
+      - redpanda
+    environment:
+      KAFKA_BROKERS: redpanda:9092
+    volumes:
+      - ./schemas:/schemas
+    entrypoint:
+      datagen -s /schemas/schema.json -f json -n 10024 --record-size 1048576 -d
+  datagen2:
+    image: materialize/datagen:latest
+    container_name: datagen2
+    depends_on:
+      - redpanda
+    environment:
+      KAFKA_BROKERS: redpanda:9092
+    volumes:
+      - ./schemas:/schemas
+    entrypoint:
+      datagen -s /schemas/schema.json -f json -n 10024 --record-size 1048576 -d
+  datagen3:
+    image: materialize/datagen:latest
+    container_name: datagen3
+    depends_on:
+      - redpanda
+    environment:
+      KAFKA_BROKERS: redpanda:9092
+    volumes:
+      - ./schemas:/schemas
+    entrypoint:
+      datagen -s /schemas/schema.json -f json -n 10024 --record-size 1048576 -d
diff --git a/examples/docker-compose/schemas/schema.json b/examples/docker-compose/schemas/schema.json
new file mode 100644
index 0000000..cbf9a5e
--- /dev/null
+++ b/examples/docker-compose/schemas/schema.json
@@ -0,0 +1,9 @@
+[
+    {
+        "_meta": {
+            "topic": "large_topic",
+            "key": "id"
+        },
+        "id": "iteration.index"
+    }
+]
diff --git a/examples/ecommerce.md b/examples/ecommerce/README.md
similarity index 97%
rename from examples/ecommerce.md
rename to examples/ecommerce/README.md
index d787a87..6f52a44 100644
--- a/examples/ecommerce.md
+++ b/examples/ecommerce/README.md
@@ -89,12 +89,12 @@ This tutorial will use a Confluent Cloud Basic Kafka Cluster and Schema Registry
 
 ### Datagen
 
-1. [Install datagen](../README.md#installation) if you haven't already.
-1. Create a `.env` file with your Kafka and Schema Registry credentials (see [.env.example](../.env.example)).
+1. [Install datagen](../../README.md#installation) if you haven't already.
+1. Create a `.env` file with your Kafka and Schema Registry credentials (see [.env.example](../../.env.example)).
 1. Generate a single iteration of records with dry run and debug modes and check the output.
     ```bash
     datagen \
-        --schema examples/ecommerce.json \
+        --schema ecommerce.json \
         --format avro \
         --number 1 \
         --dry-run \
@@ -103,7 +103,7 @@
 1. Start producing data to Kafka while you set up Materialize.
     ```bash
     datagen \
-        -s examples/ecommerce.json \
+        -s ecommerce.json \
         -f avro \
         -n -1 \
         --wait 500
@@ -297,11 +297,11 @@ Materialize specializes in efficient, incremental view maintenance over changing
 1. Run `datagen` again with the `--clean` option to destroy topics and schema subjects.
     ```bash
     datagen \
-        -s examples/ecommerce.json \
+        -s ecommerce.json \
         -f avro \
         --clean
     ```
 
 ## Learn More
 
-Check out the Materialize [docs](www.materialize.com/docs) and [blog](www.materialize.com/blog) for more!
\ No newline at end of file
+Check out the Materialize [docs](https://materialize.com/docs) and [blog](https://materialize.com/blog) for more!
diff --git a/examples/blog.json b/examples/ecommerce/blog.json
similarity index 100%
rename from examples/blog.json
rename to examples/ecommerce/blog.json
diff --git a/examples/ecommerce.json b/examples/ecommerce/ecommerce.json
similarity index 100%
rename from examples/ecommerce.json
rename to examples/ecommerce/ecommerce.json
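To exercise this change end to end, you can fetch the PR head and bring up the new compose demo. The following is a sketch, assuming a GitHub remote named `origin`; the local branch name `pr-83` is arbitrary:

```bash
# Fetch this PR's head (GitHub exposes pull requests at pull/<number>/head)
# and check it out under an arbitrary local branch name.
git fetch origin pull/83/head:pr-83
git checkout pr-83

# Bring up Redpanda and the three datagen producers added by this patch.
cd examples/docker-compose
docker-compose up -d

# Tail one producer to confirm records are flowing, then tear everything down.
docker-compose logs -f datagen1
docker-compose down -v
```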