[apache_spark][driver] Add Apache Spark package with Driver data stream #2945

Merged: 10 commits, Apr 22, 2022
10 changes: 9 additions & 1 deletion packages/apache_spark/_dev/build/docs/README.md
@@ -1,4 +1,4 @@
# Apache Spark
# Apache Spark Integration

The Apache Spark integration collects and parses data using the Jolokia Metricbeat Module.

@@ -70,3 +70,11 @@ This is the `nodes` data stream.
{{event "nodes"}}

{{fields "nodes"}}

### Driver

This is the `driver` data stream.

{{event "driver"}}

{{fields "driver"}}
15 changes: 0 additions & 15 deletions packages/apache_spark/_dev/deploy/docker/Dockerfile

This file was deleted.

50 changes: 50 additions & 0 deletions packages/apache_spark/_dev/deploy/docker/application/wordcount.py
@@ -0,0 +1,50 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
import time

from operator import add

from pyspark.sql import SparkSession

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: wordcount <file> <master>", file=sys.stderr)
        sys.exit(-1)

    spark = SparkSession\
        .builder\
        .master(sys.argv[2])\
        .appName("PythonWordCount")\
        .getOrCreate()

    t_end = time.time() + 60 * 15

    # Re-run the word count for 15 minutes so the driver JVM stays alive
    # long enough for its metrics to be scraped.
    while time.time() < t_end:
        lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
        counts = lines.flatMap(lambda x: x.split(' ')) \
            .map(lambda x: (x, 1)) \
            .reduceByKey(add)
        output = counts.collect()
        for (word, count) in output:
            print("%s: %i" % (word, count))

    spark.stop()
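
For context, the container entrypoint later submits this script in the background, passing the script's own source file as the word-count input (see docker-entrypoint.sh below):

/opt/bitnami/spark/bin/spark-submit wordcount.py wordcount.py "$SPARK_MAIN_URL" &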
26 changes: 22 additions & 4 deletions packages/apache_spark/_dev/deploy/docker/docker-compose.yml
@@ -1,9 +1,27 @@
version: '2'
version: '2.3'
services:
  apache_spark:
    hostname: apache-spark-main
    build:
      context: .
      dockerfile: Dockerfile
    image: docker.io/bitnami/spark@sha256:cb19b1bdebc0bc9dc20ea13f2109763be6a73b357b144a01efd94902540f6d27
    ports:
      - 7777
      - 7779
    environment:
      - SPARK_MAIN_URL=spark://apache-spark-main:7077
      - SPARK_WORKER_MEMORY=1024G
      - SPARK_WORKER_CORES=8
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
    volumes:
      - ./jolokia-agent:/usr/share/java/
      - ./application:/opt/bitnami/spark/examples/src/main/python/
      - ./jolokia-configs:/spark/conf/
      - ./docker-entrypoint/docker-entrypoint.sh:/opt/bitnami/scripts/spark/docker-entrypoint.sh:rw
    healthcheck:
      interval: 1s
      retries: 120
      timeout: 120s
      test: |-
        curl -f -s http://localhost:7777/jolokia/version -o /dev/null
    entrypoint: /opt/bitnami/scripts/spark/docker-entrypoint.sh /opt/bitnami/scripts/spark/run.sh
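
The healthcheck only probes the master agent on port 7777. To spot-check the worker and driver agents by hand (a manual step, not part of this compose file), the same version endpoint can be queried on each port from inside the container:

docker compose exec apache_spark curl -s http://localhost:7777/jolokia/version
docker compose exec apache_spark curl -s http://localhost:7778/jolokia/version
docker compose exec apache_spark curl -s http://localhost:7779/jolokia/version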
46 changes: 46 additions & 0 deletions packages/apache_spark/_dev/deploy/docker/docker-entrypoint/docker-entrypoint.sh
@@ -0,0 +1,46 @@
#!/bin/bash
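
# Attach the Jolokia agent to the master and worker JVMs, route Spark's
# metrics to JMX, and attach the agent to every driver JVM as well.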

echo 'export SPARK_MASTER_OPTS="$SPARK_MASTER_OPTS -javaagent:/usr/share/java/jolokia-agent.jar=config=/spark/conf/jolokia-master.properties"' >> "/opt/bitnami/spark/conf/spark-env.sh"
echo 'export SPARK_WORKER_OPTS="$SPARK_WORKER_OPTS -javaagent:/usr/share/java/jolokia-agent.jar=config=/spark/conf/jolokia-worker.properties"' >> "/opt/bitnami/spark/conf/spark-env.sh"

echo '*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink' >> "/opt/bitnami/spark/conf/metrics.properties"
echo '*.source.jvm.class=org.apache.spark.metrics.source.JvmSource' >> "/opt/bitnami/spark/conf/metrics.properties"

echo 'spark.driver.extraJavaOptions -javaagent:/usr/share/java/jolokia-agent.jar=config=/spark/conf/jolokia-driver.properties' >> "/opt/bitnami/spark/conf/spark-defaults.conf"

set -o errexit
set -o nounset
set -o pipefail
#set -o xtrace

# Load libraries
# shellcheck disable=SC1091
. /opt/bitnami/scripts/libbitnami.sh
# shellcheck disable=SC1091
. /opt/bitnami/scripts/libspark.sh

# Load Spark environment variables
eval "$(spark_env)"

print_welcome_page

if [ ! "$EUID" -eq 0 ] && [ -e "$LIBNSS_WRAPPER_PATH" ]; then
    echo "spark:x:$(id -u):$(id -g):Spark:$SPARK_HOME:/bin/false" > "$NSS_WRAPPER_PASSWD"
    echo "spark:x:$(id -g):" > "$NSS_WRAPPER_GROUP"
    echo "LD_PRELOAD=$LIBNSS_WRAPPER_PATH" >> "$SPARK_CONFDIR/spark-env.sh"
fi

if [[ "$1" = "/opt/bitnami/scripts/spark/run.sh" ]]; then
info "** Starting Spark setup **"
/opt/bitnami/scripts/spark/setup.sh
info "** Spark setup finished! **"
fi

eval "$(spark_env)"
cd /opt/bitnami/spark/sbin
./start-worker.sh $SPARK_MAIN_URL --cores $SPARK_WORKER_CORES --memory $SPARK_WORKER_MEMORY &
cd /opt/bitnami/spark/examples/src/main/python/
/opt/bitnami/spark/bin/spark-submit wordcount.py wordcount.py $SPARK_MAIN_URL &

echo ""
exec "$@"
Binary file not shown.
@@ -1,2 +1,4 @@
[Spark-Master]
stats: http://127.0.0.1:7777/jolokia/read
[Spark-Master]
stats: http://127.0.0.1:7777/jolokia/read
[Spark-Worker]
stats: http://127.0.0.1:7778/jolokia/read
12 changes: 12 additions & 0 deletions packages/apache_spark/_dev/deploy/docker/jolokia-configs/jolokia-driver.properties
@@ -0,0 +1,12 @@
host=0.0.0.0
port=7779
agentContext=/jolokia
backlog=100

policyLocation=file:///spark/conf/jolokia.policy
historyMaxEntries=10
debug=false
debugMaxEntries=100
maxDepth=15
maxCollectionSize=1000
maxObjects=0
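
This is the configuration that the spark.driver.extraJavaOptions line in docker-entrypoint.sh points the agent at, so each driver JVM starts with:

-javaagent:/usr/share/java/jolokia-agent.jar=config=/spark/conf/jolokia-driver.properties

and serves its MBeans at http://localhost:7779/jolokia/.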
packages/apache_spark/_dev/deploy/docker/jolokia-configs/jolokia-master.properties
@@ -1,12 +1,12 @@
host=0.0.0.0
port=7777
agentContext=/jolokia
backlog=100
policyLocation=file:///spark/conf/jolokia.policy
historyMaxEntries=10
debug=false
debugMaxEntries=100
maxDepth=15
maxCollectionSize=1000
maxObjects=0
host=0.0.0.0
port=7777
agentContext=/jolokia
backlog=100

policyLocation=file:///spark/conf/jolokia.policy
historyMaxEntries=10
debug=false
debugMaxEntries=100
maxDepth=15
maxCollectionSize=1000
maxObjects=0
12 changes: 12 additions & 0 deletions packages/apache_spark/_dev/deploy/docker/jolokia-configs/jolokia-worker.properties
@@ -0,0 +1,12 @@
host=0.0.0.0
port=7778
agentContext=/jolokia
backlog=100

policyLocation=file:///spark/conf/jolokia.policy
historyMaxEntries=10
debug=false
debugMaxEntries=100
maxDepth=15
maxCollectionSize=1000
maxObjects=0
packages/apache_spark/_dev/deploy/docker/jolokia-configs/jolokia.policy
@@ -1,13 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<restrict>
  <http>
    <method>get</method>
    <method>post</method>
  </http>
  <commands>
    <command>read</command>
    <command>list</command>
    <command>search</command>
    <command>version</command>
  </commands>
</restrict>
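
This policy permits only the read, list, search, and version commands over GET and POST; the agent rejects everything else. For example, an exec request such as the following should be denied (shown for illustration only; the exact error payload is an assumption):

curl -s -X POST http://localhost:7777/jolokia/ -d '{"type":"exec","mbean":"java.lang:type=Memory","operation":"gc"}'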
3 changes: 3 additions & 0 deletions packages/apache_spark/changelog.yml
@@ -2,6 +2,9 @@

- version: "0.1.0"
  changes:
    - description: Implement "driver" data stream
      type: enhancement
      link: https://github.com/elastic/integrations/pull/2945
    - description: Implement "nodes" data stream
      type: enhancement
      link: https://github.com/elastic/integrations/pull/2939
@@ -0,0 +1,7 @@
vars: ~
data_stream:
  vars:
    hosts:
      - http://apache-spark-main:{{Ports.[1]}}
    path:
      - /jolokia/?ignoreErrors=true&canonicalNaming=false
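
Since the compose service exposes ports 7777 and 7779 in that order, {{Ports.[1]}} should resolve to the second one (7779, the driver agent), making the effective system-test URL roughly:

http://apache-spark-main:7779/jolokia/?ignoreErrors=true&canonicalNaming=false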