Skip to content

Latest commit

 

History

History
81 lines (71 loc) · 2.48 KB

README.md

File metadata and controls

81 lines (71 loc) · 2.48 KB

AlgoSEDD

Publish Dep

cd Dep
sbt clean publishLocal

Build assembly

sbt assembly

Run

Local

Setup

Spark Version

Use any Spark version compatible with the Scala version used in the project.

Spark History Server
# Create the directory for Spark event logs (default location: file:/tmp/spark-events)
mkdir /tmp/spark-events

# Start the Spark History Server
bash $SPARK_HOME/sbin/start-history-server.sh

Execution

bash scripts/generic/submit.sh \
  target/scala-2.12/AlgoSEDD-assembly-0.1-SNAPSHOT.jar \
  local[2] \
  2010-01-01 \
  2022-01-01 \
  '8 weeks' \
  /mnt/datastore/data/StackExchangeDataDump/2021-06-07/3dprinting.meta.stackexchange.com \
  target/output/

GCP Dataproc [TODO: UPDATE]

Setup

Spark Version

Use any Spark version compatible with the Scala version used in the project.

Execution

gcloud dataproc jobs submit spark \
    --cluster=cluster-5cff \
    --class=pl.epsilondeltalimit.analyzer.StackExchangeDataDumpAnalyzerSingle \
    --jars=gs://stack-exchange-data-dump-analyzer-single/StackExchangeDataDumpAnalyzerSingle-0.1-SNAPSHOT-jar-with-dependencies.jar \
    --region=europe-west3 \
    --driver-log-levels root=DEBUG \
    -- 2010-01-01 2021-01-01 '13 weeks' gs://stack-exchange-data-dump/scifi.stackexchange.com/Badges.xml gs://stack-exchange-data-dump/scifi.stackexchange.com/Comments.xml gs://stack-exchange-data-dump/scifi.stackexchange.com/PostHistory.xml gs://stack-exchange-data-dump/scifi.stackexchange.com/PostLinks.xml gs://stack-exchange-data-dump/scifi.stackexchange.com/Posts.xml gs://stack-exchange-data-dump/scifi.stackexchange.com/Tags.xml gs://stack-exchange-data-dump/scifi.stackexchange.com/Users.xml gs://stack-exchange-data-dump/scifi.stackexchange.com/Votes.xml gs://stack-exchange-data-dump-analyzer-single/output/scifi.stackexchange.com/13weeks

Plot

Local

# relative popularity
#bash scripts/plot/generic/plot_relative_popularity_tag.sh \
#  <csv result file> \
#  <tag name> \
#  <aggregation interval> \
#  <y axis max> \
#  <optional extra argument>
#e.g.
#bash scripts/relative_popularity_plot_tag.sh \
#  output/tag\=print-quality/part-00000-9b6e8399-3e48-4a97-a355-4b239b975515.c000.csv \
#  print-quality \
#  8weeks \
#  1.0 \
#  0.1

# entries count
bash scripts/plot/generic/plot_entries_count_tag.sh \
  target/output/scifi.stackexchange.com/tag\=star-wars/part-00000-28150a61-065b-4e24-8435-3b71b4911bcf.c000.csv \
  star-wars \
  8weeks \
  100000