From 26f0cc091f9f110f8481788d56688698855f31e0 Mon Sep 17 00:00:00 2001 From: mhebrard Date: Wed, 11 Oct 2023 09:44:58 +0800 Subject: [PATCH 1/2] Fix README --- README.md | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 3225b66..3096ca2 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,9 @@ Deploy an [EMR cluster on AWS](https://aws.amazon.com/emr/), with Spark, [Hail]( * A github repository to store the zeppelin notebooks * A github account with write permission on the repository and a personal access token with full repo permissions. -In addition you may want to install / be able to run Ensembl's Variant Effect Predictor (VEP) -* A S3 bucket containining VEP cache data, see section [Install Ensembl's Variant Effect Predictor (VEP)](#Install-Ensembls-Variant-Effect-Predictor-VEP). +In addition you may want to install / be able to run Ensembl's Variant Effect Predictor (VEP) + +* A S3 bucket containining VEP cache data, see section [Install Ensembl's Variant Effect Predictor (VEP)](#install-ensembls-variant-effect-predictor-vep). 
## Create a Spark/Hail/Zeppelin EMR using AWS CloudFormation service @@ -76,7 +77,8 @@ The template used below create a cluster with cheaper instance (AWS Spot instanc MASTER=[EMRMasterDNS]; ssh -i [path/to/key].pem -L 8088:$MASTER:8088 -L 8890:$MASTER:8890 -L 18080:$MASTER:18080 hadoop@$MASTER ``` -### Accessing the EMR cluster via Zeppelin UI +### Accessing the EMR cluster via Zeppelin UI + * Visit [Zeppelin](http://localhost:8890) * Create a new note(book) * Import and initialize Hail and SparkContext @@ -101,9 +103,10 @@ output_notebook(notebook_type='zeppelin') ``` ### Commit changes to Zeppelin note(book) - * in Zeppelin menu, click on **Version control** - * Write a commit message and click on **Commit** - * Click on **Ok** + +* in Zeppelin menu, click on **Version control** +* Write a commit message and click on **Commit** +* Click on **Ok** * Save your work on github ```sh @@ -112,12 +115,13 @@ cd /opt/zeppelin git push origin master ``` -## Install Ensembl's Variant Effect Predictor (VEP) -First we need to download VEP cache and store it on AWS. -Be aware that the data represents ~25Gb. +## Install Ensembl's Variant Effect Predictor (VEP) + +First we need to download VEP cache and store it on AWS. +Be aware that the data represents ~25Gb. 
Set `DiskSizeGB` CloudFormation template parameter accordingly -### Connect to EMR master node (shell) +### Re-connect to EMR master node (shell) ```sh # Replace [EMRMasterDNS] below by the value displayed in stack Outputs @@ -159,10 +163,10 @@ docker run -v /mnt/vep/vep_data:/opt/vep/.vep -w /opt/vep/src/ensembl-vep $IMAGE aws s3 cp /mnt/vep/vep_data//homo_sapiens_merged/ s3://[Bucket]/Hail-on-AWS/vep_data/homo_sapiens_merged/ --recursive ``` -### CloudFormation template parameters +### CloudFormation template parameters (VEP) + Now we can create a cluster with VEP installed by default -### CloudFormation template parameters * DiskSizeGB: `50` * VEPInstall: `true` * VEPBucket: `s3://[Bucket]/Hail-on-AWS/vep_data/` @@ -185,6 +189,7 @@ ht_vep = hl.vep(ht_nostar, 's3://[Bucket]/Hail-on-AWS/vep_data/vep[VEPVersion]_[ # Write table ht.write('s3://[Path/to/table].vep.ht', overwrite=True) ``` + ## Export to Elasticsearch In Hail v0.2.60, the function `hl.export_elasticsearch` is not compatible with scala v2.12.x that is included in emr-6.x. Hail team is actively working on that issue, see [#9767](https://github.com/hail-is/hail/issues/9767) @@ -193,7 +198,8 @@ In the mean time we can deploy Hail on emr-5.x that includes scala v2.11.x where Note that emr-6.x and emr-5.x includes different version of zeppelin (v0.8 vs v0.9) with incompatibility. Therefore the notebooks created on one emr version will not appears on the other emr version that. 
-### CloudFormation template parameters +### CloudFormation template parameters (elasticsearch) + * EMRReleaseLabel: `emr-5.31.0` -## END \ No newline at end of file +## END From 08c4622644228f24f43e9349a01bc52cb8326fb6 Mon Sep 17 00:00:00 2001 From: mhebrard Date: Wed, 11 Oct 2023 17:23:49 +0800 Subject: [PATCH 2/2] Switch to JupyterLab --- README.md | 52 +++++++----------------- src/hail_emr_spot.yml | 41 +++++++++---------- src/install_hail.sh | 88 +++++++++-------------------------------- src/install_zeppelin.sh | 60 ---------------------------- src/step_jupyter.sh | 88 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 139 insertions(+), 190 deletions(-) delete mode 100644 src/install_zeppelin.sh create mode 100644 src/step_jupyter.sh diff --git a/README.md b/README.md index 3096ca2..0e88f78 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,20 @@ # Hail-on-AWS -Deploy an [EMR cluster on AWS](https://aws.amazon.com/emr/), with Spark, [Hail](https://hail.is/index.html), [Zeppelin](https://zeppelin.apache.org/) and [Ensembl VEP](https://ensembl.org/info/docs/tools/vep/index.html) using [CloudFormation service](https://aws.amazon.com/cloudformation/). +Deploy an [EMR cluster on AWS](https://aws.amazon.com/emr/), with Spark, [Hail](https://hail.is/index.html), [JupyterLab](https://jupyter.org/about.html) and [Ensembl VEP](https://ensembl.org/info/docs/tools/vep/index.html) using [CloudFormation service](https://aws.amazon.com/cloudformation/). ## Requirements * A valid AWS account with appropriate permissions * A VPC, a subnet and a security group ready to ensure appropriate access to the cluster * A S3 bucket to receive the data -* A github repository to store the zeppelin notebooks +* A github repository to store the notebooks * A github account with write permission on the repository and a personal access token with full repo permissions. 
In addition you may want to install / be able to run Ensembl's Variant Effect Predictor (VEP) * A S3 bucket containining VEP cache data, see section [Install Ensembl's Variant Effect Predictor (VEP)](#install-ensembls-variant-effect-predictor-vep). -## Create a Spark/Hail/Zeppelin EMR using AWS CloudFormation service +## Create a Spark/Hail/Jupyter EMR using AWS CloudFormation service * Clone this repository @@ -39,7 +39,7 @@ aws s3 sync src/ s3://[Bucket]/Hail-on-AWS/ The template used below create a cluster with cheaper instance (AWS Spot instances). Note that if user require 0 CPU, a minimal cluster is created with 1 MASTER of 4 CPUs and 1 CORE of 4 CPUs, both instances been charged on demand. Additional spot instances are created when `SpotCPUCount > 4` * Template URL: `https://s3.amazonaws.com/[Bucket]/Hail-on-AWS/hail_emr_spot.yml` -* Stack Name: `EMRCluster-hail-zep-vep` +* Stack Name: `EMRCluster-hail-lab-vep` * EMRClusterName `emr-cluster` * EMRReleaseLabel `emr-6.1.0` * EMRLogBucket `s3n://[Bucket]/EMR_logs/` @@ -63,7 +63,7 @@ The template used below create a cluster with cheaper instance (AWS Spot instanc * OwnerTag `owner` * ProjectTag `project` -## Accessing the AWS CloudFormation created Spark/Hail/Zeppelin EMR +## Accessing the AWS CloudFormation created Spark/Hail/Jupyter EMR ### Connect to EMR master node (shell) @@ -72,48 +72,26 @@ The template used below create a cluster with cheaper instance (AWS Spot instanc # Replace [path/to/key] below by the path to your EC2 Key .pem file # SSH on the master node (with tunnel) # * Hadoop :8088 - # * Zeppelin :8890 + # * Jupyter :9443 # * SparkUI :18080 - MASTER=[EMRMasterDNS]; ssh -i [path/to/key].pem -L 8088:$MASTER:8088 -L 8890:$MASTER:8890 -L 18080:$MASTER:18080 hadoop@$MASTER + MASTER=[EMRMasterDNS]; ssh -i [path/to/key].pem -L 8088:$MASTER:8088 -L 9443:$MASTER:9443 -L 18080:$MASTER:18080 hadoop@$MASTER ``` -### Accessing the EMR cluster via Zeppelin UI +### Accessing the EMR cluster via Jupyter Lab -* 
Visit [Zeppelin](http://localhost:8890) -* Create a new note(book) +* Visit [Jupyter](https://localhost:9443/user/jovyan/lab) +* Create a new notebook with pyspark kernel * Import and initialize Hail and SparkContext ```py -%pyspark # Import and initialize Hail import hail as hl hl.init(sc) ``` -* Import Bokehjs +### Commit changes to Jupyter Notebook -```py -%pyspark -# Import bokeh -from bokeh.io import show, output_notebook -from bokeh.plotting import figure -# Import bokeh-zeppelin -import bkzep -output_notebook(notebook_type='zeppelin') -``` - -### Commit changes to Zeppelin note(book) - -* in Zeppelin menu, click on **Version control** -* Write a commit message and click on **Commit** -* Click on **Ok** -* Save your work on github - -```sh -%sh -cd /opt/zeppelin -git push origin master -``` +TBD ## Install Ensembl's Variant Effect Predictor (VEP) @@ -128,9 +106,9 @@ Set `DiskSizeGB` CloudFormation template parameter accordingly # Replace [path/to/key] below by the path to your EC2 Key .pem file # SSH on the master node (with tunnel) # * Hadoop :8088 - # * Zeppelin :8890 + # * Jupyter :9443 # * SparkUI :18080 - MASTER=[EMRMasterDNS]; ssh -i [path/to/key].pem -L 8088:$MASTER:8088 -L 8890:$MASTER:8890 -L 18080:$MASTER:18080 hadoop@$MASTER + MASTER=[EMRMasterDNS]; ssh -i [path/to/key].pem -L 8088:$MASTER:8088 -L 9443:$MASTER:9443 -L 18080:$MASTER:18080 hadoop@$MASTER ``` ### Download VEP Docker image @@ -196,8 +174,6 @@ In Hail v0.2.60, the function `hl.export_elasticsearch` is not compatible with s In the mean time we can deploy Hail on emr-5.x that includes scala v2.11.x where `hl.export_elasticsearch` works. -Note that emr-6.x and emr-5.x includes different version of zeppelin (v0.8 vs v0.9) with incompatibility. Therefore the notebooks created on one emr version will not appears on the other emr version that. 
- ### CloudFormation template parameters (elasticsearch) * EMRReleaseLabel: `emr-5.31.0` diff --git a/src/hail_emr_spot.yml b/src/hail_emr_spot.yml index fbdb719..895e424 100644 --- a/src/hail_emr_spot.yml +++ b/src/hail_emr_spot.yml @@ -1,6 +1,6 @@ # Copyright 2019-2021 Maxime HEBRARD @ https://github.com/c-BIG # EMR v5.31.0 - Hadoop v2.10.0 - Java v1.8.0 - Python3 v3.7.9 - Spark v2.4.6 - Scala v2.11.12 - Zeppelin v0.8.2 - Hail v0.2.60 - VEP v95 -# EMR v6.1.0 - Hadoop v3.2.1 - Java v1.8.0 - Python3 v3.7.9 - Spark v3.0.0 - Scala v2.12.10 - Zeppelin v0.9.0 - Hail v0.2.59 - VEP v95 +# EMR v6.1.0 - Hadoop v3.2.1 - Java v1.8.0 - Python3 v3.7.16 - Spark v3.0.0 - Scala v2.12.10 - JupyterLab v2.0.1 - Hail v0.2.60 - VEP v95 AWSTemplateFormatVersion: '2010-09-09' Description: Provision an EMR cluster on spot instances with Python3, Spark, Hail, ensembl-VEP and Zeppelin Metadata: @@ -165,7 +165,6 @@ Resources: - Name: JupyterHub - Name: Livy - Name: Spark - - Name: Zeppelin BootstrapActions: - Name: install_hail ScriptBootstrapAction: @@ -177,16 +176,6 @@ Resources: - Ref: HailVersion - --emr-version - Ref: EMRReleaseLabel - - Name: install_zeppelin - ScriptBootstrapAction: - Path: !Sub "${CFNBucket}install_zeppelin.sh" - Args: - - --account - - Ref: GitHubAccount - - --repo - - Ref: GitHubRepository - - --token - - Ref: GitHubToken - Name: install_vep ScriptBootstrapAction: Path: !Sub "${CFNBucket}install_vep.sh" @@ -205,7 +194,8 @@ Resources: fs.s3.maxConnections: 1000 - Classification: livy-conf ConfigurationProperties: - livy.server.session.timeout-check": false, + livy.server.session.timeout-check: false + livy.server.session.timeout: 100h - Classification: spark ConfigurationProperties: maximizeResourceAllocation: true @@ -215,15 +205,7 @@ Resources: spark.driver.extraClassPath: 
/usr/local/lib/python3.7/site-packages/hail/backend/hail-all-spark.jar:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar spark.executor.extraClassPath: /usr/local/lib/python3.7/site-packages/hail/backend/hail-all-spark.jar:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar spark.serializer: org.apache.spark.serializer.KryoSerializer - spark.kryo.registrator: is.hail.kryo.HailKryoRegistrator - - Classification: zeppelin-env - Configurations: - - Classification: export - ConfigurationProperties: - ZEPPELIN_NOTEBOOK_DIR: /opt/zeppelin - ZEPPELIN_NOTEBOOK_STORAGE: org.apache.zeppelin.notebook.repo.GitHubNotebookRepo - ZEPPELIN_NOTEBOOK_GIT_REMOTE_URL: !Join ['', ['https://', !Ref GitHubAccount, ':', !Ref GitHubToken, '@github.com/', !Ref GitHubRepository, '.git']] - ZEPPELIN_NOTEBOOK_GIT_REMOTE_ACCESS_TOKEN: !Ref GitHubToken + spark.kryo.registrator: is.hail.kryo.HailKryoRegistrator Instances: AdditionalMasterSecurityGroups: - Ref: SecurityGroup @@ -319,6 +301,21 @@ Resources: LogUri: Ref: 
EMRLogBucket VisibleToAllUsers: True + Steps: + - Name: step_install_jupyter_git + ActionOnFailure: CONTINUE + HadoopJarStep: + Jar: !Sub "s3://${AWS::Region}.elasticmapreduce/libs/script-runner/script-runner.jar" + Args: + - !Sub "${CFNBucket}step_jupyter.sh" + - --branch + - !Ref OwnerTag + - --account + - !Ref GitHubAccount + - --repo + - !Ref GitHubRepository + - --token + - !Ref GitHubToken Tags: - Key: Name Value: diff --git a/src/install_hail.sh b/src/install_hail.sh index 3ae6583..32595fc 100644 --- a/src/install_hail.sh +++ b/src/install_hail.sh @@ -5,7 +5,7 @@ exec 3>&1 4>&2 trap 'exec 2>&4 1>&3' 0 1 2 3 exec 1>>/tmp/cloudcreation_log.out 2>&1 -echo '### INSTALL_HAIL.SH ###' +echo '### INSTALL_HAIL.SH v4.0.0 ###' # Default parameters OUTPUT_PATH="" @@ -59,78 +59,26 @@ echo '# Install libs #' sudo yum install -y lz4 lz4-devel sudo yum install -y git -echo '# Test if hail exists #' -echo " aws s3 ls ${OUTPUT_PATH}${EMR_VERSION}/site-packages/| grep hail-${HAIL_VERSION}.dist-info | wc -c" -wc=`aws s3 ls ${OUTPUT_PATH}${EMR_VERSION}/site-packages/ | grep hail-${HAIL_VERSION}.dist-info | wc -c` -echo "word count = ${wc}" +echo '# Clone Hail #' +sudo mkdir -p /opt/broad-hail +cd /opt/broad-hail +sudo git clone --branch $HAIL_VERSION --depth 1 https://github.com/broadinstitute/hail.git . 
- cd /opt/broad-hail/hail/ - - echo '# Build Hail #' - # Fix java - sudo ln -s /etc/alternatives/java_sdk/include /etc/alternatives/jre/include - - # Adjust scala version - if [ "${EMR_VERSION}" = "emr-5.31.0" ] - then - sudo make install-on-cluster HAIL_COMPILE_NATIVES=1 SCALA_VERSION=2.11.12 SPARK_VERSION=2.4.6 - elif [ "${EMR_VERSION}" = "emr-6.1.0" ] - then - sudo make install-on-cluster HAIL_COMPILE_NATIVES=1 SCALA_VERSION=2.12.10 SPARK_VERSION=3.0.0 - else - echo "EMR version ${EMR_VERSION} not supported !" - exit 0 - fi +echo '# Build Hail #' +# Fix java +sudo ln -s /etc/alternatives/java_sdk/include /etc/alternatives/jre/include - # Test if Hail already build by another node - wc=`aws s3 ls ${OUTPUT_PATH}${EMR_VERSION}/site-packages/ | grep hail-${HAIL_VERSION}.dist-info | wc -c` - if [ "${wc}" -eq 0 ] - then - echo '# Copy hail to S3' - aws s3 sync ${PYTHON_PACKAGES}site-packages/hail/ ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hail/ - aws s3 sync ${PYTHON_PACKAGES}site-packages/hailtop/ ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hailtop/ - aws s3 sync ${PYTHON_PACKAGES}site-packages/hail-${HAIL_VERSION}.dist-info/ ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hail-${HAIL_VERSION}.dist-info/ - fi +# Adjust scala version +if [ "${EMR_VERSION}" = "emr-5.31.0" ] +then + sudo make install-on-cluster HAIL_COMPILE_NATIVES=1 SCALA_VERSION=2.11.12 SPARK_VERSION=2.4.6 +elif [ "${EMR_VERSION}" = "emr-6.1.0" ] +then + sudo make install-on-cluster HAIL_COMPILE_NATIVES=1 SCALA_VERSION=2.12.10 SPARK_VERSION=3.0.0 else - echo '# Download hail #' - sudo aws s3 sync ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hail/ ${PYTHON_PACKAGES}site-packages/hail/ - sudo aws s3 sync ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hailtop/ ${PYTHON_PACKAGES}site-packages/hailtop/ - sudo aws s3 sync ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hail-${HAIL_VERSION}.dist-info/ ${PYTHON_PACKAGES}site-packages/hail-${HAIL_VERSION}.dist-info/ + echo "EMR version ${EMR_VERSION} not supported !" 
+ exit 0 fi -echo '# Install hail dependencies #' -WHEELS="aiohttp>=3.6,<3.7 -aiohttp-session<2.8,>=2.7 -asyncinit<0.3,>=0.2.4 -bkzep -bokeh>1.1,<1.3 -decorator<5 -Deprecated>=1.2.10,<1.3 -dill<0.4,>=0.3.1.1 -gcsfs==0.2.2 -google-cloud-storage==1.25.* -humanize==1.0.0 -hurry.filesize==0.9 -nest-asyncio -parsimonious<0.9 -pandas==0.25 -PyJWT -pyspark>=2.4,<2.4.2 -python-json-logger==0.1.11 -requests==2.22.0 -scipy==1.3 -tabulate==0.8.3 -tqdm==4.42.1" - -for WHEEL_NAME in $WHEELS -do - sudo python3 -m pip install $WHEEL_NAME -done - echo '### END INSTALL_HAIL.SH ###' diff --git a/src/install_zeppelin.sh b/src/install_zeppelin.sh deleted file mode 100644 index 2ba2b10..0000000 --- a/src/install_zeppelin.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Logs -exec 3>&1 4>&2 -trap 'exec 2>&4 1>&3' 0 1 2 3 -exec 1>>/tmp/cloudcreation_log.out 2>&1 - -echo '### INSTALL_ZEPPELIN.SH ###' - -# Default parameters -ACCOUNT="" -REPO="" -TOKEN="" - -# Read CLI script parameters -while [ $# -gt 0 ]; do - case "$1" in - --account) - shift - ACCOUNT=$1 - ;; - --repo) - shift - REPO=$1 - ;; - --token) - shift - TOKEN=$1 - ;; - -*) - error_msg "unrecognized option: $1" - ;; - *) - break; - ;; - esac - shift -done - -echo '# Parameters #' -echo "ACCOUNT: $ACCOUNT" -echo "REPO: $REPO" -echo "TOKEN: $TOKEN" - -echo '# Update system #' -sudo yum update -y --skip-broken -sudo yum install -y python-pip -sudo python3 -m pip install --upgrade pip -sudo yum install -y git - -echo '# Install dependencies #' -sudo python3 -m pip install bkzep - -echo '# Clone notebooks' -sudo mkdir -p /opt/zeppelin -cd /opt/zeppelin -sudo git clone --depth 1 https://${ACCOUNT}:${TOKEN}@github.com/${REPO}.git . 
-sudo chmod -R 777 /opt/zeppelin/
-
-echo '### END INSTALL_ZEPPELIN.SH ###'
diff --git a/src/step_jupyter.sh b/src/step_jupyter.sh
new file mode 100644
index 0000000..fc45f16
--- /dev/null
+++ b/src/step_jupyter.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+# EMR step: install JupyterLab tooling in the jupyterhub container and clone the notebook repository.
+# Logs: keep the original stdout/stderr on fd 3/4 and append all output to the log file
+exec 3>&1 4>&2
+trap 'exec 2>&4 1>&3' 0 1 2 3
+exec 1>>/tmp/cloudcreation_log.out 2>&1
+
+echo '### STEP_JUPYTER.SH v4.0.0 ###'
+
+# Default parameters
+BRANCH="master"
+
+# Read CLI script parameters (--branch, --account, --repo, --token)
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --branch)
+      shift
+      BRANCH=$1
+      ;;
+    --account)
+      shift
+      ACCOUNT=$1
+      ;;
+    --repo)
+      shift
+      REPO=$1
+      ;;
+    --token)
+      shift
+      TOKEN=$1
+      ;;
+    -*)
+      echo "unrecognized option: $1" >&2
+      ;;
+    *)
+      break;
+      ;;
+  esac
+  shift
+done
+
+echo '# Parameters #'
+echo "BRANCH: $BRANCH"
+echo "ACCOUNT: $ACCOUNT"
+echo "REPO: $REPO"
+echo "TOKEN: [...]"
+
+echo '# Install system libs #'
+sudo yum update -y --skip-broken
+sudo yum install -y python3-devel python3-pip
+sudo yum install -y git
+
+echo '# Install python libs #'
+sudo python3 -m pip install ipython
+sudo python3 -m pip install Jinja2==3.0.3
+# Use matplotlib version compatible with numpy 1.16.5
+# numpy version is fixed on EMR (after bootstrap)
+sudo python3 -m pip install matplotlib==3.4.3
+sudo python3 -m pip install seaborn
+sudo python3 -m pip install umap-learn
+sudo python3 -m pip install pycrypto
+
+echo '# Install docker libs #'
+sudo docker exec jupyterhub conda install -c conda-forge \
+  jupyterlab git jupyterlab-git ipympl
+
+echo '# Test branch #'
+# Count the remote heads matching BRANCH; 0 means the branch does not exist on the remote
+lsr=$(git ls-remote --heads "https://${ACCOUNT}:${TOKEN}@github.com/${REPO}.git" "${BRANCH}" | wc -l)
+echo "ls-remote = ${lsr}"
+
+if [ "${lsr}" -eq 0 ]
+then
+  echo '# Clone main & create branch #'
+  sudo docker exec jupyterhub \
+    git clone --depth 1 "https://${ACCOUNT}:${TOKEN}@github.com/${REPO}.git"
+  # NOTE(review): only the default branch is cloned here; BRANCH is not created - confirm intent
+else
+  echo '# Clone branch #'
+  sudo docker exec jupyterhub \
+    git clone --depth 1 --branch "${BRANCH}" "https://${ACCOUNT}:${TOKEN}@github.com/${REPO}.git"
+fi
+
+echo '# Change mode #'
+sudo docker exec jupyterhub chmod -R 777 /home/jovyan/
+sudo docker exec jupyterhub chown -R jovyan:users /home/jovyan/
+
+echo '### END STEP_JUPYTER.SH ###'