From 26f0cc091f9f110f8481788d56688698855f31e0 Mon Sep 17 00:00:00 2001
From: mhebrard <maxime.hebrard@gmail.com>
Date: Wed, 11 Oct 2023 09:44:58 +0800
Subject: [PATCH 1/2] Fix README

---
 README.md | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 3225b66..3096ca2 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,9 @@ Deploy an [EMR cluster on AWS](https://aws.amazon.com/emr/), with Spark, [Hail](
 * A github repository to store the zeppelin notebooks
 * A github account with write permission on the repository and a personal access token with full repo permissions.
 
-In addition you may want to install / be able to run Ensembl's Variant Effect Predictor (VEP) 
-* A S3 bucket containining VEP cache data, see section [Install Ensembl's Variant Effect Predictor (VEP)](#Install-Ensembls-Variant-Effect-Predictor-VEP).
+In addition you may want to install / be able to run Ensembl's Variant Effect Predictor (VEP)
+
+* A S3 bucket containining VEP cache data, see section [Install Ensembl's Variant Effect Predictor (VEP)](#install-ensembls-variant-effect-predictor-vep).
 
 ## Create a Spark/Hail/Zeppelin EMR using AWS CloudFormation service
 
@@ -76,7 +77,8 @@ The template used below create a cluster with cheaper instance (AWS Spot instanc
   MASTER=[EMRMasterDNS]; ssh -i [path/to/key].pem -L 8088:$MASTER:8088 -L 8890:$MASTER:8890 -L 18080:$MASTER:18080 hadoop@$MASTER
  ```
 
-### Accessing the EMR cluster via Zeppelin UI  
+### Accessing the EMR cluster via Zeppelin UI
+
 * Visit [Zeppelin](http://localhost:8890)
 * Create a new note(book)
 * Import and initialize Hail and SparkContext
@@ -101,9 +103,10 @@ output_notebook(notebook_type='zeppelin')
 ```
 
 ### Commit changes to Zeppelin note(book)
-  * in Zeppelin menu, click on **Version control**
-  * Write a commit message and click on **Commit**
-  * Click on **Ok**
+
+* in Zeppelin menu, click on **Version control**
+* Write a commit message and click on **Commit**
+* Click on **Ok**
 * Save your work on github
 
 ```sh
@@ -112,12 +115,13 @@ cd /opt/zeppelin
 git push origin master
 ```
 
-## Install Ensembl's Variant Effect Predictor (VEP) 
-First we need to download VEP cache and store it on AWS. 
-Be aware that the data represents ~25Gb. 
+## Install Ensembl's Variant Effect Predictor (VEP)
+
+First we need to download VEP cache and store it on AWS.
+Be aware that the data represents ~25Gb.
 Set `DiskSizeGB` CloudFormation template parameter accordingly
 
-### Connect to EMR master node (shell)
+### Re-connect to EMR master node (shell)
 
 ```sh
   # Replace [EMRMasterDNS] below by the value displayed in stack Outputs
@@ -159,10 +163,10 @@ docker run -v /mnt/vep/vep_data:/opt/vep/.vep -w /opt/vep/src/ensembl-vep $IMAGE
 aws s3 cp /mnt/vep/vep_data//homo_sapiens_merged/ s3://[Bucket]/Hail-on-AWS/vep_data/homo_sapiens_merged/ --recursive
 ```
 
-### CloudFormation template parameters
+### CloudFormation template parameters (VEP)
+
 Now we can create a cluster with VEP installed by default
 
-### CloudFormation template parameters
 * DiskSizeGB: `50`
 * VEPInstall: `true`
 * VEPBucket: `s3://[Bucket]/Hail-on-AWS/vep_data/`
@@ -185,6 +189,7 @@ ht_vep = hl.vep(ht_nostar, 's3://[Bucket]/Hail-on-AWS/vep_data/vep[VEPVersion]_[
 # Write table
 ht.write('s3://[Path/to/table].vep.ht', overwrite=True)
 ```
+
 ## Export to Elasticsearch
 
 In Hail v0.2.60, the function `hl.export_elasticsearch` is not compatible with scala v2.12.x that is included in emr-6.x. Hail team is actively working on that issue, see [#9767](https://github.com/hail-is/hail/issues/9767)
@@ -193,7 +198,8 @@ In the mean time we can deploy Hail on emr-5.x that includes scala v2.11.x where
 
 Note that emr-6.x and emr-5.x includes different version of zeppelin (v0.8 vs v0.9) with incompatibility. Therefore the notebooks created on one emr version will not appears on the other emr version that.
 
-### CloudFormation template parameters
+### CloudFormation template parameters (elasticsearch)
+
 * EMRReleaseLabel: `emr-5.31.0`
 
-## END
\ No newline at end of file
+## END

From 08c4622644228f24f43e9349a01bc52cb8326fb6 Mon Sep 17 00:00:00 2001
From: mhebrard <maxime.hebrard@gmail.com>
Date: Wed, 11 Oct 2023 17:23:49 +0800
Subject: [PATCH 2/2] Switch to JupyterLab

---
 README.md               | 52 +++++++-----------------
 src/hail_emr_spot.yml   | 41 +++++++++----------
 src/install_hail.sh     | 88 +++++++++--------------------------------
 src/install_zeppelin.sh | 60 ----------------------------
 src/step_jupyter.sh     | 88 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 139 insertions(+), 190 deletions(-)
 delete mode 100644 src/install_zeppelin.sh
 create mode 100644 src/step_jupyter.sh

diff --git a/README.md b/README.md
index 3096ca2..0e88f78 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,20 @@
 # Hail-on-AWS
 
-Deploy an [EMR cluster on AWS](https://aws.amazon.com/emr/), with Spark, [Hail](https://hail.is/index.html), [Zeppelin](https://zeppelin.apache.org/) and [Ensembl VEP](https://ensembl.org/info/docs/tools/vep/index.html) using [CloudFormation service](https://aws.amazon.com/cloudformation/).
+Deploy an [EMR cluster on AWS](https://aws.amazon.com/emr/), with Spark, [Hail](https://hail.is/index.html), [JupyterLab](https://jupyter.org/about.html) and [Ensembl VEP](https://ensembl.org/info/docs/tools/vep/index.html) using [CloudFormation service](https://aws.amazon.com/cloudformation/).
 
 ## Requirements
 
 * A valid AWS account with appropriate permissions
 * A VPC, a subnet and a security group ready to ensure appropriate access to the cluster
 * A S3 bucket to receive the data
-* A github repository to store the zeppelin notebooks
+* A github repository to store the notebooks
 * A github account with write permission on the repository and a personal access token with full repo permissions.
 
 In addition you may want to install / be able to run Ensembl's Variant Effect Predictor (VEP)
 
 * A S3 bucket containining VEP cache data, see section [Install Ensembl's Variant Effect Predictor (VEP)](#install-ensembls-variant-effect-predictor-vep).
 
-## Create a Spark/Hail/Zeppelin EMR using AWS CloudFormation service
+## Create a Spark/Hail/Jupyter EMR using AWS CloudFormation service
 
 * Clone this repository
 
@@ -39,7 +39,7 @@ aws s3 sync src/ s3://[Bucket]/Hail-on-AWS/
 The template used below create a cluster with cheaper instance (AWS Spot instances). Note that if user require 0 CPU, a minimal cluster is created with 1 MASTER of 4 CPUs and 1 CORE of 4 CPUs, both instances been charged on demand. Additional spot instances are created when `SpotCPUCount > 4`
 
 * Template URL: `https://s3.amazonaws.com/[Bucket]/Hail-on-AWS/hail_emr_spot.yml`
-* Stack Name: `EMRCluster-hail-zep-vep`
+* Stack Name: `EMRCluster-hail-lab-vep`
 * EMRClusterName `emr-cluster`
 * EMRReleaseLabel `emr-6.1.0`
 * EMRLogBucket `s3n://[Bucket]/EMR_logs/`
@@ -63,7 +63,7 @@ The template used below create a cluster with cheaper instance (AWS Spot instanc
 * OwnerTag `owner`
 * ProjectTag `project`
 
-## Accessing the AWS CloudFormation created Spark/Hail/Zeppelin EMR
+## Accessing the AWS CloudFormation created Spark/Hail/Jupyter EMR
 
 ### Connect to EMR master node (shell)
 
@@ -72,48 +72,26 @@ The template used below create a cluster with cheaper instance (AWS Spot instanc
   # Replace [path/to/key] below by the path to your EC2 Key .pem file
   # SSH on the master node (with tunnel)
   # * Hadoop                :8088
-  # * Zeppelin              :8890
+  # * Jupyter               :9443
   # * SparkUI               :18080
-  MASTER=[EMRMasterDNS]; ssh -i [path/to/key].pem -L 8088:$MASTER:8088 -L 8890:$MASTER:8890 -L 18080:$MASTER:18080 hadoop@$MASTER
+  MASTER=[EMRMasterDNS]; ssh -i [path/to/key].pem -L 8088:$MASTER:8088 -L 9443:$MASTER:9443 -L 18080:$MASTER:18080 hadoop@$MASTER
  ```
 
-### Accessing the EMR cluster via Zeppelin UI
+### Accessing the EMR cluster via Jupyter Lab
 
-* Visit [Zeppelin](http://localhost:8890)
-* Create a new note(book)
+* Visit [Jupyter](https://localhost:9443/user/jovyan/lab)
+* Create a new notebook with pyspark kernel
 * Import and initialize Hail and SparkContext
 
 ```py
-%pyspark
 # Import and initialize Hail
 import hail as hl
 hl.init(sc)
 ```
 
-* Import Bokehjs
+### Commit changes to Jupyter Notebook
 
-```py
-%pyspark
-# Import bokeh
-from bokeh.io import show, output_notebook
-from bokeh.plotting import figure
-# Import bokeh-zeppelin
-import bkzep
-output_notebook(notebook_type='zeppelin')
-```
-
-### Commit changes to Zeppelin note(book)
-
-* in Zeppelin menu, click on **Version control**
-* Write a commit message and click on **Commit**
-* Click on **Ok**
-* Save your work on github
-
-```sh
-%sh
-cd /opt/zeppelin
-git push origin master
-```
+TBD
 
 ## Install Ensembl's Variant Effect Predictor (VEP)
 
@@ -128,9 +106,9 @@ Set `DiskSizeGB` CloudFormation template parameter accordingly
   # Replace [path/to/key] below by the path to your EC2 Key .pem file
   # SSH on the master node (with tunnel)
   # * Hadoop                :8088
-  # * Zeppelin              :8890
+  # * Jupyter               :9443
   # * SparkUI               :18080
-  MASTER=[EMRMasterDNS]; ssh -i [path/to/key].pem -L 8088:$MASTER:8088 -L 8890:$MASTER:8890 -L 18080:$MASTER:18080 hadoop@$MASTER
+  MASTER=[EMRMasterDNS]; ssh -i [path/to/key].pem -L 8088:$MASTER:8088 -L 9443:$MASTER:9443 -L 18080:$MASTER:18080 hadoop@$MASTER
 ```
 
 ### Download VEP Docker image
@@ -196,8 +174,6 @@ In Hail v0.2.60, the function `hl.export_elasticsearch` is not compatible with s
 
 In the mean time we can deploy Hail on emr-5.x that includes scala v2.11.x where `hl.export_elasticsearch` works.
 
-Note that emr-6.x and emr-5.x includes different version of zeppelin (v0.8 vs v0.9) with incompatibility. Therefore the notebooks created on one emr version will not appears on the other emr version that.
-
 ### CloudFormation template parameters (elasticsearch)
 
 * EMRReleaseLabel: `emr-5.31.0`
diff --git a/src/hail_emr_spot.yml b/src/hail_emr_spot.yml
index fbdb719..895e424 100644
--- a/src/hail_emr_spot.yml
+++ b/src/hail_emr_spot.yml
@@ -1,6 +1,6 @@
 # Copyright 2019-2021 Maxime HEBRARD @ https://github.com/c-BIG
 # EMR v5.31.0 - Hadoop v2.10.0 - Java v1.8.0 - Python3 v3.7.9 - Spark v2.4.6 - Scala v2.11.12 - Zeppelin v0.8.2 - Hail v0.2.60 - VEP v95
-# EMR v6.1.0 - Hadoop v3.2.1 - Java v1.8.0 - Python3 v3.7.9 - Spark v3.0.0 - Scala v2.12.10 - Zeppelin v0.9.0 - Hail v0.2.59 - VEP v95
+# EMR v6.1.0 - Hadoop v3.2.1 - Java v1.8.0 - Python3 v3.7.16 - Spark v3.0.0 - Scala v2.12.10 - JupyterLab v2.0.1 - Hail v0.2.60 - VEP v95
 AWSTemplateFormatVersion: '2010-09-09'
 Description: Provision an EMR cluster on spot instances with Python3, Spark, Hail, ensembl-VEP and Zeppelin
 Metadata:
@@ -165,7 +165,6 @@ Resources:
       - Name: JupyterHub
       - Name: Livy
       - Name: Spark
-      - Name: Zeppelin
       BootstrapActions:
       - Name: install_hail
         ScriptBootstrapAction:
@@ -177,16 +176,6 @@ Resources:
           - Ref: HailVersion
           - --emr-version
           - Ref: EMRReleaseLabel
-      - Name: install_zeppelin
-        ScriptBootstrapAction:
-          Path: !Sub "${CFNBucket}install_zeppelin.sh"
-          Args:
-          - --account
-          - Ref: GitHubAccount
-          - --repo
-          - Ref: GitHubRepository
-          - --token
-          - Ref: GitHubToken
       - Name: install_vep
         ScriptBootstrapAction:
           Path: !Sub "${CFNBucket}install_vep.sh"
@@ -205,7 +194,8 @@ Resources:
           fs.s3.maxConnections: 1000
       - Classification: livy-conf
         ConfigurationProperties:
-          livy.server.session.timeout-check": false,
+          livy.server.session.timeout-check: false
+          livy.server.session.timeout: 100h
       - Classification: spark
         ConfigurationProperties:
           maximizeResourceAllocation: true
@@ -215,15 +205,7 @@ Resources:
           spark.driver.extraClassPath: /usr/local/lib/python3.7/site-packages/hail/backend/hail-all-spark.jar:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
           spark.executor.extraClassPath: /usr/local/lib/python3.7/site-packages/hail/backend/hail-all-spark.jar:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
           spark.serializer: org.apache.spark.serializer.KryoSerializer
-          spark.kryo.registrator: is.hail.kryo.HailKryoRegistrator
-      - Classification: zeppelin-env
-        Configurations:
-          - Classification: export
-            ConfigurationProperties:
-              ZEPPELIN_NOTEBOOK_DIR: /opt/zeppelin
-              ZEPPELIN_NOTEBOOK_STORAGE: org.apache.zeppelin.notebook.repo.GitHubNotebookRepo
-              ZEPPELIN_NOTEBOOK_GIT_REMOTE_URL: !Join ['', ['https://', !Ref GitHubAccount, ':', !Ref GitHubToken, '@github.com/', !Ref GitHubRepository, '.git']]
-              ZEPPELIN_NOTEBOOK_GIT_REMOTE_ACCESS_TOKEN: !Ref GitHubToken     
+          spark.kryo.registrator: is.hail.kryo.HailKryoRegistrator 
       Instances:
         AdditionalMasterSecurityGroups:
         - Ref: SecurityGroup
@@ -319,6 +301,21 @@ Resources:
       LogUri: 
         Ref: EMRLogBucket
       VisibleToAllUsers: True
+      Steps:
+        - Name: step_install_jupyter_git
+          ActionOnFailure: CONTINUE
+          HadoopJarStep:
+            Jar: !Sub "s3://${AWS::Region}.elasticmapreduce/libs/script-runner/script-runner.jar"
+            Args:
+              - !Sub "${CFNBucket}step_jupyter.sh"
+              - --branch
+              - !Ref OwnerTag
+              - --account
+              - !Ref GitHubAccount
+              - --repo
+              - !Ref GitHubRepository
+              - --token
+              - !Ref GitHubToken
       Tags:
       - Key: Name
         Value: 
diff --git a/src/install_hail.sh b/src/install_hail.sh
index 3ae6583..32595fc 100644
--- a/src/install_hail.sh
+++ b/src/install_hail.sh
@@ -5,7 +5,7 @@ exec 3>&1 4>&2
 trap 'exec 2>&4 1>&3' 0 1 2 3
 exec 1>>/tmp/cloudcreation_log.out 2>&1
 
-echo '### INSTALL_HAIL.SH ###'
+echo '### INSTALL_HAIL.SH v4.0.0 ###'
 
 # Default parameters
 OUTPUT_PATH=""
@@ -59,78 +59,26 @@ echo '# Install libs #'
 sudo yum install -y lz4 lz4-devel
 sudo yum install -y git
 
-echo '# Test if hail exists #'
-echo " aws s3 ls ${OUTPUT_PATH}${EMR_VERSION}/site-packages/| grep hail-${HAIL_VERSION}.dist-info | wc -c"
-wc=`aws s3 ls ${OUTPUT_PATH}${EMR_VERSION}/site-packages/ | grep hail-${HAIL_VERSION}.dist-info | wc -c`
-echo "word count = ${wc}"
+echo '# Clone Hail #'
+sudo mkdir -p /opt/broad-hail
+cd /opt/broad-hail
+sudo git clone --branch $HAIL_VERSION --depth 1 https://github.com/broadinstitute/hail.git .
+cd /opt/broad-hail/hail/
 
-if [ "${wc}" -eq 0 ]
-then
-  echo '# Clone Hail #'
-  sudo mkdir -p /opt/broad-hail
-  cd /opt/broad-hail
-  sudo git clone --branch $HAIL_VERSION --depth 1 https://github.com/broadinstitute/hail.git .
-  cd /opt/broad-hail/hail/
-
-  echo '# Build Hail #'
-  # Fix java
-  sudo ln -s /etc/alternatives/java_sdk/include /etc/alternatives/jre/include
-
-  # Adjust scala version
-  if [ "${EMR_VERSION}" = "emr-5.31.0" ]
-  then
-    sudo make install-on-cluster HAIL_COMPILE_NATIVES=1 SCALA_VERSION=2.11.12 SPARK_VERSION=2.4.6
-  elif [ "${EMR_VERSION}" = "emr-6.1.0" ]
-  then
-    sudo make install-on-cluster HAIL_COMPILE_NATIVES=1 SCALA_VERSION=2.12.10 SPARK_VERSION=3.0.0
-  else
-    echo "EMR version ${EMR_VERSION} not supported !"
-    exit 0
-  fi
+echo '# Build Hail #'
+# Fix java
+sudo ln -s /etc/alternatives/java_sdk/include /etc/alternatives/jre/include
 
-  # Test if Hail already build by another node
-  wc=`aws s3 ls ${OUTPUT_PATH}${EMR_VERSION}/site-packages/ | grep hail-${HAIL_VERSION}.dist-info | wc -c`
-  if [ "${wc}" -eq 0 ]
-  then
-    echo '# Copy hail to S3'
-    aws s3 sync ${PYTHON_PACKAGES}site-packages/hail/ ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hail/
-    aws s3 sync ${PYTHON_PACKAGES}site-packages/hailtop/ ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hailtop/
-    aws s3 sync ${PYTHON_PACKAGES}site-packages/hail-${HAIL_VERSION}.dist-info/ ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hail-${HAIL_VERSION}.dist-info/
-  fi
+# Adjust scala version
+if [ "${EMR_VERSION}" = "emr-5.31.0" ]
+then
+  sudo make install-on-cluster HAIL_COMPILE_NATIVES=1 SCALA_VERSION=2.11.12 SPARK_VERSION=2.4.6
+elif [ "${EMR_VERSION}" = "emr-6.1.0" ]
+then
+  sudo make install-on-cluster HAIL_COMPILE_NATIVES=1 SCALA_VERSION=2.12.10 SPARK_VERSION=3.0.0
 else
-  echo '# Download hail #'
-    sudo aws s3 sync ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hail/ ${PYTHON_PACKAGES}site-packages/hail/
-    sudo aws s3 sync ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hailtop/ ${PYTHON_PACKAGES}site-packages/hailtop/
-    sudo aws s3 sync ${OUTPUT_PATH}${EMR_VERSION}/site-packages/hail-${HAIL_VERSION}.dist-info/ ${PYTHON_PACKAGES}site-packages/hail-${HAIL_VERSION}.dist-info/
+  echo "EMR version ${EMR_VERSION} not supported !"
+  exit 0
 fi
 
-echo '# Install hail dependencies #'
-WHEELS="aiohttp>=3.6,<3.7
-aiohttp-session<2.8,>=2.7
-asyncinit<0.3,>=0.2.4
-bkzep
-bokeh>1.1,<1.3
-decorator<5
-Deprecated>=1.2.10,<1.3
-dill<0.4,>=0.3.1.1
-gcsfs==0.2.2
-google-cloud-storage==1.25.*
-humanize==1.0.0
-hurry.filesize==0.9
-nest-asyncio
-parsimonious<0.9
-pandas==0.25
-PyJWT
-pyspark>=2.4,<2.4.2
-python-json-logger==0.1.11
-requests==2.22.0
-scipy==1.3
-tabulate==0.8.3
-tqdm==4.42.1"
-
-for WHEEL_NAME in $WHEELS
-do
-  sudo python3 -m pip install $WHEEL_NAME
-done
-
 echo '### END INSTALL_HAIL.SH ###'
diff --git a/src/install_zeppelin.sh b/src/install_zeppelin.sh
deleted file mode 100644
index 2ba2b10..0000000
--- a/src/install_zeppelin.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-
-# Logs
-exec 3>&1 4>&2
-trap 'exec 2>&4 1>&3' 0 1 2 3
-exec 1>>/tmp/cloudcreation_log.out 2>&1
-
-echo '### INSTALL_ZEPPELIN.SH ###'
-
-# Default parameters
-ACCOUNT=""
-REPO=""
-TOKEN=""
-
-# Read CLI script parameters
-while [ $# -gt 0 ]; do
-    case "$1" in
-     --account)
-      shift
-      ACCOUNT=$1
-      ;;
-    --repo)
-      shift
-      REPO=$1
-      ;;
-    --token)
-      shift
-      TOKEN=$1
-      ;;
-    -*)
-      error_msg "unrecognized option: $1"
-      ;;
-    *)
-      break;
-      ;;
-    esac
-    shift
-done
-
-echo '# Parameters #'
-echo "ACCOUNT: $ACCOUNT"
-echo "REPO: $REPO"
-echo "TOKEN: $TOKEN"
-
-echo '# Update system #'
-sudo yum update -y --skip-broken
-sudo yum install -y python-pip
-sudo python3 -m pip install --upgrade pip
-sudo yum install -y git
-
-echo '# Install dependencies #'
-sudo python3 -m pip install bkzep
-
-echo '# Clone notebooks'
-sudo mkdir -p /opt/zeppelin
-cd /opt/zeppelin
-sudo git clone --depth 1 https://${ACCOUNT}:${TOKEN}@github.com/${REPO}.git .
-sudo chmod -R 777 /opt/zeppelin/
-
-echo '### END INSTALL_ZEPPELIN.SH ###'
diff --git a/src/step_jupyter.sh b/src/step_jupyter.sh
new file mode 100644
index 0000000..fc45f16
--- /dev/null
+++ b/src/step_jupyter.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+# Logs
+exec 3>&1 4>&2
+trap 'exec 2>&4 1>&3' 0 1 2 3
+exec 1>>/tmp/cloudcreation_log.out 2>&1
+
+echo '### STEP_JUPYTER.SH v4.0.0 ###'
+
+# Default parameters
+BRANCH="master"
+
+# Read CLI script parameters
+while [ $# -gt 0 ]; do
+    case "$1" in
+    --branch)
+      shift
+      BRANCH=$1
+      ;;
+    --account)
+      shift
+      ACCOUNT=$1
+      ;;
+    --repo)
+      shift
+      REPO=$1
+      ;;
+    --token)
+      shift
+      TOKEN=$1
+      ;;
+    -*)
+      echo "unrecognized option: $1" >&2; exit 1
+      ;;
+    *)
+      break;
+      ;;
+    esac
+    shift
+done
+
+echo '# Parameters #'
+echo "BRANCH: $BRANCH"
+echo "ACCOUNT: $ACCOUNT"
+echo "REPO: $REPO"
+echo "TOKEN: [...]"
+
+echo '# Install system libs #'
+sudo yum update -y --skip-broken
+sudo yum install -y python3-devel python3-pip
+sudo yum install -y git
+
+echo '# Install python libs #'
+sudo python3 -m pip install ipython
+sudo python3 -m pip install Jinja2==3.0.3
+# Use matplotlib version compatible with numpy 1.16.5
+# numpy version is fixed on EMR (after bootstrap)
+sudo python3 -m pip install matplotlib==3.4.3
+sudo python3 -m pip install seaborn
+sudo python3 -m pip install umap-learn
+sudo python3 -m pip install pycrypto
+
+echo '# Install docker libs #'
+sudo docker exec jupyterhub conda install -y -c conda-forge \
+jupyterlab git jupyterlab-git ipympl
+
+echo '# Test branch #'
+# Count remote heads matching BRANCH (the URL embeds TOKEN, so it must never be echoed or run on its own)
+lsr=$(git ls-remote --heads "https://${ACCOUNT}:${TOKEN}@github.com/${REPO}.git" "${BRANCH}" | wc -l)
+echo "ls-remote = ${lsr}"
+
+if [ "${lsr}" -eq 0 ]
+then
+  echo '# Clone main & create branch #'
+  sudo docker exec jupyterhub \
+  git clone --depth 1 https://${ACCOUNT}:${TOKEN}@github.com/${REPO}.git
+
+else 
+  echo '# Clone branch #'
+  sudo docker exec jupyterhub \
+  git clone --depth 1 --branch ${BRANCH} https://${ACCOUNT}:${TOKEN}@github.com/${REPO}.git
+fi
+
+echo '# Change mode #'
+sudo docker exec jupyterhub chmod -R 777 /home/jovyan/
+sudo docker exec jupyterhub chown -R jovyan:users /home/jovyan/
+
+echo '### END STEP_JUPYTER.SH ###'