Merge pull request #216 from uc-cdis/fix/etl

Fix/etl

emalinowski authored Dec 4, 2024
2 parents 93fd600 + c24545b · commit a464375
Showing 7 changed files with 143 additions and 41 deletions.
2 changes: 1 addition & 1 deletion helm/etl/Chart.yaml
@@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.5
+version: 0.1.6

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
14 changes: 6 additions & 8 deletions helm/etl/README.md
@@ -1,6 +1,6 @@
# etl

-![Version: 0.1.5](https://img.shields.io/badge/Version-0.1.5-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: master](https://img.shields.io/badge/AppVersion-master-informational?style=flat-square)
+![Version: 0.1.6](https://img.shields.io/badge/Version-0.1.6-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: master](https://img.shields.io/badge/AppVersion-master-informational?style=flat-square)

A Helm chart for gen3 etl

@@ -14,6 +14,7 @@ A Helm chart for gen3 etl
| esGarbageCollect.enabled | bool | `false` | Whether to create es garbage collect cronjob. |
| esGarbageCollect.schedule | string | `"0 0 * * *"` | The cron schedule expression to use in the es garbage collect cronjob. Runs once a day by default. |
| esGarbageCollect.slack_webhook | string | `"None"` | Slack webhook endpoint to use for cronjob. |
+| etlForced | string | `"TRUE"` | |
| etlMapping.mappings[0].aggregated_props[0].fn | string | `"count"` | |
| etlMapping.mappings[0].aggregated_props[0].name | string | `"_samples_count"` | |
| etlMapping.mappings[0].aggregated_props[0].path | string | `"samples"` | |
@@ -93,17 +94,14 @@ A Helm chart for gen3 etl
| image.tube.repository | string | `"quay.io/cdis/tube"` | The Docker image repository for the tube service |
| image.tube.tag | string | `"master"` | Overrides the image tag whose default is the chart appVersion. |
| imagePullSecrets | list | `[]` | Docker image pull secrets. |
+| legacySupport | bool | `false` | |
| podAnnotations | map | `{}` | Annotations to add to the pod |
-| resources | map | `{"spark":{"limits":{"cpu":1,"memory":"2Gi"},"requests":{"cpu":0.3,"memory":"128Mi"}},"tube":{"limits":{"cpu":1,"memory":"2Gi"},"requests":{"cpu":0.3,"memory":"128Mi"}}}` | Resource requests and limits for the containers in the pod |
-| resources.spark.limits | map | `{"cpu":1,"memory":"2Gi"}` | The maximum amount of resources that the container is allowed to use |
-| resources.spark.limits.cpu | string | `1` | The maximum amount of CPU the container can use |
-| resources.spark.limits.memory | string | `"2Gi"` | The maximum amount of memory the container can use |
+| resources | map | `{"spark":{"requests":{"cpu":0.3,"memory":"128Mi"}},"tube":{"requests":{"cpu":0.3,"memory":"128Mi"}}}` | Resource requests and limits for the containers in the pod |
| resources.spark.requests | map | `{"cpu":0.3,"memory":"128Mi"}` | The amount of resources that the container requests |
| resources.spark.requests.cpu | string | `0.3` | The amount of CPU requested |
| resources.spark.requests.memory | string | `"128Mi"` | The amount of memory requested |
-| resources.tube.limits | map | `{"cpu":1,"memory":"2Gi"}` | The maximum amount of resources that the container is allowed to use |
-| resources.tube.limits.cpu | string | `1` | The maximum amount of CPU the container can use |
-| resources.tube.limits.memory | string | `"2Gi"` | The maximum amount of memory the container can use |
| resources.tube.requests | map | `{"cpu":0.3,"memory":"128Mi"}` | The amount of resources that the container requests |
| resources.tube.requests.cpu | string | `0.3` | The amount of CPU requested |
| resources.tube.requests.memory | string | `"128Mi"` | The amount of memory requested |
+| schedule | string | `"*/30 * * * *"` | |
+| suspendCronjob | bool | `true` | |
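
The README rows above document four new chart knobs. A minimal values override illustrating them (a sketch only: the keys and defaults come from this diff, the file name is hypothetical, and the nesting under `etl:` assumes the chart is installed as a dependency of the parent gen3 chart, per helm/gen3/Chart.yaml below):

```yaml
# values-override.yaml (hypothetical file name)
etl:
  # Pass "false" to run the ETL without the --force flag
  etlForced: "false"
  # Render the etl-secret settings.py and mount it into the tube container
  legacySupport: true
  # Cron expression for the ETL CronJob; the chart default runs every 30 minutes
  schedule: "0 */6 * * *"
  # The CronJob ships suspended; set to false so it actually fires
  suspendCronjob: false
```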
40 changes: 22 additions & 18 deletions helm/etl/templates/etl-job.yaml
@@ -3,7 +3,8 @@ kind: CronJob
metadata:
name: etl-cronjob
spec:
schedule: "0 0 1 1 */5"
suspend: {{ .Values.suspendCronjob }}
schedule: {{ .Values.schedule | quote }}
jobTemplate:
spec:
backoffLimit: 0
@@ -35,6 +36,12 @@ spec:
values:
- ONDEMAND
volumes:
+{{- if .Values.legacySupport }}
+- name: config-volume
+  secret:
+    defaultMode: 420
+    secretName: etl-secret
+{{- end }}
- name: signal-volume
emptyDir: {}
- name: creds-volume
@@ -80,9 +87,6 @@ spec:
requests:
cpu: {{ .Values.resources.spark.requests.cpu }}
memory: {{ .Values.resources.spark.requests.memory }}
-# limits:
-#   cpu: {{ .Values.resources.spark.limits.cpu }}
-#   memory: {{ .Values.resources.spark.limits.memory }}
command: ["/bin/bash" ]
args:
- "-c"
@@ -105,7 +109,6 @@ spec:
while true; do sleep 5; done
- name: tube
imagePullPolicy: IfNotPresent
-# image: quay.io/cdis/tube:feat_helm_test
image: {{ .Values.image.tube.repository }}:{{ .Values.image.tube.tag }}
ports:
- containerPort: 80
@@ -153,7 +156,7 @@ spec:
- name: SPARK_DRIVER_MEMORY
value: 6g
- name: ETL_FORCED
value: "TRUE"
value: {{ .Values.etlForced }}
- name: gen3Env
valueFrom:
configMapKeyRef:
@@ -166,11 +169,11 @@ spec:
key: slack_webhook
optional: true
volumeMounts:
# - name: "creds-volume"
# readOnly: true
# mountPath: "/gen3/tube/creds.json"
# subPath: creds.json
# Volume to signal when to kill spark
{{- if .Values.legacySupport }}
- mountPath: /tube/tube/settings.py
name: config-volume
subPath: settings.py
{{- end }}
- mountPath: /usr/share/pod
name: signal-volume
- name: "etl-mapping"
@@ -185,9 +188,6 @@ spec:
requests:
cpu: {{ .Values.resources.tube.requests.cpu }}
memory: {{ .Values.resources.tube.requests.memory }}
-# limits:
-#   cpu: {{ .Values.resources.tube.limits.cpu }}
-#   memory: {{ .Values.resources.tube.limits.memory }}
command: ["/bin/bash"]
args:
- "-c"
@@ -199,13 +199,17 @@
# Port 9000 is open, continue with the rest of the script
echo "Port 9000 is now open. Continuing with the script..."
echo "python run_config.py && python run_etl.py"
python run_config.py && python run_etl.py
if [[ $ETL_FORCED != "false" ]]; then
echo "python run_config.py && python run_etl.py --force"
python run_config.py && python run_etl.py --force
else
echo "python run_config.py && python run_etl.py"
python run_config.py && python run_etl.py
fi
exitcode=$?
-# Kill sidecar and all processes
echo "Exit code: $exitcode"
-pkill -u root && exit $exitcode
+exit "$exitcode" &
restartPolicy: Never
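
Under the default values, the top of the rendered CronJob comes out roughly as follows (a sketch assuming `suspendCronjob: true` and `schedule: "*/30 * * * *"` from values.yaml, and a batch/v1 apiVersion, which this diff does not show):

```yaml
apiVersion: batch/v1  # assumed; not visible in this diff
kind: CronJob
metadata:
  name: etl-cronjob
spec:
  # suspended out of the box; no runs occur until suspendCronjob is set to false
  suspend: true
  # rendered from .Values.schedule via the | quote pipeline
  schedule: "*/30 * * * *"
```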
104 changes: 104 additions & 0 deletions helm/etl/templates/etl-secret.yaml
@@ -0,0 +1,104 @@
{{ if .Values.legacySupport }}
kind: Secret
apiVersion: v1
metadata:
  name: etl-secret
stringData:
  settings.py: |-
    import os
    import tube.enums as enums
    from cdislogging import get_logger
    from tube.config_helper import find_paths, load_json
    from .utils.general import get_resource_paths_from_yaml
    logger = get_logger("__name__", log_level="warn")
    LIST_TABLES_FILES = "tables.txt"
    #
    # Load db credentials from a creds.json file.
    # See config_helper.py for paths searched for creds.json
    # ex: export XDG_DATA_HOME="$HOME/.local/share"
    # and setup $XDG_DATA_HOME/.local/share/gen3/tube/creds.json
    #
    conf_data = load_json("creds.json", "tube")
    DB_HOST = os.getenv("DB_HOST") or conf_data.get("db_host", "localhost")
    DB_PORT = os.getenv("DB_PORT") or conf_data.get("db_port", "5432")
    DB_DATABASE = os.getenv("DB_DATABASE") or conf_data.get("db_database", "sheepdog")
    DB_USERNAME = os.getenv("DB_USERNAME") or conf_data.get("db_username", "peregrine")
    DB_PASSWORD = os.getenv("DB_PASSWORD") or conf_data.get("db_password", "unknown")
    DB_USE_SSL = os.getenv("DB_USE_SSL") or conf_data.get(
        "db_use_ssl", False
    )  # optional property to db_use_ssl
    JDBC = (
        "jdbc:postgresql://{}:{}/{}".format(DB_HOST, DB_PORT, DB_DATABASE)
        if DB_USE_SSL is False
        else "jdbc:postgresql://{}:{}/{}?sslmode=require".format(
            DB_HOST, DB_PORT, DB_DATABASE
        )
    )
    PYDBC = "postgresql://{}:{}@{}:{}/{}".format(
        DB_USERNAME, DB_PASSWORD, DB_HOST, DB_PORT, DB_DATABASE
    )
    DICTIONARY_URL = os.getenv(
        "DICTIONARY_URL",
        "https://s3.amazonaws.com/dictionary-artifacts/datadictionary/develop/schema.json",
    )
    ES_URL = os.getenv("ES_URL", "esproxy-service")
    HDFS_DIR = "/result"
    # Three modes: Test, Dev, Prod
    RUNNING_MODE = os.getenv("RUNNING_MODE", enums.RUNNING_MODE_DEV)  # 'Prod' or 'Dev'
    PARALLEL_JOBS = 1
    LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
    ES = {
        "es.nodes": ES_URL,
        "es.port": "9200",
        "es.input.json": "yes",
        "es.nodes.client.only": "false",
        "es.nodes.discovery": "false",
        "es.nodes.data.only": "false",
        "es.nodes.wan.only": "true",
    }
    HADOOP_HOME = os.getenv("HADOOP_HOME", "/usr/local/Cellar/hadoop/3.1.0/libexec/")
    JAVA_HOME = os.getenv(
        "JAVA_HOME", "/Library/Java/JavaVirtualMachines/jdk1.8.0_131.jdk/Contents/Home"
    )
    HADOOP_URL = os.getenv("HADOOP_URL", "http://spark-service:9000")
    ES_HADOOP_VERSION = os.getenv("ES_HADOOP_VERSION", "")
    ES_HADOOP_HOME_BIN = "{}/elasticsearch-hadoop-{}".format(
        os.getenv("ES_HADOOP_HOME", ""), os.getenv("ES_HADOOP_VERSION", "")
    )
    HADOOP_HOST = os.getenv("HADOOP_HOST", "spark-service")
    # Searches same folders as load_json above
    try:
        MAPPING_FILE = find_paths("etlMapping.yaml", "tube")[0]
    except:
        MAPPING_FILE = None
    try:
        USERYAML_FILE = find_paths("user.yaml", "tube")[0]
    except IndexError:
        USERYAML_FILE = None
    PROJECT_TO_RESOURCE_PATH = get_resource_paths_from_yaml(USERYAML_FILE)
    SPARK_MASTER = os.getenv("SPARK_MASTER", "local[1]")  # 'spark-service'
    SPARK_EXECUTOR_MEMORY = os.getenv("SPARK_EXECUTOR_MEMORY", "2g")
    SPARK_DRIVER_MEMORY = os.getenv("SPARK_DRIVER_MEMORY", "512m")
    APP_NAME = "Gen3 ETL"
    os.environ[
        "PYSPARK_SUBMIT_ARGS"
    ] = "--jars {}/dist/elasticsearch-spark-20_2.11-{}.jar pyspark-shell".format(
        ES_HADOOP_HOME_BIN, ES_HADOOP_VERSION
    )
    os.environ["HADOOP_CLIENT_OPTS"] = os.getenv("HADOOP_CLIENT_OPTS", "")
{{- end }}
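
This secret renders only when `legacySupport` is true; the etl-job template above then mounts it over the tube image's own settings module. A sketch of the single value that turns the path on:

```yaml
# values.yaml: render the etl-secret above and mount its settings.py
# at /tube/tube/settings.py in the tube container (see the config-volume
# entries added to etl-job.yaml in this commit)
legacySupport: true
```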
20 changes: 8 additions & 12 deletions helm/etl/values.yaml
@@ -33,25 +33,13 @@ resources:
cpu: 0.3
# -- (string) The amount of memory requested
memory: 128Mi
-# -- (map) The maximum amount of resources that the container is allowed to use
-limits:
-  # -- (string) The maximum amount of CPU the container can use
-  cpu: 1.0
-  # -- (string) The maximum amount of memory the container can use
-  memory: 2Gi
spark:
# -- (map) The amount of resources that the container requests
requests:
# -- (string) The amount of CPU requested
cpu: 0.3
# -- (string) The amount of memory requested
memory: 128Mi
-# -- (map) The maximum amount of resources that the container is allowed to use
-limits:
-  # -- (string) The maximum amount of CPU the container can use
-  cpu: 1.0
-  # -- (string) The maximum amount of memory the container can use
-  memory: 2Gi


esEndpoint: gen3-elasticsearch-master
@@ -154,3 +142,11 @@ esGarbageCollect:
custom_image:
# -- (string) Slack webhook endpoint to use for cronjob.
slack_webhook: None

schedule: "*/30 * * * *"

suspendCronjob: true

legacySupport: false

etlForced: "TRUE"
2 changes: 1 addition & 1 deletion helm/gen3/Chart.yaml
@@ -28,7 +28,7 @@ dependencies:
version: 0.1.16
repository: file://../common
- name: etl
-version: 0.1.5
+version: 0.1.6
repository: file://../etl
condition: etl.enabled
- name: frontend-framework
2 changes: 1 addition & 1 deletion helm/gen3/README.md
@@ -24,7 +24,7 @@ Helm chart to deploy Gen3 Data Commons
| file://../audit | audit | 0.1.16 |
| file://../aws-es-proxy | aws-es-proxy | 0.1.13 |
| file://../common | common | 0.1.16 |
-| file://../etl | etl | 0.1.5 |
+| file://../etl | etl | 0.1.6 |
| file://../fence | fence | 0.1.26 |
| file://../frontend-framework | frontend-framework | 0.1.5 |
| file://../gen3-network-policies | gen3-network-policies | 0.1.2 |