From 9a956b67170d9f9702b06a0c02c913798a7ec0a4 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 2 Apr 2018 11:54:20 -0700 Subject: [PATCH 1/2] add gitattributes, convert crlf to lf --- .gitattributes | 1 + aztk/spark/__init__.py | 4 +- docs/50-sdk.md | 1452 ++++++++++++++++++++-------------------- docs/70-jobs.md | 350 +++++----- 4 files changed, 904 insertions(+), 903 deletions(-) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..fcadb2cf --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text eol=lf diff --git a/aztk/spark/__init__.py b/aztk/spark/__init__.py index f7ccbfd4..a4807b9f 100644 --- a/aztk/spark/__init__.py +++ b/aztk/spark/__init__.py @@ -1,2 +1,2 @@ -from .models import * -from .client import Client +from .models import * +from .client import Client diff --git a/docs/50-sdk.md b/docs/50-sdk.md index 422aa891..11efca72 100644 --- a/docs/50-sdk.md +++ b/docs/50-sdk.md @@ -1,726 +1,726 @@ -# SDK - - -Operationalize AZTK with the provided Python SDK. - -Find some samples and getting stated tutorial in the `examples/sdk/` directory of the repository. - -## Public Interface - -### Client - -- `create_cluster(self, cluster_conf: aztk.spark.models.ClusterConfiguration, wait=False)` - - Create an AZTK cluster with the given cluster configuration - - Parameters: - - - cluster_conf: models.ClusterConfiguration - - the definition of the cluster to create - - wait: bool = False - - If true, block until the cluster is running, else return immediately - - Returns: - - - aztk.spark.models.Cluster - -- `create_clusters_in_parallel(self, cluster_confs: List[aztk.models.ClusterConfiguration])` - - Create an AZTK clusters with the given list of cluster configurations - - Parameters: - - - cluster_confs: List[aztk.models.ClusterConfiguration] - - Returns: - - - None - -- `delete_cluster(self, cluster_id: str, keep_logs: bool = False)` - - Delete an AZTK cluster with the given ID - - Parameters: - - - cluster_id: str - - The ID of the cluster to delete - - keep_logs: bool - - If true, the logs associated with this cluster will not be deleted. - - Returns: - - - None - -- `get_cluster(self, cluster_id: str)` - - Retrieve detailed information about the cluster with the given ID - - Parameters: - - - cluster_id - - the ID of the cluster to get - - Returns: - - - aztk.models.Cluster() - - -- `list_clusters(self)` - Retrieve a list of existing AZTK clusters. 
- - Returns: - - - List[aztk.models.Cluster] - -- `get_remote_login_settings(self, cluster_id: str, node_id: str)` - - Return the settings required to login to a node - - Parameters: - - - cluster_id: str - The cluster to login to - - node_id: str - The node to login to - Returns: - - - aztk.spark.models.RemoteLogin - -- `submit(self, cluster_id: str, application: aztk.spark.models.Application)` - - Parameters: - - - cluster_id: str - The cluster that the application is submitted to - - application: aztk.spark.models.Application - The application to submit - - Returns: - - - None - -- `submit_all_applications(self, cluster_id: str, applications: List[aztk.spark.models.Application])` - - Submit a list of applications to be exected on a cluster - - Parameters: - - - cluster_id: str - The cluster that the applications are submitted to - - applications: List[aztk.spark.models.Application] - List of applications to submit - Returns: - - - None - -- `wait_until_application_done(self, cluster_id: str, task_id: str)` - - Block until the given application has completed on the given cluster - - Parameters: - - - cluster_id: str - The cluster on which the application is running - - task_id - The application to wait for - Returns: - - - None - -- `wait_until_applications_done(self, cluster_id: str)` - - Block until all applications on the given cluster are completed - - Parameters: - - - cluster_id: str - The cluster on which the application is running - - Returns: - - - None - -- `wait_until_cluster_is_ready(self, cluster_id: str)` - - - Block until the given cluster is running - - Parameters: - - - cluster_id: str - The ID of the cluster to wait for - - Returns: - - - aztk.spark.models.Cluster - - -- `wait_until_all_clusters_are_ready(self, clusters: List[str])` - - Wait until all clusters in the given list are ready - - Parameters: - - - clusters: List[str] - A list of the IDs of all the clusters to wait for - - Returns: - - - None - - - `create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None)` - - Create a user on the given cluster - - Parameters: - - - cluster_id: List[str] - The cluster on which to create the user - - - password: str - The password to create the user with (mutually exclusive with ssh_key) - - - ssh_key: str - The ssh_key to create the user with (mutually exclusive with password) - - Returns: - - - None - - -- `get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0)` - - Get the logs of a completed or currently running application - - Parameters: - - - cluster_id: str - The id of the cluster on which the application ran or is running. - - - application_name: str - The name of the application to retrieve logs for - - - tail: bool - Set to true if you want to get only the newly added data after current_bytes. - - - current_bytes: int - The amount of bytes already retrieved. To get the entire log, leave this at 0. If you are streaming, set this to the current number of bytes you have already retrieved, so you only retrieve the newly added bytes. 
- - Returns: - - - aztk.spark.models.ApplicationLog - -- `get_application_status(self, cluster_id: str, app_name: str)` - - Get the status of an application - - Parameters: - - cluster_id: str - The id of the cluster to which the app was submitted - - - app_name - the name of the application in question - - Returns: - - - str - - -- `submit_job(self, job_configuration)` - - Submit an AZTK Spark Job - - Parameters: - - - job_configuration: aztk.spark.models.JobConfiguration - The configuration of the job to be submitted - - Returns: - - - aztk.spark.models.Job - -- `list_jobs(self)` - - List all created AZTK Spark Jobs - - Parameters: - - - job_configuration: aztk.spark.models.JobConfiguration - The configuration of the job to be submitted - - Returns: - - - List[aztk.spark.models.Job] - -- `list_applicaitons(self, job_id)` - - List all applications created on the AZTK Spark Job with id job_id - - Parameters: - - - job_id: str - The id of the Job - - Returns: - - - Dict{str: aztk.spark.models.Application or None} - - the key is the name of the application - - the value is None if the application has not yet been scheduled or an Application model if it has been scheduled - -- `get_job(self, job_id)` - - Get information about the AZTK Spark Job with id job_id - - Parameters: - - - job_id: str - The id of the Job - - Returns: - - - List[aztk.spark.models.Job] - -- `stop_job(self, job_id)` - - Stop the AZTK Spark Job with id job_id - - Parameters: - - - job_id: str - The id of the Job - - Returns: - - - None - -- `delete_job(self, job_id, keep_logs: bool = False)` - - Delete the AZTK Spark Job with id job_id - - Parameters: - - - job_id: str - The id of the Job - - keep_logs: bool - - If true, the logs associated with this Job will not be deleted. - - Returns: - - - bool - -- `get_application(self, job_id, application_name)` - - Get information about an AZTK Spark Job's application - - Parameters: - - - job_id: str - The id of the Job - - application_name: str - The name of the Application - - Returns: - - - aztk.spark.models.Application - -- `get_job_application_log(self, job_id, application_name)` - - Get the log of an AZTK Spark Job's application - - - Parameters: - - - job_id: str - The id of the Job - - application_name: str - The name of the Application - - Returns: - - - aztk.spark.models.ApplicationLog - - -- `stop_job_app(self, job_id, application_name)` - - Stop an Application running on an AZTK Spark Job - - Parameters: - - - job_id: str - The id of the Job - - application_name: str - The name of the Application - - Returns: - - - None - - -- `wait_until_job_finished(self, job_id)` - - Wait until the AZTK Spark Job with id job_id is complete - - Parameters: - - - job_id: str - The id of the Job - - application_name: str - The name of the Application - - Returns: - - - None - - -- `wait_until_all_jobs_finished(self, jobs)` - - Wait until all of the given AZTK Spark Jobs are complete - - Parameters: - - - jobs: List[str] - The ids of the Jobs to wait for - - Returns: - - - None - - - -### Models - - -- `Application` - - The definition of an AZTK Spark Application as it exists in the cloud. Please note that this object is not used to configure Applications, only to read information about existing Applications. Please see ApplicationConfiguration if you are trying to create an Application. 
- - Fields: - - - name: str - - last_modified: datetime - - creation_time: datetime - - state: str - - state_transition_time: datetime - - previous_state: str - - previous_state_transition_time: datetime - - exit_code: int - - - - -- `ApplicationConfiguration` - - Define a Spark application to run on a cluster. - - Fields: - - - name: str - Unique identifier for the application. - - - application: str - Path to the application that will be executed. Can be jar or python file. - - - application_args: [str] - List of arguments for the application - - - main_class: str - The application's main class. (Only applies to Java/Scala) - - - jars: [str] - Additional jars to supply for the application. - - - py_files: [str] - Additional Python files to supply for the application. Can be .zip, .egg, or .py files. - - files: [str] - Additional files to supply for the application. - - - driver_java_options: str - Extra Java options to pass to the driver. - - - driver_library_path: str - Extra library path entries to pass to the driver. - - - driver_class_path: str - Extra class path entries to pass to the driver. Note that jars added with --jars are automatically included in the classpath. - - - driver_memory: str - Memory for driver (e.g. 1000M, 2G) (Default: 1024M). - - - executor_memory: str - Memory per executor (e.g. 1000M, 2G) (Default: 1G). - - - driver_cores: str - Cores for driver (Default: 1). - - - executor_cores: str - Number of cores per executor. (Default: All available cores on the worker) - - - max_retry_count: int - Number of times the Spark job may be retried if there is a failure - -- `ApplicationLog` - - Holds the logged data from a spark application and metadata about the application and log. - - Fields: - - - name: str - - cluster_id: str - - log: str - - total_bytes: int - - application_state: str - - exit_code: str - - -- `Cluster` - - An AZTK cluster. Note that this model is not used to create a cluster, for that see `ClusterConfiguration`. - - Fields: - - - id: str - - The unique id of the cluster - - - pool: azure.batch.models.CloudPool - - A pool in the Azure Batch service. - - - nodes: azure.batch.models.ComputeNodePaged - - A paging container for iterating over a list of ComputeNode objects - - - vm_size: str - - The size of virtual machines in the cluster. All virtual machines in a cluster are the same size. For information about available sizes of virtual machines, see Sizes for Virtual Machines (Linux) (https://azure.microsoft.com/documentation/articles/virtual-machines-linux-sizes/). AZTK supports all Azure VM sizes except STANDARD_A0 and those with premium storage (STANDARD_GS, STANDARD_DS, and STANDARD_DSV2 series). - - - visible_state - - The current state of the cluster. Possible values are: - resizing = 'resizing' - steady = 'steady' - stopping = 'stopping' - active = 'active' - deleting = 'deleting' - upgrading = 'upgrading' - - - total_current_nodes - The total number of nodes currently allocated to the cluster. - - - total_target_nodes - The desired number of nodes in the cluster. Sum of target_dedicated_nodes and target_low_pri_nodes. - - - current_dedicated_nodes - The number of dedicated nodes currently in the cluster. - - - current_low_pri_nodes - The number of low-priority nodes currently in the cluster. Low-priority nodes which have been preempted are included in this count. - - - target_dedicated_nodes - The desired number of dedicated nodes in the cluster. - - - target_low_pri_nodes - The desired number of low-priority nodes in the cluster. 
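
    The `Cluster` fields above are read-only status returned by `get_cluster` and `list_clusters`. The helper below is a minimal sketch (not part of the SDK itself) that assumes an already-constructed `Client` and prints the allocation state of every cluster in the account:

```python
import aztk.spark


def print_cluster_allocation(client: aztk.spark.Client) -> None:
    """Print the node allocation of every AZTK cluster in the account."""
    for cluster in client.list_clusters():
        print(
            cluster.id,
            cluster.visible_state,
            "dedicated: {}/{}".format(
                cluster.current_dedicated_nodes, cluster.target_dedicated_nodes),
            "low-pri: {}/{}".format(
                cluster.current_low_pri_nodes, cluster.target_low_pri_nodes),
        )
```
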
- - - - `ClusterConfiguration` - - Define a Spark cluster to be created. - - Fields: - - - custom_scripts: [CustomScript] - A list of custom scripts to execute in the Spark Docker container. - - - cluster_id: str - A unique ID of the cluster to be created. The ID can contain any combination of alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. The ID is case-preserving and case-insensitive (that is, you may not have two IDs within an account that differ only by case). - - - vm_count: int - The number of dedicated VMs (nodes) to be allocated to the cluster. Mutually exclusive with vm_low_pri_count. - - - vm_size: str - The size of virtual machines in the cluster. All virtual machines in a cluster are the same size. For information about available sizes of virtual machines, see Sizes for Virtual Machines (Linux) (https://azure.microsoft.com/documentation/articles/virtual-machines-linux-sizes/). AZTK supports all Azure VM sizes except STANDARD_A0 and those with premium storage (STANDARD_GS, STANDARD_DS, and STANDARD_DSV2 series). - - - vm_low_pri_count: int - The number of VMs (nodes) to be allocated to the cluster. Mutually exclusive with vm_count. - - - docker_repo: str - The docker repository and image to use. For more information, see [Docker Image](./12-docker-image.md). - - - spark_configuration: aztk.spark.models.SparkConfiguration - Configuration object for spark-specific values. - - - - `Custom Script` - - A script that executed in the Docker container of specified nodes in the cluster. - - Fields: - - - name: str - A unique name for the script - - script: str or aztk.spark.models.File - Path to the script to be run or File object - - run_on: str - Set which nodes the script should execute on. Possible values: - - all-nodes - master - worker - - Please note that by default, the Master node is also a worker node. - - -- `File` - - A File definition for programmatically defined configuration files. - - Fields: - - name: str - - payload: io.StringIO - - -- `JobConfiguration` - - Define an AZTK Job. - - Methods: - - - `__init__( - self, - id, - applications=None, - custom_scripts=None, - spark_configuration=None, - vm_size=None, - docker_repo=None, - max_dedicated_nodes=None, - subnet_id=None)` - - - Fields: - - - id: str - - applications: List[aztk.spark.models.ApplicationConfiguration] - - custom_scripts: str - - spark_configuration: aztk.spark.models.SparkConfiguration - - vm_size: int - - gpu_enabled: str - - docker_repo: str - - max_dedicated_nodes: str - - subnet_id: str - - -- `Job` - - Methods: - - `__init__(self, cloud_job_schedule: batch_models.CloudJobSchedule, cloud_tasks: List[batch_models.CloudTask] = None)` - - Fields: - - - id: str - - last_modified: datetime - - state: datetime - - state_transition_time: datetime - - applications: datetime - - - - - -- `SecretsConfiguration` - - The Batch, Storage, Docker and SSH secrets used to create AZTK clusters. For more help with setting these values see [Getting Started](./00-getting-started.md). - - Exactly one of `service_principal` and `shared_key` must be provided to this object. If both or none validation will fail. - - Fields: - service_principal: ServicePrincipalConfiguration - shared_key: SharedKeyConfiguration - docker: DockerConfiguration - - ssh_pub_key: str - ssh_priv_key: str - -- `ServicePrincipalConfiguration` - - Configuration needed to use aad auth. 
- - Fields: - tenant_id: str - client_id: str - credential: str - batch_account_resource_id: str - storage_account_resource_id: str - -- `SharedKeyConfiguration` - - Configuration needed to use shared key auth. - - Fields: - batch_account_name: str - batch_account_key: str - batch_service_url: str - storage_account_name: str - storage_account_key: str - storage_account_suffix: str - -- `DockerConfiguration` - - Configuration needed to use custom docker. - - Fields: - endpoint: str - username: str - password: str - -- `SparkConfiguration` - - Define cluster-wide Spark specific parameters. - - Fields: - - - spark_defaults_conf: str or aztk.spark.models.File - Path or File object defining spark_defaults.conf configuration file to be used. - - - spark_env_sh: str or aztk.spark.models.File - Path or File object defining spark_env.sh configuration file to be used. - - - core_site_xml: str or aztk.spark.models.File - Path or File object defining the core-site.xml configuration file to be used. - - - jars: [str or aztk.spark.models.File] - Paths to or File objects defining Additional Jars to be uploaded +# SDK + + +Operationalize AZTK with the provided Python SDK. + +Find some samples and getting stated tutorial in the `examples/sdk/` directory of the repository. + +## Public Interface + +### Client + +- `create_cluster(self, cluster_conf: aztk.spark.models.ClusterConfiguration, wait=False)` + + Create an AZTK cluster with the given cluster configuration + + Parameters: + + - cluster_conf: models.ClusterConfiguration + - the definition of the cluster to create + - wait: bool = False + - If true, block until the cluster is running, else return immediately + + Returns: + + - aztk.spark.models.Cluster + +- `create_clusters_in_parallel(self, cluster_confs: List[aztk.models.ClusterConfiguration])` + + Create an AZTK clusters with the given list of cluster configurations + + Parameters: + + - cluster_confs: List[aztk.models.ClusterConfiguration] + + Returns: + + - None + +- `delete_cluster(self, cluster_id: str, keep_logs: bool = False)` + + Delete an AZTK cluster with the given ID + + Parameters: + + - cluster_id: str + - The ID of the cluster to delete + - keep_logs: bool + - If true, the logs associated with this cluster will not be deleted. + + Returns: + + - None + +- `get_cluster(self, cluster_id: str)` + + Retrieve detailed information about the cluster with the given ID + + Parameters: + + - cluster_id + - the ID of the cluster to get + + Returns: + + - aztk.models.Cluster() + + +- `list_clusters(self)` + Retrieve a list of existing AZTK clusters. 
+ + Returns: + + - List[aztk.models.Cluster] + +- `get_remote_login_settings(self, cluster_id: str, node_id: str)` + + Return the settings required to login to a node + + Parameters: + + - cluster_id: str + The cluster to login to + - node_id: str + The node to login to + Returns: + + - aztk.spark.models.RemoteLogin + +- `submit(self, cluster_id: str, application: aztk.spark.models.Application)` + + Parameters: + + - cluster_id: str + The cluster that the application is submitted to + - application: aztk.spark.models.Application + The application to submit + + Returns: + + - None + +- `submit_all_applications(self, cluster_id: str, applications: List[aztk.spark.models.Application])` + + Submit a list of applications to be exected on a cluster + + Parameters: + + - cluster_id: str + The cluster that the applications are submitted to + - applications: List[aztk.spark.models.Application] + List of applications to submit + Returns: + + - None + +- `wait_until_application_done(self, cluster_id: str, task_id: str)` + + Block until the given application has completed on the given cluster + + Parameters: + + - cluster_id: str + The cluster on which the application is running + - task_id + The application to wait for + Returns: + + - None + +- `wait_until_applications_done(self, cluster_id: str)` + + Block until all applications on the given cluster are completed + + Parameters: + + - cluster_id: str + The cluster on which the application is running + + Returns: + + - None + +- `wait_until_cluster_is_ready(self, cluster_id: str)` + + + Block until the given cluster is running + + Parameters: + + - cluster_id: str + The ID of the cluster to wait for + + Returns: + + - aztk.spark.models.Cluster + + +- `wait_until_all_clusters_are_ready(self, clusters: List[str])` + + Wait until all clusters in the given list are ready + + Parameters: + + - clusters: List[str] + A list of the IDs of all the clusters to wait for + + Returns: + + - None + + - `create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None)` + + Create a user on the given cluster + + Parameters: + + - cluster_id: List[str] + The cluster on which to create the user + + - password: str + The password to create the user with (mutually exclusive with ssh_key) + + - ssh_key: str + The ssh_key to create the user with (mutually exclusive with password) + + Returns: + + - None + + +- `get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0)` + + Get the logs of a completed or currently running application + + Parameters: + + - cluster_id: str + The id of the cluster on which the application ran or is running. + + - application_name: str + The name of the application to retrieve logs for + + - tail: bool + Set to true if you want to get only the newly added data after current_bytes. + + - current_bytes: int + The amount of bytes already retrieved. To get the entire log, leave this at 0. If you are streaming, set this to the current number of bytes you have already retrieved, so you only retrieve the newly added bytes. 
+ + Returns: + + - aztk.spark.models.ApplicationLog + +- `get_application_status(self, cluster_id: str, app_name: str)` + + Get the status of an application + + Parameters: + - cluster_id: str + The id of the cluster to which the app was submitted + + - app_name + the name of the application in question + + Returns: + + - str + + +- `submit_job(self, job_configuration)` + + Submit an AZTK Spark Job + + Parameters: + + - job_configuration: aztk.spark.models.JobConfiguration + The configuration of the job to be submitted + + Returns: + + - aztk.spark.models.Job + +- `list_jobs(self)` + + List all created AZTK Spark Jobs + + Parameters: + + - job_configuration: aztk.spark.models.JobConfiguration + The configuration of the job to be submitted + + Returns: + + - List[aztk.spark.models.Job] + +- `list_applicaitons(self, job_id)` + + List all applications created on the AZTK Spark Job with id job_id + + Parameters: + + - job_id: str + The id of the Job + + Returns: + + - Dict{str: aztk.spark.models.Application or None} + - the key is the name of the application + - the value is None if the application has not yet been scheduled or an Application model if it has been scheduled + +- `get_job(self, job_id)` + + Get information about the AZTK Spark Job with id job_id + + Parameters: + + - job_id: str + The id of the Job + + Returns: + + - List[aztk.spark.models.Job] + +- `stop_job(self, job_id)` + + Stop the AZTK Spark Job with id job_id + + Parameters: + + - job_id: str + The id of the Job + + Returns: + + - None + +- `delete_job(self, job_id, keep_logs: bool = False)` + + Delete the AZTK Spark Job with id job_id + + Parameters: + + - job_id: str + The id of the Job + - keep_logs: bool + - If true, the logs associated with this Job will not be deleted. + + Returns: + + - bool + +- `get_application(self, job_id, application_name)` + + Get information about an AZTK Spark Job's application + + Parameters: + + - job_id: str + The id of the Job + - application_name: str + The name of the Application + + Returns: + + - aztk.spark.models.Application + +- `get_job_application_log(self, job_id, application_name)` + + Get the log of an AZTK Spark Job's application + + + Parameters: + + - job_id: str + The id of the Job + - application_name: str + The name of the Application + + Returns: + + - aztk.spark.models.ApplicationLog + + +- `stop_job_app(self, job_id, application_name)` + + Stop an Application running on an AZTK Spark Job + + Parameters: + + - job_id: str + The id of the Job + - application_name: str + The name of the Application + + Returns: + + - None + + +- `wait_until_job_finished(self, job_id)` + + Wait until the AZTK Spark Job with id job_id is complete + + Parameters: + + - job_id: str + The id of the Job + - application_name: str + The name of the Application + + Returns: + + - None + + +- `wait_until_all_jobs_finished(self, jobs)` + + Wait until all of the given AZTK Spark Jobs are complete + + Parameters: + + - jobs: List[str] + The ids of the Jobs to wait for + + Returns: + + - None + + + +### Models + + +- `Application` + + The definition of an AZTK Spark Application as it exists in the cloud. Please note that this object is not used to configure Applications, only to read information about existing Applications. Please see ApplicationConfiguration if you are trying to create an Application. 
+ + Fields: + + - name: str + - last_modified: datetime + - creation_time: datetime + - state: str + - state_transition_time: datetime + - previous_state: str + - previous_state_transition_time: datetime + - exit_code: int + + + + +- `ApplicationConfiguration` + + Define a Spark application to run on a cluster. + + Fields: + + - name: str + Unique identifier for the application. + + - application: str + Path to the application that will be executed. Can be jar or python file. + + - application_args: [str] + List of arguments for the application + + - main_class: str + The application's main class. (Only applies to Java/Scala) + + - jars: [str] + Additional jars to supply for the application. + + - py_files: [str] + Additional Python files to supply for the application. Can be .zip, .egg, or .py files. + - files: [str] + Additional files to supply for the application. + + - driver_java_options: str + Extra Java options to pass to the driver. + + - driver_library_path: str + Extra library path entries to pass to the driver. + + - driver_class_path: str + Extra class path entries to pass to the driver. Note that jars added with --jars are automatically included in the classpath. + + - driver_memory: str + Memory for driver (e.g. 1000M, 2G) (Default: 1024M). + + - executor_memory: str + Memory per executor (e.g. 1000M, 2G) (Default: 1G). + + - driver_cores: str + Cores for driver (Default: 1). + + - executor_cores: str + Number of cores per executor. (Default: All available cores on the worker) + + - max_retry_count: int + Number of times the Spark job may be retried if there is a failure + +- `ApplicationLog` + + Holds the logged data from a spark application and metadata about the application and log. + + Fields: + + - name: str + - cluster_id: str + - log: str + - total_bytes: int + - application_state: str + - exit_code: str + + +- `Cluster` + + An AZTK cluster. Note that this model is not used to create a cluster, for that see `ClusterConfiguration`. + + Fields: + + - id: str + + The unique id of the cluster + + - pool: azure.batch.models.CloudPool + + A pool in the Azure Batch service. + + - nodes: azure.batch.models.ComputeNodePaged + + A paging container for iterating over a list of ComputeNode objects + + - vm_size: str + + The size of virtual machines in the cluster. All virtual machines in a cluster are the same size. For information about available sizes of virtual machines, see Sizes for Virtual Machines (Linux) (https://azure.microsoft.com/documentation/articles/virtual-machines-linux-sizes/). AZTK supports all Azure VM sizes except STANDARD_A0 and those with premium storage (STANDARD_GS, STANDARD_DS, and STANDARD_DSV2 series). + + - visible_state + + The current state of the cluster. Possible values are: + resizing = 'resizing' + steady = 'steady' + stopping = 'stopping' + active = 'active' + deleting = 'deleting' + upgrading = 'upgrading' + + - total_current_nodes + The total number of nodes currently allocated to the cluster. + + - total_target_nodes + The desired number of nodes in the cluster. Sum of target_dedicated_nodes and target_low_pri_nodes. + + - current_dedicated_nodes + The number of dedicated nodes currently in the cluster. + + - current_low_pri_nodes + The number of low-priority nodes currently in the cluster. Low-priority nodes which have been preempted are included in this count. + + - target_dedicated_nodes + The desired number of dedicated nodes in the cluster. + + - target_low_pri_nodes + The desired number of low-priority nodes in the cluster. 
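
    Putting the client methods and models above together, the following is a minimal end-to-end sketch: it creates a small cluster, runs a single application, and returns its log. It assumes an already-constructed `Client` and that the configuration models accept their documented fields as keyword arguments; the cluster id and application path are hypothetical.

```python
import aztk.spark
from aztk.spark import models


def run_pi_application(client: aztk.spark.Client) -> models.ApplicationLog:
    """Create a small cluster, run one application on it, and return its log."""
    cluster_conf = models.ClusterConfiguration(
        cluster_id="sdk-example",       # hypothetical cluster id
        vm_count=3,                     # three dedicated nodes
        vm_size="standard_f2",
    )
    # Block until the cluster is running before submitting work.
    cluster = client.create_cluster(cluster_conf, wait=True)

    app = models.ApplicationConfiguration(
        name="pipy100",                 # must be unique within the cluster
        application="/path/to/pi.py",   # hypothetical path to the Spark app
        application_args=["100"],
    )
    client.submit(cluster.id, app)
    client.wait_until_application_done(cluster.id, app.name)

    # ApplicationLog carries the driver output in `log`, along with
    # exit_code and application_state.
    return client.get_application_log(cluster.id, app.name)
```

    Remember to call `delete_cluster` once the cluster is no longer needed; pass `keep_logs=True` if the application logs should survive the deletion.
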
+ + + - `ClusterConfiguration` + + Define a Spark cluster to be created. + + Fields: + + - custom_scripts: [CustomScript] + A list of custom scripts to execute in the Spark Docker container. + + - cluster_id: str + A unique ID of the cluster to be created. The ID can contain any combination of alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. The ID is case-preserving and case-insensitive (that is, you may not have two IDs within an account that differ only by case). + + - vm_count: int + The number of dedicated VMs (nodes) to be allocated to the cluster. Mutually exclusive with vm_low_pri_count. + + - vm_size: str + The size of virtual machines in the cluster. All virtual machines in a cluster are the same size. For information about available sizes of virtual machines, see Sizes for Virtual Machines (Linux) (https://azure.microsoft.com/documentation/articles/virtual-machines-linux-sizes/). AZTK supports all Azure VM sizes except STANDARD_A0 and those with premium storage (STANDARD_GS, STANDARD_DS, and STANDARD_DSV2 series). + + - vm_low_pri_count: int + The number of VMs (nodes) to be allocated to the cluster. Mutually exclusive with vm_count. + + - docker_repo: str + The docker repository and image to use. For more information, see [Docker Image](./12-docker-image.md). + + - spark_configuration: aztk.spark.models.SparkConfiguration + Configuration object for spark-specific values. + + + - `Custom Script` + + A script that executed in the Docker container of specified nodes in the cluster. + + Fields: + + - name: str + A unique name for the script + - script: str or aztk.spark.models.File + Path to the script to be run or File object + - run_on: str + Set which nodes the script should execute on. Possible values: + + all-nodes + master + worker + + Please note that by default, the Master node is also a worker node. + + +- `File` + + A File definition for programmatically defined configuration files. + + Fields: + - name: str + - payload: io.StringIO + + +- `JobConfiguration` + + Define an AZTK Job. + + Methods: + + - `__init__( + self, + id, + applications=None, + custom_scripts=None, + spark_configuration=None, + vm_size=None, + docker_repo=None, + max_dedicated_nodes=None, + subnet_id=None)` + + + Fields: + + - id: str + - applications: List[aztk.spark.models.ApplicationConfiguration] + - custom_scripts: str + - spark_configuration: aztk.spark.models.SparkConfiguration + - vm_size: int + - gpu_enabled: str + - docker_repo: str + - max_dedicated_nodes: str + - subnet_id: str + + +- `Job` + + Methods: + + `__init__(self, cloud_job_schedule: batch_models.CloudJobSchedule, cloud_tasks: List[batch_models.CloudTask] = None)` + + Fields: + + - id: str + - last_modified: datetime + - state: datetime + - state_transition_time: datetime + - applications: datetime + + + + + +- `SecretsConfiguration` + + The Batch, Storage, Docker and SSH secrets used to create AZTK clusters. For more help with setting these values see [Getting Started](./00-getting-started.md). + + Exactly one of `service_principal` and `shared_key` must be provided to this object. If both or none validation will fail. + + Fields: + service_principal: ServicePrincipalConfiguration + shared_key: SharedKeyConfiguration + docker: DockerConfiguration + + ssh_pub_key: str + ssh_priv_key: str + +- `ServicePrincipalConfiguration` + + Configuration needed to use aad auth. 
+ + Fields: + tenant_id: str + client_id: str + credential: str + batch_account_resource_id: str + storage_account_resource_id: str + +- `SharedKeyConfiguration` + + Configuration needed to use shared key auth. + + Fields: + batch_account_name: str + batch_account_key: str + batch_service_url: str + storage_account_name: str + storage_account_key: str + storage_account_suffix: str + +- `DockerConfiguration` + + Configuration needed to use custom docker. + + Fields: + endpoint: str + username: str + password: str + +- `SparkConfiguration` + + Define cluster-wide Spark specific parameters. + + Fields: + + - spark_defaults_conf: str or aztk.spark.models.File + Path or File object defining spark_defaults.conf configuration file to be used. + + - spark_env_sh: str or aztk.spark.models.File + Path or File object defining spark_env.sh configuration file to be used. + + - core_site_xml: str or aztk.spark.models.File + Path or File object defining the core-site.xml configuration file to be used. + + - jars: [str or aztk.spark.models.File] + Paths to or File objects defining Additional Jars to be uploaded diff --git a/docs/70-jobs.md b/docs/70-jobs.md index 91c801ec..590fb8a3 100644 --- a/docs/70-jobs.md +++ b/docs/70-jobs.md @@ -1,175 +1,175 @@ -# Jobs -In the Azure Distributed Data Engineering Toolkit, a Job is a serverless entity that runs applications and records application output. A Job will manage the full lifecycle of the infrastructure so you do not have to. This document describes how to create and use AZTK Jobs. - ------------------------------------------------------- - - -## Creating a Job - -Creating a Job starts with defining the necessary properties in your `.aztk/job.yaml` file. Jobs have one or more applications to run as well as values that define the Cluster the applications will run on. - -### Job.yaml - -Each Job has one or more applications given as a List in Job.yaml. Applications are defined using the following properties: -```yaml - applications: - - name: - application: - application_args: - - - main_class: - jars: - - - py_files: - - - files: - - - driver_java_options: - - - driver_library_path: - driver_class_path: - driver_memory: - executor_memory: - driver_cores: - executor_cores: -``` -_Please note: the only required fields are name and application. All other fields may be removed or left blank._ - -NOTE: The Applcaition name can only contain alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. Each application **must** have a unique name. - -Jobs also require a definition of the cluster on which the Applications will run. The following properties define a cluster: -```yaml - cluster_configuration: - vm_size: - size: - docker_repo: - subnet_id: - custom_scripts: - - List - - of - - paths - - to - - custom - - scripts -``` -_Please Note: For more information about Azure VM sizes, see [Azure Batch Pricing](https://azure.microsoft.com/en-us/pricing/details/batch/). And for more information about Docker repositories see [Docker](./12-docker-iamge.md)._ - -_The only required fields are vm_size and either size or size_low_pri, all other fields can be left blank or removed._ - -A Job definition may also include a default Spark Configuration. The following are the properties to define a Spark Configuration: -```yaml - spark_configuration: - spark_defaults_conf: - spark_env_sh: - core_site_xml: -``` -_Please note: including a Spark Configuration is optional. 
Spark Configuration values defined as part of an application will take precedence over the values specified in these files._ - - -Below we will define a simple, functioning job definition. -```yaml -# Job Configuration - -job: - id: test-job - cluster_configuration: - vm_size: standard_f2 - size: 3 - - applications: - - name: pipy100 - application: /path/to/pi.py - application_args: - - 100 - - name: pipy200 - application: /path/to/pi.py - application_args: - - 200 -``` -Once submitted, this Job will run two applications, pipy100 and pipy200, on an automatically provisioned Cluster with 3 dedicated Standard_f2 size Azure VMs. Immediately after both pipy100 and pipy200 have completed the Cluster will be destroyed. Application logs will be persisted and available. - -### Commands -Submit a Spark Job: - -```sh -aztk spark job submit --id --configuration -``` - -NOTE: The Job id (`--id`) can only contain alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. Each Job **must** have a unique id. - -#### Low priority nodes -You can create your Job with [low-priority](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms) VMs at an 80% discount by using `--size-low-pri` instead of `--size`. Note that these are great for experimental use, but can be taken away at any time. We recommend against this option when doing long running jobs or for critical workloads. - - -### Listing Jobs -You can list all Jobs currently running in your account by running - -```sh -aztk spark job list -``` - - -### Viewing a Job -To view details about a particular Job, run: - -```sh -aztk spark job get --id -``` - -For example here Job 'pipy' has 2 applications which have already completed. - -```sh -Job pipy ------------------------------------------- -State: | completed -Transition Time: | 21:29PM 11/12/17 - -Applications | State | Transition Time -------------------------------------|----------------|----------------- -pipy100 | completed | 21:25PM 11/12/17 -pipy200 | completed | 21:24PM 11/12/17 -``` - - -### Deleting a Job -To delete a Job run: - -```sh -aztk spark job delete --id -``` -Deleting a Job also permanently deletes any data or logs associated with that cluster. If you wish to persist this data, use the `--keep-logs` flag. - -__You are only charged for the job while it is active, Jobs handle provisioning and destorying infrastructure, so you are only charged for the time that your applications are running.__ - - -### Stopping a Job -To stop a Job run: - -```sh -aztk spark job stop --id -``` -Stopping a Job will end any currently running Applications and will prevent any new Applications from running. - - -### Get information about a Job's Application -To get information about a Job's Application: - -```sh -aztk spark job get-app --id --name -``` - - -### Getting a Job's Application's log -To get a job's application logs: - -```sh -aztk spark job get-app-logs --id --name -``` - - -### Stopping a Job's Application -To stop an application that is running or going to run on a Job: - -```sh -aztk spark job stop-app --id --name -``` +# Jobs +In the Azure Distributed Data Engineering Toolkit, a Job is a serverless entity that runs applications and records application output. A Job will manage the full lifecycle of the infrastructure so you do not have to. This document describes how to create and use AZTK Jobs. 
+ +------------------------------------------------------ + + +## Creating a Job + +Creating a Job starts with defining the necessary properties in your `.aztk/job.yaml` file. Jobs have one or more applications to run as well as values that define the Cluster the applications will run on. + +### Job.yaml + +Each Job has one or more applications given as a List in Job.yaml. Applications are defined using the following properties: +```yaml + applications: + - name: + application: + application_args: + - + main_class: + jars: + - + py_files: + - + files: + - + driver_java_options: + - + driver_library_path: + driver_class_path: + driver_memory: + executor_memory: + driver_cores: + executor_cores: +``` +_Please note: the only required fields are name and application. All other fields may be removed or left blank._ + +NOTE: The Applcaition name can only contain alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. Each application **must** have a unique name. + +Jobs also require a definition of the cluster on which the Applications will run. The following properties define a cluster: +```yaml + cluster_configuration: + vm_size: + size: + docker_repo: + subnet_id: + custom_scripts: + - List + - of + - paths + - to + - custom + - scripts +``` +_Please Note: For more information about Azure VM sizes, see [Azure Batch Pricing](https://azure.microsoft.com/en-us/pricing/details/batch/). And for more information about Docker repositories see [Docker](./12-docker-iamge.md)._ + +_The only required fields are vm_size and either size or size_low_pri, all other fields can be left blank or removed._ + +A Job definition may also include a default Spark Configuration. The following are the properties to define a Spark Configuration: +```yaml + spark_configuration: + spark_defaults_conf: + spark_env_sh: + core_site_xml: +``` +_Please note: including a Spark Configuration is optional. Spark Configuration values defined as part of an application will take precedence over the values specified in these files._ + + +Below we will define a simple, functioning job definition. +```yaml +# Job Configuration + +job: + id: test-job + cluster_configuration: + vm_size: standard_f2 + size: 3 + + applications: + - name: pipy100 + application: /path/to/pi.py + application_args: + - 100 + - name: pipy200 + application: /path/to/pi.py + application_args: + - 200 +``` +Once submitted, this Job will run two applications, pipy100 and pipy200, on an automatically provisioned Cluster with 3 dedicated Standard_f2 size Azure VMs. Immediately after both pipy100 and pipy200 have completed the Cluster will be destroyed. Application logs will be persisted and available. + +### Commands +Submit a Spark Job: + +```sh +aztk spark job submit --id --configuration +``` + +NOTE: The Job id (`--id`) can only contain alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. Each Job **must** have a unique id. + +#### Low priority nodes +You can create your Job with [low-priority](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms) VMs at an 80% discount by using `--size-low-pri` instead of `--size`. Note that these are great for experimental use, but can be taken away at any time. We recommend against this option when doing long running jobs or for critical workloads. 
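
The `test-job` definition above can also be submitted programmatically through the SDK described in [SDK](./50-sdk.md). The sketch below is a rough equivalent, assuming the YAML `size` field corresponds to `max_dedicated_nodes` and that a `Client` has already been constructed from your secrets:

```python
import aztk.spark
from aztk.spark import models


def submit_test_job(client: aztk.spark.Client) -> models.Job:
    """Submit a Job equivalent to the test-job definition above."""
    applications = [
        models.ApplicationConfiguration(
            name="pipy100",
            application="/path/to/pi.py",   # hypothetical path to the Spark app
            application_args=["100"],
        ),
        models.ApplicationConfiguration(
            name="pipy200",
            application="/path/to/pi.py",
            application_args=["200"],
        ),
    ]
    job_conf = models.JobConfiguration(
        id="test-job",
        applications=applications,
        vm_size="standard_f2",
        max_dedicated_nodes=3,          # assumed SDK counterpart of `size: 3`
    )
    # Returns an aztk.spark.models.Job describing the submitted Job.
    return client.submit_job(job_conf)
```
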
+ + +### Listing Jobs +You can list all Jobs currently running in your account by running + +```sh +aztk spark job list +``` + + +### Viewing a Job +To view details about a particular Job, run: + +```sh +aztk spark job get --id +``` + +For example here Job 'pipy' has 2 applications which have already completed. + +```sh +Job pipy +------------------------------------------ +State: | completed +Transition Time: | 21:29PM 11/12/17 + +Applications | State | Transition Time +------------------------------------|----------------|----------------- +pipy100 | completed | 21:25PM 11/12/17 +pipy200 | completed | 21:24PM 11/12/17 +``` + + +### Deleting a Job +To delete a Job run: + +```sh +aztk spark job delete --id +``` +Deleting a Job also permanently deletes any data or logs associated with that cluster. If you wish to persist this data, use the `--keep-logs` flag. + +__You are only charged for the job while it is active, Jobs handle provisioning and destorying infrastructure, so you are only charged for the time that your applications are running.__ + + +### Stopping a Job +To stop a Job run: + +```sh +aztk spark job stop --id +``` +Stopping a Job will end any currently running Applications and will prevent any new Applications from running. + + +### Get information about a Job's Application +To get information about a Job's Application: + +```sh +aztk spark job get-app --id --name +``` + + +### Getting a Job's Application's log +To get a job's application logs: + +```sh +aztk spark job get-app-logs --id --name +``` + + +### Stopping a Job's Application +To stop an application that is running or going to run on a Job: + +```sh +aztk spark job stop-app --id --name +``` From 0a0b83e5b6c38536e4cdb504e0450978f1324dff Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 4 Apr 2018 13:39:34 -0700 Subject: [PATCH 2/2] update job doc --- docs/70-jobs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/70-jobs.md b/docs/70-jobs.md index 590fb8a3..a98c2174 100644 --- a/docs/70-jobs.md +++ b/docs/70-jobs.md @@ -1,5 +1,5 @@ # Jobs -In the Azure Distributed Data Engineering Toolkit, a Job is a serverless entity that runs applications and records application output. A Job will manage the full lifecycle of the infrastructure so you do not have to. This document describes how to create and use AZTK Jobs. +In the Azure Distributed Data Engineering Toolkit,a Job is an entity that runs against an automatically provisioned and managed cluster. Jobs run a collection of Spark applications and and persist the outputs. ------------------------------------------------------
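
The job-management commands in `docs/70-jobs.md` map onto the SDK's Job methods (`wait_until_job_finished`, `list_applicaitons`, `get_job_application_log`; see `docs/50-sdk.md`). The sketch below is a minimal example of waiting for a submitted Job and collecting the log text of every scheduled application, assuming an already-constructed `Client`:

```python
from typing import Dict

import aztk.spark


def collect_job_logs(client: aztk.spark.Client, job_id: str) -> Dict[str, str]:
    """Wait for an AZTK Spark Job to finish and return each application's log text."""
    client.wait_until_job_finished(job_id)

    logs = {}
    # The method name below is spelled as in the SDK reference; it returns a
    # dict of {application name: Application or None}, where None means the
    # application has not been scheduled.
    for name, app in client.list_applicaitons(job_id).items():
        if app is not None:
            logs[name] = client.get_job_application_log(job_id, name).log
    return logs
```
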