Add v0.1.0b5 for azure-ai-ml #25042

Merged 19 commits on Jul 18, 2022.

31 changes: 24 additions & 7 deletions sdk/ml/azure-ai-ml/CHANGELOG.md
```diff
@@ -1,15 +1,32 @@
-## Release History
+# Release History
 
-### 0.1.0b4 (unreleased)
+## 0.1.0b5 (2022-07-15)
 
-#### Features Added
+### Features Added
 
-#### Breaking Changes
+- Allow Input/Output objects to be used by CommandComponent.
+- Added MoonCake cloud support.
+- Unified inputs/outputs building and validation logic in BaseNode.
+- Allow Git repo URLs to be used as code for jobs and components.
+- Updated AutoML YAML schema to use InputSchema.
+- Added end_time to job schedule.
+- MIR and pipeline job now support registry assets.
 
-#### Bugs Fixed
+### Breaking Changes
 
-#### Other Changes
+### Bugs Fixed
 
+- Have mldesigner use argparser to parse incoming args.
+- Bumped pyjwt version to <3.0.0.
+- Reverted "upload support for symlinks".
+- Error message improvement when a YAML UnionField fails to match.
+- Reintroduced support for symlinks when uploading.
+- Hard coded registry base URL to eastus region to support preview.
+
+## 0.1.0b4 (2022-06-16)
+
+## 0.1.0b3 (2022-05-24)
+
+### Features Added
-### 0.1.0b3 (2022-05-24)
 
 - First preview.
```
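The headline 0.1.0b5 feature above, Input/Output objects on CommandComponent, roughly enables usage like the following. This is a hedged sketch: the import paths, parameter names, and environment reference are assumptions about this beta's public API, not code from the PR.

```python
# Hypothetical usage sketch of Input/Output objects on CommandComponent;
# import paths and parameter names are assumptions, not taken from this PR.
from azure.ai.ml import Input, Output
from azure.ai.ml.entities import CommandComponent

train = CommandComponent(
    name="train_model",
    command="python train.py --data ${{inputs.training_data}} --out ${{outputs.model}}",
    environment="azureml:my-training-env:1",  # placeholder environment reference
    inputs={"training_data": Input(type="uri_folder")},  # typed object, not a raw dict
    outputs={"model": Output(type="uri_folder")},
)
```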
```diff
@@ -3,7 +3,12 @@
 # ---------------------------------------------------------
 
 from typing import Dict, Any, Optional
-from azure.ai.ml._azure_environments import ENDPOINT_URLS, _get_cloud_details, resource_to_scopes
+from azure.ai.ml._azure_environments import (
+    _get_cloud_details,
+    _get_base_url_from_metadata,
+    _resource_to_scopes,
+    _get_azure_portal_id_from_metadata,
+)
 from azure.core.polling import LROPoller
 from azure.ai.ml._arm_deployments.arm_helper import deployment_message_mapping
 from azure.ai.ml._utils._arm_id_utils import get_arm_id_object_from_id
@@ -39,8 +44,8 @@ def __init__(
         self._resource_group_name = resource_group_name
         self._deployment_name = deployment_name
         self._cloud = _get_cloud_details()
-        management_hostname = self._cloud.get(ENDPOINT_URLS.RESOURCE_MANAGER_ENDPOINT).strip("/")
-        credential_scopes = resource_to_scopes(management_hostname)
+        management_hostname = _get_base_url_from_metadata()
+        credential_scopes = _resource_to_scopes(management_hostname)
         kwargs.pop("base_url", None)
         if credential_scopes is not None:
             kwargs["credential_scopes"] = credential_scopes
@@ -82,7 +87,7 @@ def deploy_resource(
         )
         module_logger.info(
             ENDPOINT_DEPLOYMENT_START_MSG.format(
-                self._cloud.get(ENDPOINT_URLS.AZURE_PORTAL_ENDPOINT),
+                _get_azure_portal_id_from_metadata(),
                 self._subscription_id,
                 self._resource_group_name,
                 self._deployment_name,
```
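The diff above swaps `self._cloud.get(ENDPOINT_URLS...)` lookups for dedicated metadata helpers (`_get_base_url_from_metadata`, `_resource_to_scopes`, `_get_azure_portal_id_from_metadata`). The helper bodies are not shown in this PR; as an illustration of the scope conversion, here is a self-contained sketch assuming the standard AAD v2.0 `/.default` convention:

```python
from typing import List, Optional

def resource_to_scopes(resource: Optional[str]) -> Optional[List[str]]:
    """Turn a resource URI into AAD v2.0 token scopes via the '/.default' suffix.

    Illustrative stand-in for the private _resource_to_scopes helper; its real
    body is not part of this diff.
    """
    if not resource:
        return None
    return [resource.rstrip("/") + "/.default"]

# e.g. "https://management.azure.com" -> ["https://management.azure.com/.default"]
print(resource_to_scopes("https://management.azure.com"))
```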
```diff
@@ -8,7 +8,7 @@
 from pathlib import Path
 from datetime import datetime, timedelta
 import uuid
-from azure.ai.ml._azure_environments import ENDPOINT_URLS, _get_cloud_details
+from azure.ai.ml._azure_environments import _get_storage_endpoint_from_metadata
 
 from azure.storage.blob import generate_blob_sas, BlobSasPermissions
 from azure.storage.filedatalake import generate_file_sas, FileSasPermissions
@@ -68,8 +68,7 @@ def get_datastore_info(operations: DatastoreOperations, name: str) -> Dict[str,
     else:
         datastore = operations.get_default(include_secrets=True)
 
-    cloud_details = _get_cloud_details()
-    storage_endpoint = cloud_details.get(ENDPOINT_URLS.STORAGE_ENDPOINT)
+    storage_endpoint = _get_storage_endpoint_from_metadata()
     credentials = datastore.credentials
     datastore_info["storage_type"] = datastore.type
     datastore_info["storage_account"] = datastore.account_name
```
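Same refactor pattern here: `get_datastore_info` now asks `_get_storage_endpoint_from_metadata` for the storage suffix instead of reading the cloud-details dict inline. A minimal sketch of that lookup shape, assuming the public-cloud suffix as the fallback (the real helper's body is not in this diff):

```python
from typing import Dict

PUBLIC_CLOUD_STORAGE_SUFFIX = "core.windows.net"  # assumed public-cloud default

def storage_endpoint_from_cloud_metadata(cloud: Dict[str, str]) -> str:
    """Resolve the storage endpoint suffix for a cloud, e.g. to build
    https://<account>.blob.core.windows.net style URLs."""
    return cloud.get("storage_endpoint", PUBLIC_CLOUD_STORAGE_SUFFIX)

print(storage_endpoint_from_cloud_metadata({}))  # -> core.windows.net
```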
118 changes: 12 additions & 106 deletions sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_blob_storage_helper.py
```diff
@@ -6,35 +6,26 @@
 import logging
 import time
 import os
-import warnings
-from contextlib import suppress
-from typing import Optional, Dict, Any, List, TYPE_CHECKING
+from typing import Dict, List, TYPE_CHECKING
 from pathlib import PurePosixPath, Path
-from multiprocessing import cpu_count
 from colorama import Fore
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from tqdm import tqdm, TqdmWarning
-from platform import system
 import sys
 
-from azure.ai.ml._utils._exception_utils import EmptyDirectoryError
 from azure.storage.blob import BlobServiceClient, ContainerClient
 from azure.core.exceptions import ResourceNotFoundError
 from azure.ai.ml._utils._asset_utils import (
     generate_asset_id,
-    traverse_directory,
+    upload_directory,
+    upload_file,
     AssetNotChangedError,
     _build_metadata_dict,
     IgnoreFile,
-    FileUploadProgressBar,
     get_directory_size,
 )
 from azure.ai.ml._artifacts._constants import (
     UPLOAD_CONFIRMATION,
     ARTIFACT_ORIGIN,
     LEGACY_ARTIFACT_DIRECTORY,
-    EMPTY_DIRECTORY_ERROR,
-    PROCESSES_PER_CORE,
     MAX_CONCURRENCY,
     FILE_SIZE_WARNING,
     BLOB_DATASTORE_IS_HDI_FOLDER_KEY,
@@ -103,11 +94,18 @@ def upload(
 
         # start upload
         if os.path.isdir(source):
-            self.upload_dir(source, asset_id, msg, show_progress, ignore_file=ignore_file)
+            upload_directory(
+                storage_client=self,
+                source=source,
+                dest=asset_id,
+                msg=msg,
+                show_progress=show_progress,
+                ignore_file=ignore_file,
+            )
         else:
             self.indicator_file = dest
             self.check_blob_exists()
-            self.upload_file(source, dest, msg, show_progress)
+            upload_file(storage_client=self, source=source, dest=dest, msg=msg, show_progress=show_progress)
         print(Fore.RESET + "\n", file=sys.stderr)
 
         # upload must be completed before we try to generate confirmation file
@@ -124,98 +122,6 @@
 
         return artifact_info
 
-    def upload_file(
-        self,
-        source: str,
-        dest: str,
-        msg: Optional[str] = None,
-        show_progress: Optional[bool] = None,
-        in_directory: bool = False,
-        callback: Any = None,
-    ) -> None:
-        """
-        Upload a single file to a path inside the container
-        """
-        validate_content = os.stat(source).st_size > 0  # don't do checksum for empty files
-
-        with open(source, "rb") as data:
-            if show_progress and not in_directory:
-                file_size, _ = get_directory_size(source)
-                file_size_in_mb = file_size / 10**6
-                if file_size_in_mb < 1:
-                    msg += Fore.GREEN + " (< 1 MB)"
-                else:
-                    msg += Fore.GREEN + f" ({round(file_size_in_mb, 2)} MBs)"
-                cntx_manager = FileUploadProgressBar(msg=msg)
-            else:
-                cntx_manager = suppress()
-
-            with cntx_manager as c:
-                callback = c.update_to if (show_progress and not in_directory) else None
-                self.container_client.upload_blob(
-                    name=dest,
-                    data=data,
-                    validate_content=validate_content,
-                    overwrite=self.overwrite,
-                    raw_response_hook=callback,
-                    max_concurrency=MAX_CONCURRENCY,
-                )
-
-        self.uploaded_file_count += 1
-
-    def upload_dir(self, source: str, dest: str, msg: str, show_progress: bool, ignore_file: IgnoreFile) -> None:
-        """
-        Upload a directory to a path inside the container
-
-        Azure Blob doesn't allow metadata setting at the directory level, so the first
-        file in the directory is designated as the file where the confirmation metadata
-        will be added at the end of the upload.
-        """
-        source_path = Path(source).resolve()
-        prefix = "" if dest == "" else dest + "/"
-        prefix += os.path.basename(source_path) + "/"
-
-        # get all paths in directory and each file's size
-        upload_paths = []
-        size_dict = {}
-        total_size = 0
-        for root, _, files in os.walk(source_path):
-            upload_paths += list(traverse_directory(root, files, source_path, prefix, ignore_file=ignore_file))
-
-        for path, _ in upload_paths:
-            path_size = os.path.getsize(path)
-            size_dict[path] = path_size
-            total_size += path_size
-
-        upload_paths = sorted(upload_paths)
-        if len(upload_paths) == 0:
-            raise EmptyDirectoryError(
-                message=EMPTY_DIRECTORY_ERROR.format(source),
-                no_personal_data_message=msg.format("[source]"),
-                target=ErrorTarget.ARTIFACT,
-                error_category=ErrorCategory.USER_ERROR,
-            )
-
-        self.indicator_file = upload_paths[0][1]
-        self.check_blob_exists()
-        self.total_file_count = len(upload_paths)
-
-        # submit paths to workers for upload
-        num_cores = int(cpu_count()) * PROCESSES_PER_CORE
-        with ThreadPoolExecutor(max_workers=num_cores) as ex:
-            futures_dict = {
-                ex.submit(self.upload_file, src, dest, in_directory=True, show_progress=show_progress): (src, dest)
-                for (src, dest) in upload_paths
-            }
-            if show_progress:
-                warnings.simplefilter("ignore", category=TqdmWarning)
-                msg += f" ({round(total_size/10**6, 2)} MBs)"
-                ascii = system() == "Windows"  # Default unicode progress bar doesn't display well on Windows
-                with tqdm(total=total_size, desc=msg, ascii=ascii) as pbar:
-                    for future in as_completed(futures_dict):
-                        file_path_name = futures_dict[future][0]
-                        pbar.update(size_dict.get(file_path_name) or 0)
-
     def check_blob_exists(self) -> None:
         """
         Throw error if blob already exists.
```
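Most of this file's 106 deleted lines are the `upload_file`/`upload_dir` methods, which move out of the blob client into module-level `upload_file`/`upload_directory` helpers in `azure.ai.ml._utils._asset_utils` that receive the client via a `storage_client=` argument. That inversion lets the blob and Gen2 clients share one upload path. A sketch of the shape, using an illustrative Protocol rather than the real helper signatures:

```python
from typing import Protocol

class StorageClient(Protocol):
    """Surface shared by blob and Gen2 clients (illustrative, not the real type)."""
    overwrite: bool

    def upload_chunk(self, name: str, data: bytes) -> None:
        ...

def upload_file(storage_client: StorageClient, source: str, dest: str) -> None:
    """One upload code path that works for any client matching the Protocol."""
    with open(source, "rb") as f:
        storage_client.upload_chunk(name=dest, data=f.read())
```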
2 changes: 2 additions & 0 deletions sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_constants.py
```diff
@@ -30,3 +30,5 @@
     "{jsonSchemaErrorPath}{jsonSchemaMessage}\n{invalidMLTableMsg}:\n{invalidSchemaSnippet}"
 )
 BLOB_DATASTORE_IS_HDI_FOLDER_KEY = "hdi_isfolder"
+BLOB_STORAGE_CLIENT_NAME = "BlobStorageClient"
+GEN2_STORAGE_CLIENT_NAME = "Gen2StorageClient"
```
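The two new constants name the concrete storage-client classes. A plausible, but assumed, consumption pattern is class-name dispatch inside the shared upload helpers:

```python
BLOB_STORAGE_CLIENT_NAME = "BlobStorageClient"  # value from this diff
GEN2_STORAGE_CLIENT_NAME = "Gen2StorageClient"  # value from this diff

def is_blob_client(storage_client: object) -> bool:
    """Assumed dispatch helper: branch shared logic on the client's class name."""
    return type(storage_client).__name__ == BLOB_STORAGE_CLIENT_NAME
```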