feat: Python 3.12 (#2559)
Signed-off-by: Anton Kukushkin <[email protected]>
Co-authored-by: Leon Luttenberger <[email protected]>
kukushking and LeonLuttenberger authored Dec 20, 2023
1 parent 8e2a793 commit 3507fda
Showing 35 changed files with 2,755 additions and 2,639 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/minimal-tests.yml
@@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: [3.8]
+ python-version: ["3.8", "3.11", "3.12"]
platform: [ubuntu-latest, macos-latest, windows-latest]

env:
2 changes: 2 additions & 0 deletions README.md
@@ -101,6 +101,8 @@ AWS SDK for pandas can also run your workflows at scale by leveraging [Modin](ht

The quickest way to get started is to use AWS Glue with Ray. Read our [docs](https://aws-sdk-pandas.readthedocs.io/en/3.5.0/scale.html), our blogs ([1](https://aws.amazon.com/blogs/big-data/scale-aws-sdk-for-pandas-workloads-with-aws-glue-for-ray/)/[2](https://aws.amazon.com/blogs/big-data/advanced-patterns-with-aws-sdk-for-pandas-on-aws-glue-for-ray/)), or head to our latest [tutorials](https://github.com/aws/aws-sdk-pandas/tree/main/tutorials) to discover even more features.

+ > ⚠️ **Ray is currently not available for Python 3.12. While AWS SDK for pandas supports Python 3.12, it cannot be used at scale.**
## [Read The Docs](https://aws-sdk-pandas.readthedocs.io/)

- [**What is AWS SDK for pandas?**](https://aws-sdk-pandas.readthedocs.io/en/3.5.0/about.html)
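The Ray caveat added above is worth spelling out: at the time of this change Ray published no CPython 3.12 wheels, so only the single-node path works on that interpreter. A minimal, hedged sketch of the practical effect (illustrative commands, not part of this commit):

# awswrangler itself installs and runs single-node on 3.12
python3.12 -m pip install awswrangler
# ray has no 3.12 wheels yet, so the distributed path cannot be satisfied there
python3.12 -m pip install ray   # expected to fail to resolve
python3.11 -m pip install ray   # resolves normally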
2 changes: 1 addition & 1 deletion awswrangler/_config.py
@@ -16,7 +16,7 @@
_logger: logging.Logger = logging.getLogger(__name__)


- _ConfigValueType = Union[str, bool, int, float, botocore.config.Config, dict]
+ _ConfigValueType = Union[str, bool, int, float, botocore.config.Config, Dict[Any, Any]]


class _ConfigArg(NamedTuple):
4 changes: 2 additions & 2 deletions awswrangler/_databases.py
@@ -160,7 +160,7 @@ def _records2df(
for col_values, col_name in zip(tuple(zip(*records)), cols_names): # Transposing
if (dtype is None) or (col_name not in dtype):
if _oracledb_found:
- col_values = oracle.handle_oracle_objects(col_values, col_name) # ruff: noqa: PLW2901
+ col_values = oracle.handle_oracle_objects(col_values, col_name) # type: ignore[arg-type,assignment] # noqa: PLW2901
try:
array: pa.Array = pa.array(obj=col_values, safe=safe) # Creating Arrow array
except pa.ArrowInvalid as ex:
@@ -169,7 +169,7 @@
try:
if _oracledb_found:
if _should_handle_oracle_objects(dtype[col_name]):
- col_values = oracle.handle_oracle_objects(col_values, col_name, dtype)
+ col_values = oracle.handle_oracle_objects(col_values, col_name, dtype) # type: ignore[arg-type,assignment] # noqa: PLW2901
array = pa.array(obj=col_values, type=dtype[col_name], safe=safe) # Creating Arrow array with dtype
except (pa.ArrowInvalid, pa.ArrowTypeError):
array = pa.array(obj=col_values, safe=safe) # Creating Arrow array
6 changes: 3 additions & 3 deletions awswrangler/_utils.py
@@ -165,7 +165,7 @@ def decorator(func: FunctionType) -> FunctionType:

@wraps(func)
def inner(*args: Any, **kwargs: Any) -> Any:
- passed_unsupported_kwargs = set(unsupported_kwargs).intersection( # type: ignore
+ passed_unsupported_kwargs = set(unsupported_kwargs).intersection(
set([key for key, value in kwargs.items() if value is not None])
)

@@ -620,7 +620,7 @@ def ensure_cpu_count(use_threads: Union[bool, int] = True) -> int:
1
"""
- if type(use_threads) == int: # pylint: disable=unidiomatic-typecheck
+ if type(use_threads) == int: # pylint: disable=unidiomatic-typecheck # noqa: E721
if use_threads < 1:
return 1
return use_threads
@@ -736,7 +736,7 @@ def get_credentials_from_session(
) -> botocore.credentials.ReadOnlyCredentials:
"""Get AWS credentials from boto3 session."""
session: boto3.Session = ensure_session(session=boto3_session)
- credentials: botocore.credentials.Credentials = session.get_credentials()
+ credentials: botocore.credentials.Credentials = session.get_credentials() # type: ignore[assignment]
frozen_credentials: botocore.credentials.ReadOnlyCredentials = credentials.get_frozen_credentials()
return frozen_credentials

2 changes: 1 addition & 1 deletion awswrangler/athena/_cache.py
@@ -50,7 +50,7 @@ def update_cache(self, items: List[Dict[str, Any]]) -> None:
if oldest_item:
items = list(
filter(
- lambda x: x["Status"]["SubmissionDateTime"] > oldest_item["Status"]["SubmissionDateTime"], # type: ignore[arg-type]
+ lambda x: x["Status"]["SubmissionDateTime"] > oldest_item["Status"]["SubmissionDateTime"],
items,
)
)
2 changes: 1 addition & 1 deletion awswrangler/athena/_read.py
@@ -74,7 +74,7 @@ def _add_query_metadata_generator(
) -> Iterator[pd.DataFrame]:
"""Add Query Execution metadata to every DF in iterator."""
for df in dfs:
- df = _apply_query_metadata(df=df, query_metadata=query_metadata) # ruff: noqa: PLW2901
+ df = _apply_query_metadata(df=df, query_metadata=query_metadata) # noqa: PLW2901
yield df


2 changes: 1 addition & 1 deletion awswrangler/athena/_utils.py
@@ -197,7 +197,7 @@ def _parse_describe_table(df: pd.DataFrame) -> pd.DataFrame:
origin_df_dict = df.to_dict()
target_df_dict: Dict[str, List[Union[str, bool]]] = {"Column Name": [], "Type": [], "Partition": [], "Comment": []}
for index, col_name in origin_df_dict["col_name"].items():
- col_name = col_name.strip() # ruff: noqa: PLW2901
+ col_name = col_name.strip() # noqa: PLW2901
if col_name.startswith("#") or not col_name:
pass
elif col_name in target_df_dict["Column Name"]:
1 change: 1 addition & 0 deletions awswrangler/cleanrooms/_utils.py
@@ -29,6 +29,7 @@ def wait_query(
Protected query execution ID
boto3_session : boto3.Session, optional
Boto3 Session. If None, the default boto3 session is used
+
Returns
-------
Dict[str, Any]
6 changes: 3 additions & 3 deletions awswrangler/data_api/rds.py
@@ -165,7 +165,7 @@ def _execute_statement(
def function(sql: str) -> "ExecuteStatementResponseTypeDef":
return self.client.execute_statement(
resourceArn=self.resource_arn,
- database=database, # type: ignore[arg-type]
+ database=database,
sql=sql,
secretArn=self.secret_arn,
includeResultMetadata=True,
@@ -196,7 +196,7 @@ def _batch_execute_statement(
def function(sql: str) -> "BatchExecuteStatementResponseTypeDef":
return self.client.batch_execute_statement(
resourceArn=self.resource_arn,
- database=database, # type: ignore[arg-type]
+ database=database,
sql=sql,
secretArn=self.secret_arn,
**additional_kwargs,
@@ -363,7 +363,7 @@ def _generate_parameters(columns: List[str], values: List[Any]) -> List[Dict[str
parameter_list = []

for col, value in zip(columns, values):
- value, type_hint = _create_value_dict(value) # ruff: noqa: PLW2901
+ value, type_hint = _create_value_dict(value) # noqa: PLW2901

parameter = {
"name": col,
2 changes: 1 addition & 1 deletion awswrangler/distributed/ray/_utils.py
@@ -53,7 +53,7 @@ def _estimate_available_parallelism() -> int:


def ensure_worker_count(use_threads: Union[bool, int] = True) -> int:
- if type(use_threads) == int: # pylint: disable=unidiomatic-typecheck
+ if type(use_threads) == int: # pylint: disable=unidiomatic-typecheck # noqa: E721
if use_threads < 1:
return 1
return use_threads
@@ -119,7 +119,7 @@ def _get_file_suffix(self, file_format: str, compression: Optional[str]) -> str:
# raw pyarrow file fragment causes S3 network calls.
class _SerializedPiece:
def __init__(self, frag: ParquetFileFragment):
- self._data = cloudpickle.dumps( # type: ignore[attr-defined]
+ self._data = cloudpickle.dumps( # type: ignore[attr-defined,no-untyped-call]
(frag.format, frag.path, frag.filesystem, frag.partition_expression)
)

2 changes: 1 addition & 1 deletion awswrangler/distributed/ray/modin/_core.py
@@ -27,7 +27,7 @@ def _validate_partition_shape(df: pd.DataFrame) -> bool:
"""
# Unwrap partitions as they are currently stored (axis=None)
partitions_shape = np.array(unwrap_partitions(df)).shape
- return partitions_shape[1] == 1
+ return partitions_shape[1] == 1 # type: ignore[no-any-return,unused-ignore]


FunctionType = TypeVar("FunctionType", bound=Callable[..., Any])
4 changes: 2 additions & 2 deletions awswrangler/dynamodb/_utils.py
@@ -19,7 +19,7 @@
from mypy_boto3_dynamodb.type_defs import (
AttributeValueTypeDef,
ExecuteStatementOutputTypeDef,
- KeySchemaElementTableTypeDef,
+ KeySchemaElementTypeDef,
WriteRequestTypeDef,
)

@@ -180,7 +180,7 @@ def execute_statement(


def _validate_items(
- items: Union[List[Dict[str, Any]], List[Mapping[str, Any]]], key_schema: List["KeySchemaElementTableTypeDef"]
+ items: Union[List[Dict[str, Any]], List[Mapping[str, Any]]], key_schema: List["KeySchemaElementTypeDef"]
) -> None:
"""
Validate if all items have the required keys for the Amazon DynamoDB table.
6 changes: 3 additions & 3 deletions awswrangler/dynamodb/_write.py
@@ -20,7 +20,7 @@

if TYPE_CHECKING:
from mypy_boto3_dynamodb.client import DynamoDBClient
- from mypy_boto3_dynamodb.type_defs import KeySchemaElementTableTypeDef
+ from mypy_boto3_dynamodb.type_defs import KeySchemaElementTypeDef


_logger: logging.Logger = logging.getLogger(__name__)
@@ -139,7 +139,7 @@ def _put_df(
dynamodb_client: Optional["DynamoDBClient"],
df: pd.DataFrame,
table_name: str,
- key_schema: List["KeySchemaElementTableTypeDef"],
+ key_schema: List["KeySchemaElementTypeDef"],
) -> None:
items: List[Mapping[str, Any]] = [v.dropna().to_dict() for _, v in df.iterrows()]

@@ -214,7 +214,7 @@ def _put_items(
dynamodb_client: Optional["DynamoDBClient"],
items: Union[List[Dict[str, Any]], List[Mapping[str, Any]]],
table_name: str,
- key_schema: List["KeySchemaElementTableTypeDef"],
+ key_schema: List["KeySchemaElementTypeDef"],
) -> None:
_logger.debug("Inserting %d items", len(items))
_validate_items(items=items, key_schema=key_schema)
1 change: 1 addition & 0 deletions awswrangler/emr.py
@@ -663,6 +663,7 @@ def create_cluster( # pylint: disable=too-many-arguments,too-many-locals,unused
By default, adds log4j config as follows:
`{"Classification": "spark-log4j", "Properties": {"log4j.rootCategory": f"{pars['spark_log_level']}, console"}}`
Returns
-------
str
3 changes: 2 additions & 1 deletion awswrangler/neptune/_client.py
@@ -8,6 +8,7 @@
import boto3
from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSPreparedRequest, AWSRequest
+ from botocore.credentials import Credentials
from typing_extensions import Literal, NotRequired

import awswrangler.neptune._gremlin_init as gremlin
@@ -126,7 +127,7 @@ def _get_aws_request(
) -> Union[AWSRequest, AWSPreparedRequest]:
req = AWSRequest(method=method, url=url, data=data, params=params, headers=headers)
if self.iam_enabled:
- credentials = self.boto3_session.get_credentials()
+ credentials: Credentials = self.boto3_session.get_credentials() # type: ignore[assignment]
try:
frozen_creds = credentials.get_frozen_credentials()
except AttributeError:
2 changes: 1 addition & 1 deletion awswrangler/neptune/_gremlin_parser.py
@@ -68,7 +68,7 @@ def _parse_dict(data: Any) -> Any:
for k, v in data.items():
# If the key is a Vertex or an Edge do special processing
if isinstance(k, (gremlin.Vertex, gremlin.Edge)):
- k = k.id # ruff: noqa: PLW2901
+ k = k.id # noqa: PLW2901

# If the value is a list do special processing to make it a scalar if the list is of length 1
if isinstance(v, list) and len(v) == 1:
2 changes: 1 addition & 1 deletion awswrangler/oracle.py
@@ -603,7 +603,7 @@ def to_sql(
df=df, column_placeholders=column_placeholders, chunksize=chunksize
)
for _, parameters in placeholder_parameter_pair_generator:
- parameters = list(zip(*[iter(parameters)] * len(df.columns))) # ruff: noqa: PLW2901
+ parameters = list(zip(*[iter(parameters)] * len(df.columns))) # noqa: PLW2901
_logger.debug("sql: %s", sql)
cursor.executemany(sql, parameters)

2 changes: 1 addition & 1 deletion awswrangler/s3/_copy.py
@@ -39,7 +39,7 @@ def _copy_objects(
CopySource=copy_source,
Bucket=target_bucket,
Key=target_key,
- ExtraArgs=s3_additional_kwargs, # type: ignore[arg-type]
+ ExtraArgs=s3_additional_kwargs,
Config=TransferConfig(num_download_attempts=10, use_threads=use_threads), # type: ignore[arg-type]
)

2 changes: 1 addition & 1 deletion awswrangler/s3/_read.py
@@ -325,7 +325,7 @@ def _ensure_locations_are_valid(paths: Iterable[str]) -> Iterator[str]:
# If the suffix looks like a partition,
if suffix and (suffix.count("=") == 1):
# the path should end in a '/' character.
path = f"{path}/" # ruff: noqa: PLW2901
path = f"{path}/" # noqa: PLW2901
yield path


4 changes: 2 additions & 2 deletions awswrangler/s3/_select.py
@@ -56,8 +56,8 @@ def _select_object_content(
for event in response["Payload"]:
if "Records" in event:
records = (
event["Records"]["Payload"] # type: ignore[index]
.decode( # type: ignore[attr-defined]
event["Records"]["Payload"]
.decode(
encoding="utf-8",
errors="ignore",
)
8 changes: 5 additions & 3 deletions awswrangler/s3/_write_dataset.py
@@ -149,14 +149,16 @@ def _to_partitions(
s3_client = client(service_name="s3", session=boto3_session)
for keys, subgroup in df.groupby(by=partition_cols, observed=True):
# Keys are either a primitive type or a tuple if partitioning by multiple cols
- keys = (keys,) if not isinstance(keys, tuple) else keys # ruff: noqa: PLW2901
+ keys = (keys,) if not isinstance(keys, tuple) else keys # noqa: PLW2901
# Drop partition columns from df
subgroup.drop(
columns=[col for col in partition_cols if col in subgroup.columns],
inplace=True,
- ) # ruff: noqa: PLW2901
+ ) # noqa: PLW2901
# Drop index levels if partitioning by index columns
- subgroup = subgroup.droplevel(level=[col for col in partition_cols if col in subgroup.index.names])
+ subgroup = subgroup.droplevel( # noqa: PLW2901
+ level=[col for col in partition_cols if col in subgroup.index.names]
+ )
prefix = _delete_objects(
keys=keys,
path_root=path_root,
82 changes: 56 additions & 26 deletions building/build-lambda-layers.sh
@@ -4,44 +4,74 @@ set -ex
VERSION=$(poetry version --short)
DIR_NAME=$(dirname "$PWD")

+ PYTHON_VERSION=${1:-ALL}

ARCH=$(arch)
[ "${ARCH}" = "aarch64" ] && ARCH_SUFFIX="-arm64" # AWS Lambda, the name arm64 is used instead of aarch64

echo "Building Lambda Layers for AWS SDK for pandas ${VERSION}"
if [[ $PYTHON_VERSION == "ALL" ]]
then
echo "Building Lambda Layers for AWS SDK for pandas ${VERSION} (ALL supported Python versions)"
else
echo "Building Lambda Layers for AWS SDK for pandas ${VERSION} (ONLY Python $PYTHON_VERSION)"
fi

pushd lambda

# Building all related docker images
- ./build-docker-images.sh
+ ./build-docker-images.sh $PYTHON_VERSION

# Python 3.8
- docker run \
- --volume "$DIR_NAME":/aws-sdk-pandas/ \
- --workdir /aws-sdk-pandas/building/lambda \
- --rm \
- awswrangler-build-py38 \
- build-lambda-layer.sh "${VERSION}-py3.8${ARCH_SUFFIX}" "ninja-build"
+ if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.8" ]]
+ then
+ docker run \
+ --volume "$DIR_NAME":/aws-sdk-pandas/ \
+ --workdir /aws-sdk-pandas/building/lambda \
+ --rm \
+ awswrangler-build-py38 \
+ build-lambda-layer.sh "${VERSION}-py3.8${ARCH_SUFFIX}" "ninja-build"
+ fi

# Python 3.9
- docker run \
- --volume "$DIR_NAME":/aws-sdk-pandas/ \
- --workdir /aws-sdk-pandas/building/lambda \
- --rm \
- awswrangler-build-py39 \
- build-lambda-layer.sh "${VERSION}-py3.9${ARCH_SUFFIX}" "ninja-build"
+ if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.9" ]]
+ then
+ docker run \
+ --volume "$DIR_NAME":/aws-sdk-pandas/ \
+ --workdir /aws-sdk-pandas/building/lambda \
+ --rm \
+ awswrangler-build-py39 \
+ build-lambda-layer.sh "${VERSION}-py3.9${ARCH_SUFFIX}" "ninja-build"
+ fi

# Python 3.10
- docker run \
- --volume "$DIR_NAME":/aws-sdk-pandas/ \
- --workdir /aws-sdk-pandas/building/lambda \
- --rm \
- awswrangler-build-py310 \
- build-lambda-layer.sh "${VERSION}-py3.10${ARCH_SUFFIX}" "ninja-build"
+ if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.10" ]]
+ then
+ docker run \
+ --volume "$DIR_NAME":/aws-sdk-pandas/ \
+ --workdir /aws-sdk-pandas/building/lambda \
+ --rm \
+ awswrangler-build-py310 \
+ build-lambda-layer.sh "${VERSION}-py3.10${ARCH_SUFFIX}" "ninja-build"
+ fi

# Python 3.11
- docker run \
- --volume "$DIR_NAME":/aws-sdk-pandas/ \
- --workdir /aws-sdk-pandas/building/lambda \
- --rm \
- awswrangler-build-py311 \
- build-lambda-layer.sh "${VERSION}-py3.11${ARCH_SUFFIX}" "ninja-build"
+ if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.11" ]]
+ then
+ docker run \
+ --volume "$DIR_NAME":/aws-sdk-pandas/ \
+ --workdir /aws-sdk-pandas/building/lambda \
+ --rm \
+ awswrangler-build-py311 \
+ build-lambda-layer.sh "${VERSION}-py3.11${ARCH_SUFFIX}" "ninja-build"
+ fi

+ # Python 3.12
+ if [[ $PYTHON_VERSION == "ALL" || $PYTHON_VERSION == "3.12" ]]
+ then
+ docker run \
+ --volume "$DIR_NAME":/aws-sdk-pandas/ \
+ --workdir /aws-sdk-pandas/building/lambda \
+ --rm \
+ awswrangler-build-py312 \
+ build-lambda-layer.sh "${VERSION}-py3.12${ARCH_SUFFIX}" "ninja-build"
+ fi
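The new optional positional argument lets the layers be built for a single interpreter instead of all of them. A brief usage sketch based on the script above (assuming build-docker-images.sh honours the same optional version argument, as the call near the top of the script suggests):

# default: PYTHON_VERSION=ALL, builds every supported layer
./build-lambda-layers.sh
# build only the Python 3.12 layer (the docker image build is narrowed the same way)
./build-lambda-layers.sh 3.12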
