
Refactor S3, replace high-level resource/session API with low-level client API #583

Merged 20 commits on Mar 1, 2021
72 changes: 72 additions & 0 deletions MIGRATING_FROM_OLDER_VERSIONS.rst
@@ -1,3 +1,75 @@
Migrating to the new client-based S3 API
========================================

Versions of smart_open prior to 5.0.0 used the boto3 `resource API`_ for communicating with S3.
This API was easy for the smart_open developers to integrate, but it came at a cost: it was not thread- or multiprocess-safe.
Furthermore, as smart_open supported more and more options, the transport parameter list grew, making it less maintainable.
Starting with version 5.0.0, smart_open uses the `client API`_ instead of the resource API.
Functionally, very little changes for the smart_open user.
The only difference is in passing transport parameters to the S3 backend.

More specifically, the following S3 transport parameters are no longer supported:

- `multipart_upload_kwargs`
- `object_kwargs`
- `resource`
- `resource_kwargs`
- `session`
- `singlepart_upload_kwargs`

If you weren't using the above parameters, nothing changes for you.
However, if you were using any of the above, then you need to adjust your code.
Here are some quick recipes.

If you were previously passing `session`, then construct an S3 client from the session and pass that instead.
For example, before::

smart_open.open('s3://bucket/key', transport_params={'session': session})

After::

smart_open.open('s3://bucket/key', transport_params={'client': session.client('s3')})


If you were passing `resource`, then replace the resource with a client, and pass that instead.
For example, before::

resource = session.resource('s3', **resource_kwargs)
smart_open.open('s3://bucket/key', transport_params={'resource': resource})

After::

client = session.client('s3')
smart_open.open('s3://bucket/key', transport_params={'client': client})

If you were passing any of the `*_kwargs` parameters, you will need to include them in `client_kwargs`, keeping in mind the following transformations.

========================== ======================================= ==========================
Parameter name             Resource API method                     Client API function
========================== ======================================= ==========================
`multipart_upload_kwargs`  `s3.Object.initiate_multipart_upload`_  `create_multipart_upload`_
`object_kwargs`            `s3.Object.get`_                        `get_object`_
`resource_kwargs`          `boto3.Session.resource`                `S3.Client` (initializer)
`singlepart_upload_kwargs` `s3.Object.put`_                        `put_object`_
========================== ======================================= ==========================

The `client_kwargs` dict can thus contain the following members:

- `S3.Client`: initializer parameters, e.g. those to pass directly to the `boto3.client` function
- `S3.Client.create_multipart_upload`
- `S3.Client.get_object`
- `S3.Client.put_object`
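
For example, a pre-5.0 call that passed `multipart_upload_kwargs` (the `ServerSideEncryption` value here is purely illustrative, not required)::

    smart_open.open(
        's3://bucket/key', 'wb',
        transport_params={'multipart_upload_kwargs': {'ServerSideEncryption': 'AES256'}},
    )

becomes::

    smart_open.open(
        's3://bucket/key', 'wb',
        transport_params={
            'client_kwargs': {
                'S3.Client.create_multipart_upload': {'ServerSideEncryption': 'AES256'},
            },
        },
    )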

.. _resource API: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#service-resource
.. _s3.Object.initiate_multipart_upload: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.initiate_multipart_upload
.. _s3.Object.get: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.get
.. _s3.Object.put: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Object.put

.. _client API: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#client
.. _create_multipart_upload: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.create_multipart_upload
.. _get_object: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object
.. _put_object: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.put_object

Migrating to the new dependency management subsystem
====================================================

26 changes: 12 additions & 14 deletions README.rst
@@ -154,7 +154,7 @@ For the sake of simplicity, the examples below assume you have all the dependenc
... aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
... )
>>> url = 's3://smart-open-py37-benchmark-results/test.txt'
>>> with open(url, 'wb', transport_params={'client': session.client('s3')}) as fout:
... bytes_written = fout.write(b'hello world!')
... print(bytes_written)
12
@@ -182,12 +182,9 @@ For the sake of simplicity, the examples below assume you have all the dependenc
print(line)

# Stream to Digital Ocean Spaces bucket providing credentials from boto3 profile
session = boto3.Session(profile_name='digitalocean')
client = session.client('s3', endpoint_url='https://ams3.digitaloceanspaces.com')
transport_params = {'client': client}
with open('s3://bucket/key.txt', 'wb', transport_params=transport_params) as fout:
fout.write(b'here we stand')

@@ -202,15 +199,15 @@ For the sake of simplicity, the examples below assume you have all the dependenc
# stream from Azure Blob Storage
connect_str = os.environ['AZURE_STORAGE_CONNECTION_STRING']
transport_params = {
'client': azure.storage.blob.BlobServiceClient.from_connection_string(connect_str),
}
for line in open('azure://mycontainer/myfile.txt', transport_params=transport_params):
print(line)

# stream content *into* Azure Blob Storage (write mode):
connect_str = os.environ['AZURE_STORAGE_CONNECTION_STRING']
transport_params = {
'client': azure.storage.blob.BlobServiceClient.from_connection_string(connect_str),
}
with open('azure://mycontainer/my_file.txt', 'wb', transport_params=transport_params) as fout:
fout.write(b'hello world')
@@ -264,7 +261,7 @@ Here are some examples of using this parameter:
.. code-block:: python

>>> import boto3
>>> fin = open('s3://commoncrawl/robots.txt', transport_params=dict(client=boto3.client('s3')))
>>> fin = open('s3://commoncrawl/robots.txt', transport_params=dict(buffer_size=1024))

For the full list of keyword arguments supported by each transport option, see the documentation:
@@ -281,8 +278,8 @@ S3 Credentials
By default, ``smart_open`` will defer to ``boto3`` and let the latter take care of the credentials.
There are several ways to override this behavior.

The first is to pass a ``boto3.Client`` object as a transport parameter to the ``open`` function.
You can customize the credentials when constructing the session for the client.
``smart_open`` will then use the client when talking to S3.

.. code-block:: python
@@ -292,15 +289,16 @@ You can customize the credentials when constructing the session.
aws_secret_access_key=SECRET_KEY,
aws_session_token=SESSION_TOKEN,
)
client = session.client('s3', endpoint_url=..., config=...)
fin = open('s3://bucket/key', transport_params=dict(client=client))

Your second option is to specify the credentials within the S3 URL itself:

.. code-block:: python

fin = open('s3://aws_access_key_id:aws_secret_access_key@bucket/key', ...)

*Important*: The two methods above are **mutually exclusive**. If you pass an AWS client *and* the URL contains credentials, ``smart_open`` will ignore the latter.

*Important*: ``smart_open`` ignores configuration files from the older ``boto`` library.
Port your old ``boto`` settings to ``boto3`` in order to use them with ``smart_open``.
25 changes: 9 additions & 16 deletions help.txt
@@ -137,17 +137,6 @@ FUNCTIONS
The buffer size to use when performing I/O.
min_part_size: int, optional
The minimum part size for multipart uploads. For writing only.
multipart_upload: bool, optional
Default: `True`
If set to `True`, will use multipart upload for writing to S3. If set
@@ -157,14 +146,18 @@ FUNCTIONS
version_id: str, optional
Version of the object, used when reading object.
If None, will fetch the most recent version.
defer_seek: boolean, optional
Default: `False`
If set to `True` on a file opened for reading, GetObject will not be
called until the first seek() or read().
Avoids redundant API queries when seeking before reading.
client: object, optional
The S3 client to use when working with boto3.
If you don't specify this, then smart_open will create a new client for you.
client_kwargs: dict, optional
Additional parameters to pass to the relevant functions of the client.
The keys are fully qualified method names, e.g. `S3.Client.create_multipart_upload`.
The values are kwargs to pass to that method each time it is called.
writebuffer: IO[bytes], optional
By default, this module will buffer data in memory using io.BytesIO
when writing. Pass another binary IO instance here to use it instead.
@@ -325,13 +318,13 @@ FUNCTIONS
s3_iter_bucket(bucket_name, prefix='', accept_key=None, key_limit=None, workers=16, retries=3, **session_kwargs)
Deprecated. Use smart_open.s3.iter_bucket instead.

smart_open(uri, mode='rb', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None, ignore_extension=False, **kwargs)

DATA
__all__ = ['open', 'parse_uri', 'register_compressor', 's3_iter_bucket...

VERSION
4.1.2.dev0

FILE
/Users/misha/git/smart_open/smart_open/__init__.py
97 changes: 67 additions & 30 deletions howto.md
@@ -73,11 +73,12 @@ The `boto3` library that `smart_open` uses for accessing S3 signs each request u
If you'd like to access S3 without using an S3 account, then you need to disable this signing mechanism.

```python
>>> import boto3
>>> import botocore
>>> import botocore.client
>>> from smart_open import open
>>> config = botocore.client.Config(signature_version=botocore.UNSIGNED)
>>> params = {'client': boto3.client('s3', config=config)}
>>> with open('s3://commoncrawl/robots.txt', transport_params=params) as fin:
... fin.readline()
'User-Agent: *\n'
@@ -175,15 +176,15 @@ s3.ObjectVersion(bucket_name='smart-open-versioned', object_key='demo.txt', id='

## How to Read from S3 Efficiently

Under the covers, `smart_open` uses the [boto3 client API](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#client) to read from S3.
By default, calling `smart_open.open` with an S3 URL will create its own boto3 client.
This is an expensive operation: it requires both CPU time to construct the object from a low-level API definition, and memory to store the object once it has been created.
It is possible to save both CPU time and memory by sharing the same client across multiple `smart_open.open` calls, for example:

```python
>>> import boto3
>>> from smart_open import open
>>> tp = {'client': boto3.client('s3')}
>>> for month in (1, 2, 3):
... url = 's3://nyc-tlc/trip data/yellow_tripdata_2020-%02d.csv' % month
... with open(url, transport_params=tp) as fin:
@@ -195,7 +196,7 @@ It is possible to save both CPU time and memory by sharing the same resource acr

```

Clients are thread-safe and multiprocess-safe, so you may share them across threads and subprocesses.
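
A minimal sketch of such sharing (the URL and worker count here are illustrative):

```python
>>> import boto3
>>> from concurrent.futures import ThreadPoolExecutor
>>> from smart_open import open
>>> tp = {'client': boto3.client('s3')}  # one client, shared by all workers
>>> def first_line(url):
...     with open(url, transport_params=tp) as fin:
...         return fin.readline()
>>> with ThreadPoolExecutor(max_workers=4) as pool:
...     lines = list(pool.map(first_line, ['s3://commoncrawl/robots.txt'] * 4))

```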

By default, `smart_open` buffers the most recent part of a multipart upload in memory.
The default part size is 50MB.
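
If the default does not suit you, the part size can be changed via the `min_part_size` transport parameter described in the built-in help (a sketch; the 100MB figure is illustrative):

```python
>>> from smart_open import open
>>> tp = {'min_part_size': 100 * 1024 * 1024}  # buffer parts of at least 100MB
>>> with open('s3://bucket/key', 'wb', transport_params=tp) as fout:
...     fout.write(b'hello world!')

```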
@@ -226,14 +227,6 @@ with tempfile.NamedTemporaryFile() as tmp:

This option reduces memory usage at the expense of additional disk I/O (writing to and reading from a hard disk is slower).
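
For reference, a minimal sketch of that approach, assuming the `writebuffer` transport parameter described in the built-in help:

```python
>>> import tempfile
>>> from smart_open import open
>>> with tempfile.NamedTemporaryFile() as tmp:
...     tp = {'writebuffer': tmp}  # buffer upload parts on disk instead of in memory
...     with open('s3://bucket/key', 'wb', transport_params=tp) as fout:
...         fout.write(b'hello world!')

```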

## How to Specify the Request Payer (S3 only)

Some public buckets require you to [pay for S3 requests for the data in the bucket](https://docs.aws.amazon.com/AmazonS3/latest/dev/RequesterPaysBuckets.html).
@@ -243,7 +236,7 @@ To access such buckets, you need to pass some special transport parameters:

```python
>>> from smart_open import open
>>> params = {'client_kwargs': {'S3.Client.get_object': {'RequestPayer': 'requester'}}}
>>> with open('s3://arxiv/pdf/arXiv_pdf_manifest.xml', transport_params=params) as fin:
... print(fin.readline())
<?xml version='1.0' standalone='yes'?>
@@ -258,41 +251,83 @@ This works only when reading and writing via S3.
Boto3 has a [built-in mechanism](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html) for retrying after a recoverable error.
There are several ways to fine-tune it:

### Pre-configuring a boto3 client and then passing the client to smart_open

```python
>>> import boto3
>>> import botocore.config
>>> import smart_open
>>> config = botocore.config.Config(retries={'mode': 'standard'})
>>> client = boto3.client('s3', config=config)
>>> tp = {'client': client}
>>> with smart_open.open('s3://commoncrawl/robots.txt', transport_params=tp) as fin:
... print(fin.readline())
User-Agent: *
```

To verify your settings have effect:

```python
import logging
logging.getLogger('smart_open.s3').setLevel(logging.DEBUG)
```

and check the log output of your code.

## How to Pass Additional Parameters to boto3

`boto3` is a highly configurable library, and each function call accepts many optional parameters.
`smart_open` does not attempt to replicate this behavior, since most of these parameters do not influence the behavior of `smart_open` itself.
Instead, `smart_open` allows the caller to pass additional parameters as necessary:

```python
>>> import smart_open
>>> params = {'client_kwargs': {'S3.Client.get_object': {'RequestPayer': 'requester'}}}
>>> with smart_open.open('s3://arxiv/pdf/arXiv_pdf_manifest.xml', transport_params=params) as fin:
...     pass
```

The above example influences how the [S3.Client.get_object function](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object) gets called by `smart_open` when reading the specified URL.
More specifically, the `RequestPayer` parameter will be set to `requester` **for each call**.
Influential functions include the following (a sketch using the initializer key follows the list):

- S3.Client (the initializer function)
- S3.Client.abort_multipart_upload
- S3.Client.complete_multipart_upload
- S3.Client.create_multipart_upload
- S3.Client.get_object
- S3.Client.head_bucket
- S3.Client.put_object
- S3.Client.upload_part
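
For instance, the `S3.Client` key carries parameters for the client initializer itself, so `smart_open` can build its own client against a custom endpoint (a sketch; the localhost endpoint is illustrative):

```python
>>> import smart_open
>>> tp = {'client_kwargs': {'S3.Client': {'endpoint_url': 'http://localhost:4566'}}}
>>> with smart_open.open('s3://bucket/key', 'wb', transport_params=tp) as fout:
...     fout.write(b'hello world!')

```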

If you choose to pass additional parameters, keep the following in mind:

1. Study the [boto3 client API](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client) and ensure the function and parameters are valid.
2. Study the [code for the smart_open.s3 submodule](smart_open/s3.py) and ensure `smart_open` is actually calling the function you're passing additional parameters for.

Finally, in some cases, it's possible to work directly with `boto3` without going through `smart_open`.
For example, setting the ACL for an object is possible after the object is created (with `boto3`), as opposed to at creation time (with `smart_open`).
More specifically, here's the direct method:

```python
import boto3
import smart_open

with smart_open.open('s3://bucket/key', 'wb') as fout:
    fout.write(b'hello world!')

# put_object_acl needs to know which object to modify
client = boto3.client('s3')
client.put_object_acl(ACL=acl_as_string, Bucket='bucket', Key='key')
```

Here's the same code that passes the above parameter via `smart_open`:

```python
import smart_open
tp = {'client_kwargs': {'S3.Client.create_multipart_upload': {'ACL': acl_as_string}}}
with smart_open.open('s3://bucket/key', 'wb', transport_params=tp) as fout:
    fout.write(b'hello world!')
```

If passing everything via `smart_open` feels awkward, try passing part of the parameters directly to `boto3`.

## How to Read/Write from localstack

@@ -315,8 +350,10 @@ where `http://localhost:4566` is the default host/port that localstack uses to l
You can now read/write to the bucket the same way you would to a real S3 bucket:

```python
>>> import boto3
>>> from smart_open import open
>>> client = boto3.client('s3', endpoint_url='http://localhost:4566')
>>> tparams = {'client': client}
>>> with open('s3://mybucket/hello.txt', 'wt', transport_params=tparams) as fout:
... fout.write('hello world!')
>>> with open('s3://mybucket/hello.txt', 'rt', transport_params=tparams) as fin: