Skip to content

Commit

Permalink
Implement to_boto3 function for S3 I/O. (#405)
Browse files Browse the repository at this point in the history
* implement to_boto3 method for S3 I/O

* add doctest to travis.yml

* Revert "add doctest to travis.yml"

This reverts commit 7a37f32.

* fixup

* Update smart_open/s3.py

Co-Authored-By: Radim Řehůřek <[email protected]>

* Update smart_open/s3.py

Co-Authored-By: Radim Řehůřek <[email protected]>

Co-authored-by: Radim Řehůřek <[email protected]>
  • Loading branch information
mpenkov and piskvorky authored Jan 9, 2020
1 parent acd4581 commit a621aeb
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 6 deletions.
33 changes: 33 additions & 0 deletions howto.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,36 @@ Writing example:
>>> os.unlink(tmp.name) # comment this line to keep the file for later

```

## How to Access S3 Object Properties

When working with AWS S3, you may want to look beyond the abstraction
provided by `smart_open` and communicate with `boto3` directly in order to
satisfy your use case.

For example:

- Access the object's properties, such as the content type, timestamp of the last change, etc.
- Access version information for the object (versioned buckets only)
- Copy the object to another location
- Apply an ACL to the object
- and anything else specified in the [boto3 S3 Object API](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#object).

To enable such use cases, the file-like objects returned by `smart_open` have a special `to_boto3` method.
This returns a `boto3.s3.Object` that you can work with directly.
For example, let's get the content type of a publicly available file:

```python
>>> from smart_open import open
>>> with open('s3://commoncrawl/robots.txt') as fin:
...     print(fin.readline().rstrip())
...     boto3_s3_object = fin.to_boto3()
...     print(repr(boto3_s3_object))
...     print(boto3_s3_object.content_type)  # Using the boto3 API here
User-Agent: *
s3.Object(bucket_name='commoncrawl', key='robots.txt')
text/plain

```

The `to_boto3` method is available only on file-like objects that were opened for reading or writing via S3.
40 changes: 35 additions & 5 deletions smart_open/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ def open(
Additional parameters to pass to boto3's initiate_multipart_upload function.
For writing only.
version_id: str, optional
Version of the object, used when reading object. If None, will fetch the most recent version.
Version of the object, used when reading object.
If None, will fetch the most recent version.
"""
logger.debug('%r', locals())
Expand Down Expand Up @@ -237,6 +238,9 @@ def __init__(self, bucket, key, version_id=None, buffer_size=DEFAULT_BUFFER_SIZE
if resource_kwargs is None:
resource_kwargs = {}

self._session = session
self._resource_kwargs = resource_kwargs

s3 = session.resource('s3', **resource_kwargs)
self._object = s3.Object(bucket, key)
self._version_id = version_id
Expand Down Expand Up @@ -343,6 +347,18 @@ def terminate(self):
"""Do nothing."""
pass

def to_boto3(self):
    """Return a new, **independent** `boto3.s3.Object` for this instance's
    bucket and key.

    The object is built from a fresh S3 resource that re-uses the session
    and resource keyword arguments stored on the current instance.  Because
    it is a separate object, changes made through the returned
    `boto3.s3.Object` will not necessarily affect the current instance.

    Only the bucket name and the key are carried over to the new object.
    """
    resource = self._session.resource('s3', **self._resource_kwargs)
    target = self._object
    return resource.Object(target.bucket_name, target.key)

#
# Internal methods.
#
Expand Down Expand Up @@ -373,13 +389,14 @@ def __init__(self, bucket, key, version_id=None, buffer_size=DEFAULT_BUFFER_SIZE
line_terminator=BINARY_NEWLINE, session=None, resource_kwargs=None):

self._buffer_size = buffer_size
self._session = session
self._resource_kwargs = resource_kwargs

if session is None:
session = boto3.Session()
if resource_kwargs is None:
resource_kwargs = {}

self._session = session
self._resource_kwargs = resource_kwargs
s3 = session.resource('s3', **resource_kwargs)
self._object = s3.Object(bucket, key)
self._version_id = version_id
Expand Down Expand Up @@ -477,8 +494,6 @@ def __init__(
multipart_upload_kwargs=None,
):

self._session = session
self._resource_kwargs = resource_kwargs
self._multipart_upload_kwargs = multipart_upload_kwargs

if min_part_size < MIN_MIN_PART_SIZE:
Expand All @@ -492,6 +507,9 @@ def __init__(
if multipart_upload_kwargs is None:
multipart_upload_kwargs = {}

self._session = session
self._resource_kwargs = resource_kwargs

s3 = session.resource('s3', **resource_kwargs)
try:
self._object = s3.Object(bucket, key)
Expand Down Expand Up @@ -581,6 +599,18 @@ def terminate(self):
self._mp.abort()
self._mp = None

def to_boto3(self):
    """Return a new, **independent** `boto3.s3.Object` for this instance's
    bucket and key.

    The object is built from a fresh S3 resource that re-uses the session
    and resource keyword arguments stored on the current instance.  Because
    it is a separate object, changes made through the returned
    `boto3.s3.Object` will not necessarily affect the current instance.

    Only the bucket name and the key are carried over to the new object.
    """
    resource = self._session.resource('s3', **self._resource_kwargs)
    target = self._object
    return resource.Object(target.bucket_name, target.key)

#
# Internal methods.
#
Expand Down
23 changes: 22 additions & 1 deletion smart_open/tests/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,10 @@ def ignore_resource_warnings():
class SeekableRawReaderTest(unittest.TestCase):

def setUp(self):
self._body = b'123456'
self._local_resource = boto3.resource('s3', endpoint_url='http://localhost:5000')
self._local_resource.Bucket(BUCKET_NAME).create()
self._local_resource.Object(BUCKET_NAME, KEY_NAME).put(Body=b'123456')
self._local_resource.Object(BUCKET_NAME, KEY_NAME).put(Body=self._body)

def tearDown(self):
self._local_resource.Object(BUCKET_NAME, KEY_NAME).delete()
Expand Down Expand Up @@ -289,6 +290,16 @@ def test_read0_does_not_return_data(self):

self.assertEqual(data, b'')

def test_to_boto3(self):
    """The object returned by the reader's `to_boto3` yields the stored bytes."""
    contents = b'the spice melange\n'
    # presumably uploads `contents` under BUCKET_NAME/KEY_NAME; the helper
    # is defined elsewhere in this module -- verify against its definition
    put_to_bucket(contents=contents)

    with smart_open.s3.BufferedInputBase(BUCKET_NAME, KEY_NAME) as fin:
        returned_obj = fin.to_boto3()

    # Fetch the body through the boto3 API directly, not through the reader.
    boto3_body = returned_obj.get()['Body'].read()
    self.assertEqual(contents, boto3_body)


@maybe_mock_s3
class BufferedOutputBaseTest(unittest.TestCase):
Expand Down Expand Up @@ -428,6 +439,16 @@ def test_flush_close(self):
fout.flush()
fout.close()

def test_to_boto3(self):
    """The object returned by the writer's `to_boto3` yields the written bytes."""
    contents = b'the spice melange\n'

    with smart_open.s3.open(BUCKET_NAME, KEY_NAME, 'wb') as fout:
        fout.write(contents)
        returned_obj = fout.to_boto3()

    # Read back only after the `with` block has exited; presumably the
    # upload is finalized when the writer is closed -- TODO confirm.
    boto3_body = returned_obj.get()['Body'].read()
    self.assertEqual(contents, boto3_body)


class ClampTest(unittest.TestCase):
def test(self):
Expand Down

0 comments on commit a621aeb

Please sign in to comment.