Python API: ramp up read buffer size
This will reduce the number of iterations of the read loop and thus
save CPU time (less time spent in per-iteration boilerplate code).
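The diff below makes the reader pull its per-read size from exp_ramp_gen(8192, 1048576) when no explicit buffer size is given. That helper is not part of this diff, so the following is only a minimal sketch of what such a generator could look like, assuming it doubles the read size from the start value up to the cap and then keeps yielding the cap:

def exp_ramp_gen(start, cap):
    # Hypothetical sketch, not the actual implementation: start small,
    # double the size on each iteration, then stick to the cap.
    size = start
    while size < cap:
        yield size
        size *= 2
    while True:
        yield cap

With these arguments the successive read sizes would be 8 KiB, 16 KiB, 32 KiB, ... up to 1 MiB, so small requests keep a small buffer while long downloads quickly reach the large one.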
fvennetier committed Aug 20, 2021
1 parent 0a7e51b commit 3236495
Showing 3 changed files with 11 additions and 5 deletions.
7 changes: 6 additions & 1 deletion oio/api/io.py
@@ -1,4 +1,5 @@
 # Copyright (C) 2015-2020 OpenIO SAS, as part of OpenIO SDS
+# Copyright (C) 2021 OVH SAS
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -267,6 +268,10 @@ def __init__(self, chunk_iter, buf_size, headers,
         self.status = None
         # buf size indicates the amount of data we yield
         self.buf_size = buf_size
+        if self.buf_size:
+            self.read_size = itertools.repeat(self.buf_size)
+        else:
+            self.read_size = exp_ramp_gen(8192, 1048576)
         self.discard_bytes = 0
         self.align = align
         self.connection_timeout = connection_timeout or CONNECTION_TIMEOUT
@@ -472,7 +477,7 @@ def iter_from_resp(self, source, parts_iter, part, chunk):
         while True:
             try:
                 with green.ChunkReadTimeout(self.read_timeout):
-                    data = part.read(self.buf_size or READ_CHUNK_SIZE)
+                    data = part.read(next(self.read_size))
                     count += 1
                     buf += data
             except (green.ChunkReadTimeout, IOError) as crto:
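With the change above, the read loop asks a generator for the size of each read instead of using a fixed constant. A small illustration of the branch added to __init__, assuming the hypothetical exp_ramp_gen sketch from the commit message and noting that itertools is presumably already imported by the module (the diff does not add the import):

import itertools

buf_size = None  # what the call sites in the next two files now pass
if buf_size:
    read_size = itertools.repeat(buf_size)    # constant-size reads
else:
    read_size = exp_ramp_gen(8192, 1048576)   # ramp from 8 KiB up to 1 MiB

# Each loop iteration then calls part.read(next(read_size)), so the first
# reads are small and later ones grow: 8192, 16384, 32768, 65536, ...

Passing an explicit buf_size therefore keeps the old fixed-size behaviour, while a falsy value enables the ramp.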
4 changes: 2 additions & 2 deletions oio/blob/client.py
@@ -1,4 +1,5 @@
 # Copyright (C) 2015-2020 OpenIO SAS, as part of OpenIO SDS
+# Copyright (C) 2021 OVH SAS
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -36,7 +37,6 @@
 CONNECTION_TIMEOUT = 10.0
 # chunk operations timeout
 CHUNK_TIMEOUT = 60.0
-READ_BUFFER_SIZE = 65535
 PARALLEL_CHUNKS_DELETE = 3


@@ -172,7 +172,7 @@ def chunk_get(self, url, check_headers=True, **kwargs):
         to the chunk's data.
         """
         url = self.resolve_url(url)
-        reader = ChunkReader([{'url': url}], READ_BUFFER_SIZE,
+        reader = ChunkReader([{'url': url}], None,
                              **kwargs)
         # This must be done now if we want to access headers
         stream = reader.stream()
5 changes: 3 additions & 2 deletions oio/common/storage_functions.py
@@ -1,4 +1,5 @@
 # Copyright (C) 2017-2019 OpenIO SAS, as part of OpenIO SDS
+# Copyright (C) 2021 OVH SAS
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -16,7 +17,7 @@

 import random
 
-from oio.api.io import ChunkReader, READ_CHUNK_SIZE
+from oio.api.io import ChunkReader
 from oio.api.ec import ECChunkDownloadHandler
 from oio.common import exceptions as exc
 from oio.common.constants import OBJECT_METADATA_PREFIX
@@ -212,7 +213,7 @@ def fetch_stream(chunks, ranges, storage_method, headers=None,
             headers['Range'] = http_header_from_ranges(
                 (meta_range_dict[pos], ))
             reader = ChunkReader(
-                iter(chunks[pos]), READ_CHUNK_SIZE, headers=headers,
+                iter(chunks[pos]), None, headers=headers,
                 **kwargs)
             try:
                 it = reader.get_iter()
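Both call sites above now pass None as the buffer size, so they get the ramp-up behaviour by default. A caller that prefers fixed-size reads can still request them by passing an explicit value; a hypothetical example, with the positional parameter order taken from the hunks above:

# Hypothetical usage: keep fixed 64 KiB reads instead of the ramp-up.
reader = ChunkReader([{'url': url}], 65536, headers=headers, **kwargs)
stream = reader.stream()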
