Skip to content

Commit

Permalink
improve mongo-backend-loading
Browse files Browse the repository at this point in the history
  • Loading branch information
JoranAngevaare committed Oct 22, 2020
1 parent 114c284 commit 050e71e
Showing 1 changed file with 38 additions and 5 deletions.
43 changes: 38 additions & 5 deletions strax/storage/mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,17 @@ def __init__(self, uri, database, col_name=None):
self.client = MongoClient(uri)
self.db = self.client[database]
self.col_name = col_name
self.chunks_registry = None

def _read_chunk(self, backend_key, chunk_info, dtype, compressor):
"""See strax.Backend"""
query = backend_key_to_query(backend_key)
chunk_i = chunk_info["chunk_i"]

# Query for the chunk and project the chunk info
doc = self.db[self.col_name].find_one(
{**query, "chunk_i": chunk_i},
{f"data": 1})
# Build the chunk-registry if not done already.
if self.chunks_registry is None:
self._build_chunk_registry(backend_key)

doc = self.chunks_registry.get(chunk_i, None)
# Unpack info about this chunk from the query. Return empty if not available.
if doc is None:
# Did not find the data
Expand Down Expand Up @@ -73,6 +73,39 @@ def get_metadata(self, key):
return doc['metadata']
raise strax.DataNotAvailable

def _build_chunk_registry(self, backend_key):
"""
:param backend_key: strax.DataKey to query the collection for
Build chunk info in a single registry using only one query to
the database. This is much faster as one does not have to do
n-chunk queries to the database. Just one will do. As the
documents-size is limited to 16 MB, it's unlikely that we will
run into memory issues (that we otherwise would not run into).
"""

query = backend_key_to_query(backend_key)
chunks_registry = self.db[self.col_name].find(
{**query, **{"chunk_i": {'$exists':True}}},
{"chunk_i": 1, "data": 1})
if chunks_registry is not None:
# We are going to convert this to a dictionary as that is
# easier to lookup
self.chunks_registry = {}
else:
raise strax.DataNotAvailable(f'Unable find data for {query}')

for doc in chunks_registry:
chunk_key = doc.get('chunk_i', None)
if chunk_key is None:
# Should not happen because of the projection in find
# but let's double check
raise ValueError(
f'Projection failed, got doc with no "chunk_i":\n{doc}')
# Update our registry with this chunks info. Use chunk_i as
# chunk_key
self.chunks_registry[chunk_key] = doc.copy()



@export
class MongoFrontend(StorageFrontend):
Expand Down

0 comments on commit 050e71e

Please sign in to comment.