From 31c3d581c36678d473d2a97e63580a7e59ab46a7 Mon Sep 17 00:00:00 2001 From: Jordan Henderson Date: Sat, 16 Mar 2024 01:07:14 -0500 Subject: [PATCH] Fix an issue where the Subfiling VFD's context cache grows too large --- release_docs/RELEASE.txt | 15 ++++++++++++ src/H5FDsubfiling/H5FDsubfiling.c | 3 +++ src/H5FDsubfiling/H5subfiling_common.c | 33 ++++++++++++++++---------- src/H5FDsubfiling/H5subfiling_common.h | 1 + 4 files changed, 39 insertions(+), 13 deletions(-) diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index 4c6367e9611..234a9aac4d8 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -577,6 +577,21 @@ Bug Fixes since HDF5-1.14.0 release =================================== Library ------- + - Fixed an issue where the Subfiling VFD's context object cache could + grow too large + + The Subfiling VFD keeps a cache of its internal context objects to + speed up access to a context object for a particular file, as well + as access to that object across multiple opens of the same file. + However, opening a large amount of files with the Subfiling VFD over + the course of an application's lifetime could cause this cache to grow + too large and result in the application running out of available MPI + communicator objects. On file close, the Subfiling VFD now simply + evicts context objects out of its cache and frees them. It is assumed + that multiple opens of a file will be a less common use case for the + Subfiling VFD, but this can be revisited if it proves to be an issue + for performance. + - Fixed asserts raised by large values of H5Pset_est_link_info() parameters If large values for est_num_entries and/or est_name_len were passed diff --git a/src/H5FDsubfiling/H5FDsubfiling.c b/src/H5FDsubfiling/H5FDsubfiling.c index 71dd4bacd12..945995f27c4 100644 --- a/src/H5FDsubfiling/H5FDsubfiling.c +++ b/src/H5FDsubfiling/H5FDsubfiling.c @@ -1358,6 +1358,9 @@ H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr) H5MM_free(file_ptr->file_dir); file_ptr->file_dir = NULL; + if (file_ptr->context_id >= 0 && H5_free_subfiling_object(file_ptr->context_id) < 0) + H5_SUBFILING_DONE_ERROR(H5E_FILE, H5E_CANTFREE, FAIL, "can't free subfiling context object"); + /* Release the file info */ file_ptr = H5FL_FREE(H5FD_subfiling_t, file_ptr); diff --git a/src/H5FDsubfiling/H5subfiling_common.c b/src/H5FDsubfiling/H5subfiling_common.c index 1127ae0a386..47d0624ef16 100644 --- a/src/H5FDsubfiling/H5subfiling_common.c +++ b/src/H5FDsubfiling/H5subfiling_common.c @@ -48,7 +48,6 @@ static int sf_file_map_size = 0; #define DEFAULT_TOPOLOGY_CACHE_SIZE 4 #define DEFAULT_FILE_MAP_ENTRIES 8 -static herr_t H5_free_subfiling_object(int64_t object_id); static herr_t H5_free_subfiling_object_int(subfiling_context_t *sf_context); static herr_t H5_free_subfiling_topology(sf_topology_t *topology); @@ -280,18 +279,25 @@ H5_get_subfiling_object(int64_t object_id) * Purpose: Frees the underlying subfiling object for a given subfiling * object ID. * - * NOTE: Currently we assume that all created subfiling - * objects are cached in the (very simple) context/topology - * cache until application exit, so the only time a subfiling - * object should be freed by this routine is if something - * fails right after creating one. Otherwise, the internal - * indexing for the relevant cache will be invalid. + * NOTE: Because we want to avoid the potentially large + * overhead of determining the application topology on every + * file open, we currently assume that all created subfiling + * topology objects are cached in the (very simple) topology + * cache until application exit. This allows us to quickly + * find and assign a cached topology object to a subfiling + * context object for a file when opened. Therefore, a + * subfiling topology object should (currently) only ever be + * freed by this routine if a function fails right after + * creating a topology object. Otherwise, the internal + * indexing for the topology cache will be invalid and we will + * either leak memory or assign invalid topology pointers to + * subfiling context objects after that point. * * Return: Non-negative on success/Negative on failure * *------------------------------------------------------------------------- */ -static herr_t +herr_t H5_free_subfiling_object(int64_t object_id) { int64_t obj_type = (object_id >> 32) & 0x0FFFF; @@ -305,30 +311,31 @@ H5_free_subfiling_object(int64_t object_id) "couldn't get subfiling context for subfiling object ID"); if (H5_free_subfiling_object_int(sf_context) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling object"); + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling context object"); assert(sf_context_cache_num_entries > 0); assert(sf_context == sf_context_cache[sf_context_cache_num_entries - 1]); sf_context_cache[sf_context_cache_num_entries - 1] = NULL; sf_context_cache_num_entries--; } - else { + else if (obj_type == SF_TOPOLOGY) { sf_topology_t *sf_topology; - assert(obj_type == SF_TOPOLOGY); - if (NULL == (sf_topology = H5_get_subfiling_object(object_id))) H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't get subfiling context for subfiling object ID"); if (H5_free_subfiling_topology(sf_topology) < 0) - H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology"); + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "couldn't free subfiling topology object"); assert(sf_topology_cache_num_entries > 0); assert(sf_topology == sf_topology_cache[sf_topology_cache_num_entries - 1]); sf_topology_cache[sf_topology_cache_num_entries - 1] = NULL; sf_topology_cache_num_entries--; } + else + H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, + "couldn't free subfiling object - invalid object type"); done: H5_SUBFILING_FUNC_LEAVE; diff --git a/src/H5FDsubfiling/H5subfiling_common.h b/src/H5FDsubfiling/H5subfiling_common.h index 156902a5c5c..6b5cfa45c27 100644 --- a/src/H5FDsubfiling/H5subfiling_common.h +++ b/src/H5FDsubfiling/H5subfiling_common.h @@ -269,6 +269,7 @@ H5_DLL herr_t H5_close_subfiles(int64_t subfiling_context_id, MPI_Comm file_comm H5_DLL int64_t H5_new_subfiling_object_id(sf_obj_type_t obj_type); H5_DLL void *H5_get_subfiling_object(int64_t object_id); +H5_DLL herr_t H5_free_subfiling_object(int64_t object_id); H5_DLL herr_t H5_get_subfiling_config_from_file(FILE *config_file, int64_t *stripe_size, int64_t *num_subfiles); H5_DLL herr_t H5_resolve_pathname(const char *filepath, MPI_Comm comm, char **resolved_filepath);