From 9f122a5ad5e9d3302d8c7e64603719cd8d1fcf5a Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 17 Nov 2016 09:14:40 -0500 Subject: [PATCH 1/5] name-hash: eliminate duplicate memihash call Remove duplicate memihash() call in hash_dir_entry(). The existing code called memihash() to do the find_dir_entry() and it not found, called memihash() again to do the hashmap_add(). Signed-off-by: Jeff Hostetler --- name-hash.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/name-hash.c b/name-hash.c index 6d9f23e932559c..860b8dd0c746e8 100644 --- a/name-hash.c +++ b/name-hash.c @@ -23,15 +23,21 @@ static int dir_entry_cmp(const struct dir_entry *e1, name ? name : e2->name, e1->namelen); } -static struct dir_entry *find_dir_entry(struct index_state *istate, - const char *name, unsigned int namelen) +static struct dir_entry *find_dir_entry__hash(struct index_state *istate, + const char *name, unsigned int namelen, unsigned int hash) { struct dir_entry key; - hashmap_entry_init(&key, memihash(name, namelen)); + hashmap_entry_init(&key, hash); key.namelen = namelen; return hashmap_get(&istate->dir_hash, &key, name); } +static struct dir_entry *find_dir_entry(struct index_state *istate, + const char *name, unsigned int namelen) +{ + return find_dir_entry__hash(istate, name, namelen, memihash(name,namelen)); +} + static struct dir_entry *hash_dir_entry(struct index_state *istate, struct cache_entry *ce, int namelen) { @@ -43,6 +49,7 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, * in index_state.name_hash (as ordinary cache_entries). */ struct dir_entry *dir; + unsigned int hash; /* get length of parent directory */ while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1])) @@ -52,11 +59,12 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, namelen--; /* lookup existing entry for that directory */ - dir = find_dir_entry(istate, ce->name, namelen); + hash = memihash(ce->name, namelen); + dir = find_dir_entry__hash(istate, ce->name, namelen, hash); if (!dir) { /* not found, create it and add to hash table */ FLEX_ALLOC_MEM(dir, name, ce->name, namelen); - hashmap_entry_init(dir, memihash(ce->name, namelen)); + hashmap_entry_init(dir, hash); dir->namelen = namelen; hashmap_add(&istate->dir_hash, dir); From 1ccde200c2a4aea1c510130502d342859d70c12d Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 17 Nov 2016 09:23:55 -0500 Subject: [PATCH 2/5] hashmap: allow memihash computation to be continued Add variant of memihash() to allow the hash computation to be continued. There are times when we compute the hash on a full path and then the hash on just the path to the parent directory. This can be expensive on large repositories. With this, we can hash the parent directory first. And then continue the computation to include the "/filename". Signed-off-by: Jeff Hostetler --- hashmap.c | 17 +++++++++++++++++ hashmap.h | 1 + 2 files changed, 18 insertions(+) diff --git a/hashmap.c b/hashmap.c index b10b642229ca0c..505e63fd3c801a 100644 --- a/hashmap.c +++ b/hashmap.c @@ -50,6 +50,23 @@ unsigned int memihash(const void *buf, size_t len) return hash; } +/* + * Incoporate another chunk of data into a memihash + * computation. + */ +unsigned int memihash_cont(unsigned int hash_seed, const void *buf, size_t len) +{ + unsigned int hash = hash_seed; + unsigned char *ucbuf = (unsigned char *) buf; + while (len--) { + unsigned int c = *ucbuf++; + if (c >= 'a' && c <= 'z') + c -= 'a' - 'A'; + hash = (hash * FNV32_PRIME) ^ c; + } + return hash; +} + #define HASHMAP_INITIAL_SIZE 64 /* grow / shrink by 2^2 */ #define HASHMAP_RESIZE_BITS 2 diff --git a/hashmap.h b/hashmap.h index ab7958ae333bcc..45eda693bdbc7e 100644 --- a/hashmap.h +++ b/hashmap.h @@ -12,6 +12,7 @@ extern unsigned int strhash(const char *buf); extern unsigned int strihash(const char *buf); extern unsigned int memhash(const void *buf, size_t len); extern unsigned int memihash(const void *buf, size_t len); +extern unsigned int memihash_cont(unsigned int hash_seed, const void *buf, size_t len); static inline unsigned int sha1hash(const unsigned char *sha1) { From baf663d69f201350b181522826dcdb639e35cfb9 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 17 Nov 2016 09:59:59 -0500 Subject: [PATCH 3/5] name-hash: precompute hash values during preload-index Precompute the istate.name_hash and istate.dir_hash values for each cache-entry during the preload-index phase. Move the expensive memihash() calculations from lazy_init_name_hash() to the multi-threaded preload-index phase. Signed-off-by: Jeff Hostetler --- cache.h | 16 ++++++++++++ name-hash.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++-- preload-index.c | 2 ++ read-cache.c | 3 +++ 4 files changed, 85 insertions(+), 2 deletions(-) diff --git a/cache.h b/cache.h index a50a61a19787de..35d58f83e16b33 100644 --- a/cache.h +++ b/cache.h @@ -173,6 +173,9 @@ struct cache_entry { unsigned int ce_flags; unsigned int ce_namelen; unsigned int index; /* for link extension */ + unsigned int precompute_hash_state; + unsigned int precompute_hash_name; + unsigned int precompute_hash_dir; struct object_id oid; char name[FLEX_ARRAY]; /* more */ }; @@ -229,6 +232,19 @@ struct cache_entry { #error "CE_EXTENDED_FLAGS out of range" #endif +/* + * Bit set if preload-index precomputed the hash value(s) + * for this cache-entry. + */ +#define CE_PRECOMPUTE_HASH_STATE__SET (1 << 0) +/* + * Bit set if precompute-index also precomputed the hash value + * for the parent directory. + */ +#define CE_PRECOMPUTE_HASH_STATE__DIR (1 << 1) + +void precompute_istate_hashes(struct cache_entry *ce); + /* Forward structure decls */ struct pathspec; struct child_process; diff --git a/name-hash.c b/name-hash.c index 860b8dd0c746e8..7a220187ad621c 100644 --- a/name-hash.c +++ b/name-hash.c @@ -50,6 +50,17 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, */ struct dir_entry *dir; unsigned int hash; + int use_precomputed_dir_hash = 0; + + if (ce->precompute_hash_state & CE_PRECOMPUTE_HASH_STATE__SET) { + if (!(ce->precompute_hash_state & CE_PRECOMPUTE_HASH_STATE__DIR)) + return NULL; /* item does not have a parent directory */ + if (namelen == ce_namelen(ce)) { + /* dir hash only valid for outer-most call (not recursive ones) */ + use_precomputed_dir_hash = 1; + hash = ce->precompute_hash_dir; + } + } /* get length of parent directory */ while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1])) @@ -59,7 +70,8 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, namelen--; /* lookup existing entry for that directory */ - hash = memihash(ce->name, namelen); + if (!use_precomputed_dir_hash) + hash = memihash(ce->name, namelen); dir = find_dir_entry__hash(istate, ce->name, namelen, hash); if (!dir) { /* not found, create it and add to hash table */ @@ -99,10 +111,18 @@ static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce) static void hash_index_entry(struct index_state *istate, struct cache_entry *ce) { + unsigned int h; + if (ce->ce_flags & CE_HASHED) return; ce->ce_flags |= CE_HASHED; - hashmap_entry_init(ce, memihash(ce->name, ce_namelen(ce))); + + if (ce->precompute_hash_state & CE_PRECOMPUTE_HASH_STATE__SET) + h = ce->precompute_hash_name; + else + h = memihash(ce->name, ce_namelen(ce)); + + hashmap_entry_init(ce, h); hashmap_add(&istate->name_hash, ce); if (ignore_case) @@ -244,3 +264,45 @@ void free_name_hash(struct index_state *istate) hashmap_free(&istate->name_hash, 0); hashmap_free(&istate->dir_hash, 1); } + +/* + * Precompute the hash values for this cache_entry + * for use in the istate.name_hash and istate.dir_hash. + * + * If the item is in the root directory, just compute the + * hash value (for istate.name_hash) on the full path. + * + * If the item is in a subdirectory, first compute the + * hash value for the immediate parent directory (for + * istate.dir_hash) and then the hash value for the full + * path by continuing the computation. + * + * Note that these hashes will be used by + * wt_status_collect_untracked() as it scans the worktree + * and maps observed paths back to the index (optionally + * ignoring case). Therefore, we probably only *NEED* to + * precompute this for non-skip-worktree items (since + * status should not observe skipped items), but because + * lazy_init_name_hash() hashes everything, we force it + * here. + */ +void precompute_istate_hashes(struct cache_entry *ce) +{ + int namelen = ce_namelen(ce); + + while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1])) + namelen--; + + if (namelen <= 0) { + ce->precompute_hash_name = memihash(ce->name, ce_namelen(ce)); + ce->precompute_hash_state = CE_PRECOMPUTE_HASH_STATE__SET; + } else { + namelen--; + ce->precompute_hash_dir = memihash(ce->name, namelen); + ce->precompute_hash_name = memihash_cont( + ce->precompute_hash_dir, &ce->name[namelen], + ce_namelen(ce) - namelen); + ce->precompute_hash_state = + CE_PRECOMPUTE_HASH_STATE__SET | CE_PRECOMPUTE_HASH_STATE__DIR; + } +} diff --git a/preload-index.c b/preload-index.c index c1fe3a3ef9cba0..602737f9d0f9f9 100644 --- a/preload-index.c +++ b/preload-index.c @@ -47,6 +47,8 @@ static void *preload_thread(void *_data) struct cache_entry *ce = *cep++; struct stat st; + precompute_istate_hashes(ce); + if (ce_stage(ce)) continue; if (S_ISGITLINK(ce->ce_mode)) diff --git a/read-cache.c b/read-cache.c index db5d910642663e..7fbe993dd7cf0b 100644 --- a/read-cache.c +++ b/read-cache.c @@ -73,6 +73,7 @@ void rename_index_entry_at(struct index_state *istate, int nr, const char *new_n copy_cache_entry(new, old); new->ce_flags &= ~CE_HASHED; new->ce_namelen = namelen; + new->precompute_hash_state = 0; new->index = 0; memcpy(new->name, new_name, namelen + 1); @@ -622,6 +623,7 @@ static struct cache_entry *create_alias_ce(struct index_state *istate, new = xcalloc(1, cache_entry_size(len)); memcpy(new->name, alias->name, len); copy_cache_entry(new, ce); + new->precompute_hash_state = 0; save_or_free_index_entry(istate, ce); return new; } @@ -1457,6 +1459,7 @@ static struct cache_entry *cache_entry_from_ondisk(struct ondisk_cache_entry *on ce->ce_stat_data.sd_size = get_be32(&ondisk->size); ce->ce_flags = flags & ~CE_NAMEMASK; ce->ce_namelen = len; + ce->precompute_hash_state = 0; ce->index = 0; hashcpy(ce->oid.hash, ondisk->sha1); memcpy(ce->name, name, len); From 5ba3a955731afd977b3be842c10f6bb37f313367 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 17 Nov 2016 10:05:26 -0500 Subject: [PATCH 4/5] name-hash: specify initial size for istate.dir_hash table Specify an initial size for the istate.dir_hash HashMap matching the size of the istate.name_hash. Previously hashmap_init() was given 0, causing a 64 bucket hashmap to be created. When working with very large repositories, this would cause numerous rehash() calls to realloc and rebalance the hashmap. This is especially true when the worktree is deep, with many directories containing a few files. Signed-off-by: Jeff Hostetler --- name-hash.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/name-hash.c b/name-hash.c index 7a220187ad621c..b2f6d2df7d40e8 100644 --- a/name-hash.c +++ b/name-hash.c @@ -148,7 +148,8 @@ static void lazy_init_name_hash(struct index_state *istate) return; hashmap_init(&istate->name_hash, (hashmap_cmp_fn) cache_entry_cmp, istate->cache_nr); - hashmap_init(&istate->dir_hash, (hashmap_cmp_fn) dir_entry_cmp, 0); + hashmap_init(&istate->dir_hash, (hashmap_cmp_fn) dir_entry_cmp, + istate->cache_nr); for (nr = 0; nr < istate->cache_nr; nr++) hash_index_entry(istate, istate->cache[nr]); istate->name_hash_initialized = 1; From 6ffde6a26198a79fb25105c77e5771b9504e185e Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Thu, 17 Nov 2016 10:37:01 -0500 Subject: [PATCH 5/5] name-hash: remember previous dir_entry during lazy_init_name_hash Teach hash_dir_entry() to remember the previously found dir_entry during lazy_init_name_hash() iteration. This is a performance optimization. Since items in the index array are sorted by full pathname, adjacent items are likely to be in the same directory. This can save memihash() computations and HashMap lookups. Signed-off-by: Jeff Hostetler --- name-hash.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/name-hash.c b/name-hash.c index b2f6d2df7d40e8..fcd2f90cf71bdd 100644 --- a/name-hash.c +++ b/name-hash.c @@ -39,7 +39,7 @@ static struct dir_entry *find_dir_entry(struct index_state *istate, } static struct dir_entry *hash_dir_entry(struct index_state *istate, - struct cache_entry *ce, int namelen) + struct cache_entry *ce, int namelen, struct dir_entry **p_previous_dir) { /* * Throw each directory component in the hash for quick lookup @@ -70,9 +70,21 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, namelen--; /* lookup existing entry for that directory */ - if (!use_precomputed_dir_hash) - hash = memihash(ce->name, namelen); - dir = find_dir_entry__hash(istate, ce->name, namelen, hash); + if (p_previous_dir && *p_previous_dir + && namelen == (*p_previous_dir)->namelen + && memcmp(ce->name, (*p_previous_dir)->name, namelen) == 0) { + /* + * When our caller is sequentially iterating thru the index, + * items in the same directory will be sequential, and therefore + * refer to the same dir_entry. + */ + dir = *p_previous_dir; + } else { + if (!use_precomputed_dir_hash) + hash = memihash(ce->name, namelen); + dir = find_dir_entry__hash(istate, ce->name, namelen, hash); + } + if (!dir) { /* not found, create it and add to hash table */ FLEX_ALLOC_MEM(dir, name, ce->name, namelen); @@ -81,15 +93,20 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate, hashmap_add(&istate->dir_hash, dir); /* recursively add missing parent directories */ - dir->parent = hash_dir_entry(istate, ce, namelen); + dir->parent = hash_dir_entry(istate, ce, namelen, NULL); } + + if (p_previous_dir) + *p_previous_dir = dir; + return dir; } -static void add_dir_entry(struct index_state *istate, struct cache_entry *ce) +static void add_dir_entry(struct index_state *istate, struct cache_entry *ce, + struct dir_entry **p_previous_dir) { /* Add reference to the directory entry (and parents if 0). */ - struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce)); + struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce), p_previous_dir); while (dir && !(dir->nr++)) dir = dir->parent; } @@ -100,7 +117,7 @@ static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce) * Release reference to the directory entry. If 0, remove and continue * with parent directory. */ - struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce)); + struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce), NULL); while (dir && !(--dir->nr)) { struct dir_entry *parent = dir->parent; hashmap_remove(&istate->dir_hash, dir, NULL); @@ -109,7 +126,8 @@ static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce) } } -static void hash_index_entry(struct index_state *istate, struct cache_entry *ce) +static void hash_index_entry(struct index_state *istate, struct cache_entry *ce, + struct dir_entry **p_previous_dir) { unsigned int h; @@ -126,7 +144,7 @@ static void hash_index_entry(struct index_state *istate, struct cache_entry *ce) hashmap_add(&istate->name_hash, ce); if (ignore_case) - add_dir_entry(istate, ce); + add_dir_entry(istate, ce, p_previous_dir); } static int cache_entry_cmp(const struct cache_entry *ce1, @@ -142,6 +160,7 @@ static int cache_entry_cmp(const struct cache_entry *ce1, static void lazy_init_name_hash(struct index_state *istate) { + struct dir_entry *previous_dir = NULL; int nr; if (istate->name_hash_initialized) @@ -151,14 +170,14 @@ static void lazy_init_name_hash(struct index_state *istate) hashmap_init(&istate->dir_hash, (hashmap_cmp_fn) dir_entry_cmp, istate->cache_nr); for (nr = 0; nr < istate->cache_nr; nr++) - hash_index_entry(istate, istate->cache[nr]); + hash_index_entry(istate, istate->cache[nr], &previous_dir); istate->name_hash_initialized = 1; } void add_name_hash(struct index_state *istate, struct cache_entry *ce) { if (istate->name_hash_initialized) - hash_index_entry(istate, ce); + hash_index_entry(istate, ce, NULL); } void remove_name_hash(struct index_state *istate, struct cache_entry *ce)