diff --git a/Documentation/config/survey.txt b/Documentation/config/survey.txt index 9e594a2092f225..f3ae768933fe1b 100644 --- a/Documentation/config/survey.txt +++ b/Documentation/config/survey.txt @@ -4,6 +4,10 @@ survey.*:: background with these options. + -- + survey.namerev:: + Boolean to show/hide `git name-rev` information for each + reported commit and the containing commit of each + reported tree and blob. verbose:: This boolean value implies the `--[no-]verbose` option. progress:: @@ -11,4 +15,33 @@ survey.*:: top:: This integer value implies `--top=`, specifying the number of entries in the detail tables. + showBlobSizes:: + A non-negative integer value. Requests details on the + largest file blobs by size in bytes. Provides a + default value for `--blob-sizes=` in + linkgit:git-survey[1]. + showCommitParents:: + A non-negative integer value. Requests details on the + commits with the most number of parents. Provides a + default value for `--commit-parents=` in + linkgit:git-survey[1]. + showCommitSizes:: + A non-negative integer value. Requests details on the + largest commits by size in bytes. Generally, these + are the commits with the largest commit messages. + Provides a default value for `--commit-sizes=` in + linkgit:git-survey[1]. + showTreeEntries:: + A non-negative integer value. Requests details on the + trees (directories) with the most number of entries + (files and subdirectories). Provides a default value + for `--tree-entries=` in linkgit:git-survey[1]. + showTreeSizes:: + A non-negative integer value. Requests details on the + largest trees (directories) by size in bytes. This + set will usually be equal to the + `survey.showTreeEntries` set, but may be skewed by very + long file or subdirectory entry names. Provides a + default value for `--tree-sizes=` in + linkgit:git-survey[1]. 
-- diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index 894c7be3053eb9..cb1fb3c37c3706 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -32,6 +32,10 @@ OPTIONS --progress:: Show progress. This is automatically enabled when interactive. +--[no-]name-rev:: + Print `git name-rev` output for each commit, tree, and blob. + Defaults to true. + Ref Selection ~~~~~~~~~~~~~ @@ -59,6 +63,32 @@ only refs for the given options are added. --other:: Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set. +Large Item Selection +~~~~~~~~~~~~~~~~~~~~ + +The following options control the optional display of large items under +various dimensions of scale. The OIDs of the largest `n` objects will be +displayed in reverse sorted order. For each, `n` defaults to 10. + +--commit-parents:: + Shows the OIDs of the commits with the most parent commits. + +--commit-sizes:: + Shows the OIDs of the largest commits by size in bytes. These are + usually the ones with the largest commit messages. + +--tree-entries:: + Shows the OIDs of the trees with the most number of entries. These + are the directories with the most number of files or subdirectories. + +--tree-sizes:: + Shows the OIDs of the largest trees by size in bytes. This set + will usually be the same as the `--tree-entries` set unless + skewed by very long entry names. + +--blob-sizes:: + Shows the OIDs of the largest blobs by size in bytes. + OUTPUT ------ @@ -78,6 +108,11 @@ Reachable Object Summary The reachable object summary shows the total number of each kind of Git object, including tags, commits, trees, and blobs. 
+CONFIGURATION +------------- + +include::config/survey.txt[] + GIT --- Part of the linkgit:git[1] suite diff --git a/builtin/survey.c b/builtin/survey.c index 34850399fd601a..5bf1adb6668eed 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -13,11 +13,12 @@ #include "ref-filter.h" #include "refs.h" #include "revision.h" +#include "run-command.h" #include "strbuf.h" #include "strvec.h" -#include "tag.h" #include "trace2.h" -#include "color.h" +#include "tree.h" +#include "tree-walk.h" static const char * const survey_usage[] = { N_("(EXPERIMENTAL!) git survey "), @@ -41,6 +42,16 @@ static struct survey_refs_wanted default_ref_options = { struct survey_opts { int verbose; int show_progress; + int show_name_rev; + + int show_largest_commits_by_nr_parents; + int show_largest_commits_by_size_bytes; + + int show_largest_trees_by_nr_entries; + int show_largest_trees_by_size_bytes; + + int show_largest_blobs_by_size_bytes; + int top_nr; struct survey_refs_wanted refs; }; @@ -53,6 +64,312 @@ struct survey_report_ref_summary { size_t tags_annotated_nr; size_t others_nr; size_t unknown_nr; + + size_t cnt_symref; + + size_t cnt_packed; + size_t cnt_loose; + + /* + * Measure the length of the refnames. We can look for + * potential platform limits. The partial sums may help us + * estimate the size of a haves/wants conversation, since each + * refname and a SHA must be transmitted. + */ + size_t len_max_local_refname; + size_t len_sum_local_refnames; + size_t len_max_remote_refname; + size_t len_sum_remote_refnames; +}; + +/* + * HBIN -- hex binning (histogram bucketing). + * + * We create histograms for various counts and sums. Since we have a + * wide range of values (objects range in size from 1 to 4G bytes), a + * linear bucketing is not interesting. Instead, lets use a + * log16()-based bucketing. This gives us a better spread on the low + * and middle range and a coarse bucketing on the high end. 
+ * + * The idea here is that it doesn't matter if you have n 1GB blobs or + * n/2 1GB blobs and n/2 1.5GB blobs -- either way you have a scaling + * problem that we want to report on. + */ +#define HBIN_LEN (sizeof(unsigned long) * 2) +#define HBIN_MASK (0xF) +#define HBIN_SHIFT (4) + +/* Map `value` to its base-16 bucket: 0 for 0..15, 1 for 16..255, and so on. */ +static int hbin(unsigned long value) +{ + for (int k = 0; k < (int)HBIN_LEN; k++) { + if ((value & ~(unsigned long)HBIN_MASK) == 0) + return k; + value >>= HBIN_SHIFT; + } + + return 0; /* should not happen */ +} + +/* + * QBIN -- base4 binning (histogram bucketing). + * + * This is the same idea as the above, but we want better granularity + * in the low end and don't expect as many large values. + */ +#define QBIN_LEN (sizeof(unsigned long) * 4) +#define QBIN_MASK (0x3) +#define QBIN_SHIFT (2) + +/* Map `value` to its base-4 bucket: 0 for 0..3, 1 for 4..15, and so on. */ +static int qbin(unsigned long value) +{ + for (int k = 0; k < (int)QBIN_LEN; k++) { + if ((value & ~(unsigned long)QBIN_MASK) == 0) + return k; + value >>= (QBIN_SHIFT); + } + + return 0; /* should not happen */ +} + +/* + * histogram bin for objects. + */ +struct obj_hist_bin { + uint64_t sum_size; /* sum(object_size) for all objects in this bin */ + uint64_t sum_disk_size; /* sum(on_disk_size) for all objects in this bin */ + uint32_t cnt_seen; /* number seen in this bin */ +}; + +/* Fold one object's observed size and on-disk size into `pbin`. */ +static void incr_obj_hist_bin(struct obj_hist_bin *pbin, + unsigned long object_length, + off_t disk_sizep) +{ + pbin->sum_size += object_length; + pbin->sum_disk_size += disk_sizep; + pbin->cnt_seen++; +} + +/* + * Remember the largest n objects for some scaling dimension. This + * could be the observed object size or number of entries in a tree. + * We'll use this to generate a sorted vector in the output for that + * dimension. + */ +struct large_item { + uint64_t size; + struct object_id oid; + + /* + * For blobs and trees the name field is the pathname of the + * file or directory. Root trees will have a zero-length + * name. The name field is not currently used for commits. 
+ */ + struct strbuf name; + + /* + * For blobs and trees remember the transient commit from + * the treewalk so that we can say that this large item + * first appeared in this commit (relative to the treewalk + * order). + */ + struct object_id containing_commit_oid; + + /* + * Lookup `containing_commit_oid` using `git name-rev`. + * Lazy allocate this post-treewalk. + */ + struct strbuf name_rev; +}; + +struct large_item_vec { + char *dimension_label; + char *item_label; + uint64_t nr_items; + struct large_item items[FLEX_ARRAY]; /* nr_items */ +}; + +/* + * Allocate a vector that tracks the `nr_items` largest items. Returns + * NULL when `nr_items` is zero. The size computation is overflow-checked + * so that a bogus (e.g. negative) count cannot under-allocate the array. + */ +static struct large_item_vec *alloc_large_item_vec(const char *dimension_label, + const char *item_label, + uint64_t nr_items) +{ + struct large_item_vec *vec; + size_t flex_len = st_mult(nr_items, sizeof(struct large_item)); + size_t k; + + if (!nr_items) + return NULL; + + vec = xcalloc(1, st_add(sizeof(struct large_item_vec), flex_len)); + vec->dimension_label = xstrdup(dimension_label); + vec->item_label = xstrdup(item_label); + vec->nr_items = nr_items; + + for (k = 0; k < nr_items; k++) { + strbuf_init(&vec->items[k].name, 0); + strbuf_init(&vec->items[k].name_rev, 0); + } + + return vec; +} + +/* Release a vector from alloc_large_item_vec(); a NULL `vec` is a no-op. */ +static void free_large_item_vec(struct large_item_vec *vec) +{ + if (!vec) + return; + + for (size_t k = 0; k < vec->nr_items; k++) { + strbuf_release(&vec->items[k].name); + strbuf_release(&vec->items[k].name_rev); + } + + free(vec->dimension_label); + free(vec->item_label); + free(vec); +} + +/* + * Insert an item into `vec`, which is kept sorted by descending size, if + * it ranks among the largest `nr_items` seen so far; otherwise do nothing. + */ +static void maybe_insert_large_item(struct large_item_vec *vec, + uint64_t size, + struct object_id *oid, + const char *name, + const struct object_id *containing_commit_oid) +{ + size_t rest_len; + size_t k; + + if (!vec || !vec->nr_items) + return; + + /* + * Since the odds of an object being among the largest n + * is small, shortcut and see if it is smaller than + * the smallest one in our set and quickly reject it. 
+ */ + if (size < vec->items[vec->nr_items - 1].size) + return; + + for (k = 0; k < vec->nr_items; k++) { + if (size < vec->items[k].size) + continue; + + /* + * The last large_item in the vector is about to be + * overwritten by the previous one during the shift. + * Steal its allocated strbuf and reuse it. + * + * We can ignore .name_rev because it will not be + * allocated until after the treewalk. + */ + strbuf_release(&vec->items[vec->nr_items - 1].name); + + /* push items[k..] down one and insert data for this item here */ + + rest_len = (vec->nr_items - k - 1) * sizeof(struct large_item); + if (rest_len) + memmove(&vec->items[k + 1], &vec->items[k], rest_len); + + memset(&vec->items[k], 0, sizeof(struct large_item)); + vec->items[k].size = size; + oidcpy(&vec->items[k].oid, oid); + oidcpy(&vec->items[k].containing_commit_oid, containing_commit_oid ? containing_commit_oid : null_oid()); + strbuf_init(&vec->items[k].name, 0); + if (name && *name) + strbuf_addstr(&vec->items[k].name, name); + + return; + } +} + +/* + * Common fields for any type of object. + */ +struct survey_stats_base_object { + uint32_t cnt_seen; + + uint32_t cnt_missing; /* we may have a partial clone. */ + + /* + * Number of objects grouped by where they are stored on disk. + * This is a function of how the ODB is packed. + */ + uint32_t cnt_cached; /* see oi.whence */ + uint32_t cnt_loose; /* see oi.whence */ + uint32_t cnt_packed; /* see oi.whence */ + uint32_t cnt_dbcached; /* see oi.whence */ + + uint64_t sum_size; /* sum(object_size) */ + uint64_t sum_disk_size; /* sum(disk_size) */ + + /* + * A histogram of the count of objects, the observed size, and + * the on-disk size grouped by the observed size. + */ + struct obj_hist_bin size_hbin[HBIN_LEN]; +}; + +/* + * PBIN -- parent vector binning (histogram bucketing). + * + * We create a histogram based upon the number of parents + * in a commit. This is a simple linear vector. It starts + * at zero for "initial" commits. 
+ * + * If a commit has more parents, just put it in the last bin. + */ +#define PBIN_VEC_LEN (32) + +struct survey_stats_commits { + struct survey_stats_base_object base; + + /* + * Count of commits with k parents. + */ + uint32_t parent_cnt_pbin[PBIN_VEC_LEN]; + + struct large_item_vec *vec_largest_by_nr_parents; + struct large_item_vec *vec_largest_by_size_bytes; +}; + +/* + * Stats for reachable trees. + */ +struct survey_stats_trees { + struct survey_stats_base_object base; + + /* + * Keep a vector of the trees with the most number of entries. + * This gives us a feel for the width of a tree when there are + * gigantic directories. + */ + struct large_item_vec *vec_largest_by_nr_entries; + + /* + * Keep a vector of the trees with the largest size in bytes. + * The contents of this may or may not match items in the other + * vector, since entryname length can alter the results. + */ + struct large_item_vec *vec_largest_by_size_bytes; + + /* + * Computing the sum of the number of entries across all trees + * is probably not that interesting. + */ + uint64_t sum_entries; /* sum(nr_entries) -- sum across all trees */ + + /* + * A histogram of the count of trees, the observed size, and + * the on-disk size grouped by the number of entries in the tree. + */ + struct obj_hist_bin entry_qbin[QBIN_LEN]; +}; + +/* + * Stats for reachable blobs. + */ +struct survey_stats_blobs { + struct survey_stats_base_object base; + + /* + * Remember the OIDs of the largest n blobs. 
+ */ + struct large_item_vec *vec_largest_by_size_bytes; }; struct survey_report_object_summary { @@ -60,6 +377,10 @@ struct survey_report_object_summary { size_t tags_nr; size_t trees_nr; size_t blobs_nr; + + struct survey_stats_commits commits; + struct survey_stats_trees trees; + struct survey_stats_blobs blobs; }; /** @@ -229,6 +550,12 @@ struct survey_context { static void clear_survey_context(struct survey_context *ctx) { + free_large_item_vec(ctx->report.reachable_objects.commits.vec_largest_by_nr_parents); + free_large_item_vec(ctx->report.reachable_objects.commits.vec_largest_by_size_bytes); + free_large_item_vec(ctx->report.reachable_objects.trees.vec_largest_by_nr_entries); + free_large_item_vec(ctx->report.reachable_objects.trees.vec_largest_by_size_bytes); + free_large_item_vec(ctx->report.reachable_objects.blobs.vec_largest_by_size_bytes); + ref_array_clear(&ctx->ref_array); strvec_clear(&ctx->refs); } @@ -349,6 +676,128 @@ static void print_table_plaintext(struct survey_table *table) free(column_widths); } +/* + * Print one histogram as a table: one row per non-empty bin, labelled + * with the value range that bin `k` covers given `bin_shift`/`bin_mask`. + */ +static void pretty_print_bin_table(const char *title_caption, + const char *bucket_header, + struct obj_hist_bin *bin, + uint64_t bin_len, int bin_shift, uint64_t bin_mask) +{ + struct survey_table table = SURVEY_TABLE_INIT; + struct strbuf bucket = STRBUF_INIT, cnt_seen = STRBUF_INIT; + struct strbuf sum_size = STRBUF_INIT, sum_disk_size = STRBUF_INIT; + uint64_t lower = 0; + uint64_t upper = bin_mask; + + table.table_name = title_caption; + strvec_pushl(&table.header, bucket_header, "Count", "Size", "Disk Size", NULL); + + for (uint64_t k = 0; k < bin_len; k++) { + struct obj_hist_bin *p = bin + k; + uint64_t lower_k = lower; + uint64_t upper_k = upper; + + lower = upper+1; + upper = (upper << bin_shift) + bin_mask; + + if (!p->cnt_seen) + continue; + + strbuf_reset(&bucket); + strbuf_addf(&bucket, "%"PRIu64"..%"PRIu64, lower_k, upper_k); + + strbuf_reset(&cnt_seen); + strbuf_addf(&cnt_seen, "%"PRIuMAX, (uintmax_t)p->cnt_seen); + + 
strbuf_reset(&sum_size); + strbuf_addf(&sum_size, "%"PRIuMAX, (uintmax_t)p->sum_size); + + strbuf_reset(&sum_disk_size); + strbuf_addf(&sum_disk_size, "%"PRIuMAX, (uintmax_t)p->sum_disk_size); + + /* the row is a NULL-terminated argument list, as at the other call sites */ + insert_table_rowv(&table, bucket.buf, + cnt_seen.buf, sum_size.buf, sum_disk_size.buf, NULL); + } + strbuf_release(&bucket); + strbuf_release(&cnt_seen); + strbuf_release(&sum_size); + strbuf_release(&sum_disk_size); + + print_table_plaintext(&table); + clear_table(&table); +} + +static void survey_report_hbin(const char *title_caption, + struct obj_hist_bin *bin) +{ + pretty_print_bin_table(title_caption, + "Byte Range", + bin, + HBIN_LEN, HBIN_SHIFT, HBIN_MASK); +} + +static void survey_report_tree_lengths(struct survey_context *ctx) +{ + pretty_print_bin_table(_("TREE HISTOGRAM BY NUMBER OF ENTRIES"), + "Entry Range", + ctx->report.reachable_objects.trees.entry_qbin, + QBIN_LEN, QBIN_SHIFT, QBIN_MASK); +} + +static void survey_report_commit_parents(struct survey_context *ctx) +{ + struct survey_stats_commits *psc = &ctx->report.reachable_objects.commits; + struct survey_table table = SURVEY_TABLE_INIT; + struct strbuf parents = STRBUF_INIT, counts = STRBUF_INIT; + + table.table_name = _("HISTOGRAM BY NUMBER OF COMMIT PARENTS"); + strvec_pushl(&table.header, "Parents", "Counts", NULL); + + for (int k = 0; k < PBIN_VEC_LEN; k++) + if (psc->parent_cnt_pbin[k]) { + strbuf_reset(&parents); + strbuf_addf(&parents, "%02d", k); + + strbuf_reset(&counts); + strbuf_addf(&counts, "%14"PRIuMAX, (uintmax_t)psc->parent_cnt_pbin[k]); + + insert_table_rowv(&table, parents.buf, counts.buf, NULL); + } + strbuf_release(&parents); + strbuf_release(&counts); + + print_table_plaintext(&table); + clear_table(&table); +} + +static void survey_report_largest_vec(struct survey_context *ctx, struct large_item_vec *vec) +{ + struct survey_table table = SURVEY_TABLE_INIT; + struct strbuf size = STRBUF_INIT; + + if (!vec || !vec->nr_items) + return; + + table.table_name = vec->dimension_label; + 
strvec_pushl(&table.header, "Size", "OID", "Name", "Commit", ctx->opts.show_name_rev ? "Name-Rev" : NULL, NULL); + + for (int k = 0; k < vec->nr_items; k++) { + struct large_item *pk = &vec->items[k]; + if (!is_null_oid(&pk->oid)) { + strbuf_reset(&size); + strbuf_addf(&size, "%"PRIuMAX, (uintmax_t)pk->size); + + insert_table_rowv(&table, size.buf, oid_to_hex(&pk->oid), pk->name.buf, + is_null_oid(&pk->containing_commit_oid) ? + "" : oid_to_hex(&pk->containing_commit_oid), + !ctx->opts.show_name_rev ? NULL : pk->name_rev.len ? pk->name_rev.buf : "", + NULL); + } + } + strbuf_release(&size); + + print_table_plaintext(&table); + clear_table(&table); +} + static void survey_report_plaintext_refs(struct survey_context *ctx) { struct survey_report_ref_summary *refs = &ctx->report.refs; @@ -380,6 +829,42 @@ static void survey_report_plaintext_refs(struct survey_context *ctx) free(fmt); } + /* + * SymRefs are somewhat orthogonal to the above classification (e.g. + * "HEAD" --> detached and "refs/remotes/origin/HEAD" --> remote) so the + * above classified counts will already include them, but it is less + * confusing to display them here than to create a whole new section. 
+ */ + if (ctx->report.refs.cnt_symref) { + char *fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->cnt_symref); + insert_table_rowv(&table, _("Symbolic refs"), fmt, NULL); + free(fmt); + } + + if (ctx->report.refs.cnt_loose || ctx->report.refs.cnt_packed) { + char *fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->cnt_loose); + insert_table_rowv(&table, _("Loose refs"), fmt, NULL); + free(fmt); + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->cnt_packed); + insert_table_rowv(&table, _("Packed refs"), fmt, NULL); + free(fmt); + } + + if (ctx->report.refs.len_max_local_refname || ctx->report.refs.len_max_remote_refname) { + char *fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->len_max_local_refname); + insert_table_rowv(&table, _("Max local refname length"), fmt, NULL); + free(fmt); + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->len_sum_local_refnames); + insert_table_rowv(&table, _("Sum local refnames length"), fmt, NULL); + free(fmt); + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->len_max_remote_refname); + insert_table_rowv(&table, _("Max remote refname length"), fmt, NULL); + free(fmt); + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->len_sum_remote_refnames); + insert_table_rowv(&table, _("Sum remote refnames length"), fmt, NULL); + free(fmt); + } + print_table_plaintext(&table); clear_table(&table); } @@ -465,6 +950,19 @@ static void survey_report_plaintext(struct survey_context *ctx) ctx->report.by_type, REPORT_TYPE_COUNT); + survey_report_commit_parents(ctx); + + survey_report_hbin(_("COMMITS HISTOGRAM BY SIZE IN BYTES"), + ctx->report.reachable_objects.commits.base.size_hbin); + + survey_report_tree_lengths(ctx); + + survey_report_hbin(_("TREES HISTOGRAM BY SIZE IN BYTES"), + ctx->report.reachable_objects.trees.base.size_hbin); + + survey_report_hbin(_("BLOBS HISTOGRAM BY SIZE IN BYTES"), + ctx->report.reachable_objects.blobs.base.size_hbin); + survey_report_plaintext_sorted_size( &ctx->report.top_paths_by_count[REPORT_TYPE_TREE]); survey_report_plaintext_sorted_size( @@ 
-479,6 +977,12 @@ static void survey_report_plaintext(struct survey_context *ctx) &ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]); survey_report_plaintext_sorted_size( &ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]); + + survey_report_largest_vec(ctx, ctx->report.reachable_objects.commits.vec_largest_by_nr_parents); + survey_report_largest_vec(ctx, ctx->report.reachable_objects.commits.vec_largest_by_size_bytes); + survey_report_largest_vec(ctx, ctx->report.reachable_objects.trees.vec_largest_by_nr_entries); + survey_report_largest_vec(ctx, ctx->report.reachable_objects.trees.vec_largest_by_size_bytes); + survey_report_largest_vec(ctx, ctx->report.reachable_objects.blobs.vec_largest_by_size_bytes); } /* @@ -550,6 +1054,31 @@ static int survey_load_config_cb(const char *var, const char *value, ctx->opts.show_progress = git_config_bool(var, value); return 0; } + if (!strcmp(var, "survey.namerev")) { + ctx->opts.show_name_rev = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "survey.showcommitparents")) { + ctx->opts.show_largest_commits_by_nr_parents = git_config_ulong(var, value, cctx->kvi); + return 0; + } + if (!strcmp(var, "survey.showcommitsizes")) { + ctx->opts.show_largest_commits_by_size_bytes = git_config_ulong(var, value, cctx->kvi); + return 0; + } + + if (!strcmp(var, "survey.showtreeentries")) { + ctx->opts.show_largest_trees_by_nr_entries = git_config_ulong(var, value, cctx->kvi); + return 0; + } + if (!strcmp(var, "survey.showtreesizes")) { + ctx->opts.show_largest_trees_by_size_bytes = git_config_ulong(var, value, cctx->kvi); + return 0; + } + if (!strcmp(var, "survey.showblobsizes")) { + ctx->opts.show_largest_blobs_by_size_bytes = git_config_ulong(var, value, cctx->kvi); + return 0; + } if (!strcmp(var, "survey.top")) { ctx->opts.top_nr = git_config_bool(var, value); return 0; @@ -614,6 +1143,80 @@ static void do_load_refs(struct survey_context *ctx, ref_sorting_release(sorting); } +/* + * Try to run `git name-rev` on each 
of the containing-commit-oid's + * in this large-item-vec to get a pretty name for each OID. Silently + * ignore errors if it fails because this info is nice to have but not + * essential. + * + * One line of output is expected per item; any extra output is ignored + * so that we never write past the end of the vector. + */ +static void large_item_vec_lookup_name_rev(struct survey_context *ctx, + struct large_item_vec *vec) +{ + struct child_process cp = CHILD_PROCESS_INIT; + struct strbuf in = STRBUF_INIT; + struct strbuf out = STRBUF_INIT; + const char *line; + size_t k; + + if (!vec || !vec->nr_items) + return; + + ctx->progress_total += vec->nr_items; + display_progress(ctx->progress, ctx->progress_total); + + for (k = 0; k < vec->nr_items; k++) + strbuf_addf(&in, "%s\n", oid_to_hex(&vec->items[k].containing_commit_oid)); + + cp.git_cmd = 1; + strvec_pushl(&cp.args, "name-rev", "--name-only", "--annotate-stdin", NULL); + if (pipe_command(&cp, in.buf, in.len, &out, 0, NULL, 0)) { + strbuf_release(&in); + strbuf_release(&out); + return; + } + + line = out.buf; + k = 0; + while (k < vec->nr_items && *line) { + const char *eol = strchrnul(line, '\n'); + + strbuf_init(&vec->items[k].name_rev, 0); + strbuf_add(&vec->items[k].name_rev, line, (eol - line)); + + /* do not step past the terminating NUL when the last line has no LF */ + line = *eol ? eol + 1 : eol; + k++; + } + + strbuf_release(&in); + strbuf_release(&out); +} + +static void do_lookup_name_rev(struct survey_context *ctx) +{ + /* + * `git name-rev` can be very expensive when there are lots of + * refs, so make it optional. 
+ */ + if (!ctx->opts.show_name_rev) + return; + + if (ctx->opts.show_progress) { + ctx->progress_total = 0; + ctx->progress = start_progress(_("Resolving name-revs..."), 0); + } + + large_item_vec_lookup_name_rev(ctx, ctx->report.reachable_objects.commits.vec_largest_by_nr_parents); + large_item_vec_lookup_name_rev(ctx, ctx->report.reachable_objects.commits.vec_largest_by_size_bytes); + + large_item_vec_lookup_name_rev(ctx, ctx->report.reachable_objects.trees.vec_largest_by_nr_entries); + large_item_vec_lookup_name_rev(ctx, ctx->report.reachable_objects.trees.vec_largest_by_size_bytes); + + large_item_vec_lookup_name_rev(ctx, ctx->report.reachable_objects.blobs.vec_largest_by_size_bytes); + + if (ctx->opts.show_progress) + stop_progress(&ctx->progress); +} + /* * The REFS phase: * @@ -637,6 +1240,7 @@ static void survey_phase_refs(struct survey_context *ctx) for (size_t i = 0; i < ctx->ref_array.nr; i++) { unsigned long size; struct ref_array_item *item = ctx->ref_array.items[i]; + size_t len = strlen(item->refname); switch (item->kind) { case FILTER_REFS_TAGS: @@ -663,6 +1267,33 @@ static void survey_phase_refs(struct survey_context *ctx) ctx->report.refs.unknown_nr++; break; } + + /* + * SymRefs are somewhat orthogonal to the above + * classification (e.g. "HEAD" --> detached + * and "refs/remotes/origin/HEAD" --> remote) so + * our totals will already include them. + */ + if (item->flag & REF_ISSYMREF) + ctx->report.refs.cnt_symref++; + + /* + * Where/how is the ref stored in GITDIR. 
+ */ + if (item->flag & REF_ISPACKED) + ctx->report.refs.cnt_packed++; + else + ctx->report.refs.cnt_loose++; + + if (item->kind == FILTER_REFS_REMOTES) { + ctx->report.refs.len_sum_remote_refnames += len; + if (len > ctx->report.refs.len_max_remote_refname) + ctx->report.refs.len_max_remote_refname = len; + } else { + ctx->report.refs.len_sum_local_refnames += len; + if (len > ctx->report.refs.len_max_local_refname) + ctx->report.refs.len_max_local_refname = len; + } } trace2_region_leave("survey", "phase/refs", ctx->repo); @@ -697,7 +1328,8 @@ static void increment_object_counts( static void increment_totals(struct survey_context *ctx, struct oid_array *oids, - struct survey_report_object_size_summary *summary) + struct survey_report_object_size_summary *summary, + const char *path) { for (size_t i = 0; i < oids->nr; i++) { struct object_info oi = OBJECT_INFO_INIT; @@ -705,6 +1337,8 @@ static void increment_totals(struct survey_context *ctx, unsigned long object_length = 0; off_t disk_sizep = 0; enum object_type type; + struct survey_stats_base_object *base; + int hb; oi.typep = &type; oi.sizep = &object_length; @@ -713,11 +1347,86 @@ static void increment_totals(struct survey_context *ctx, if (oid_object_info_extended(ctx->repo, &oids->oid[i], &oi, oi_flags) < 0) { summary->num_missing++; - } else { - summary->nr++; - summary->disk_size += disk_sizep; - summary->inflated_size += object_length; + continue; + } + + summary->nr++; + summary->disk_size += disk_sizep; + summary->inflated_size += object_length; + + switch (type) { + case OBJ_COMMIT: { + struct commit *commit = lookup_commit(ctx->repo, &oids->oid[i]); + unsigned k = commit_list_count(commit->parents); + + if (k >= PBIN_VEC_LEN) + k = PBIN_VEC_LEN - 1; + + ctx->report.reachable_objects.commits.parent_cnt_pbin[k]++; + base = &ctx->report.reachable_objects.commits.base; + + maybe_insert_large_item(ctx->report.reachable_objects.commits.vec_largest_by_nr_parents, k, &commit->object.oid, NULL, 
&commit->object.oid); + maybe_insert_large_item(ctx->report.reachable_objects.commits.vec_largest_by_size_bytes, object_length, &commit->object.oid, NULL, &commit->object.oid); + break; + } + case OBJ_TREE: { + struct tree *tree = lookup_tree(ctx->repo, &oids->oid[i]); + if (tree) { + struct survey_stats_trees *pst = &ctx->report.reachable_objects.trees; + struct tree_desc desc; + struct name_entry entry; + int nr_entries; + int qb; + + parse_tree(tree); + init_tree_desc(&desc, &oids->oid[i], tree->buffer, tree->size); + nr_entries = 0; + while (tree_entry(&desc, &entry)) + nr_entries++; + + pst->sum_entries += nr_entries; + + maybe_insert_large_item(pst->vec_largest_by_nr_entries, nr_entries, &tree->object.oid, path, NULL); + maybe_insert_large_item(pst->vec_largest_by_size_bytes, object_length, &tree->object.oid, path, NULL); + + qb = qbin(nr_entries); + incr_obj_hist_bin(&pst->entry_qbin[qb], object_length, disk_sizep); + } + base = &ctx->report.reachable_objects.trees.base; + break; + } + case OBJ_BLOB: + base = &ctx->report.reachable_objects.blobs.base; + + maybe_insert_large_item(ctx->report.reachable_objects.blobs.vec_largest_by_size_bytes, object_length, &oids->oid[i], path, NULL); + break; + default: + continue; + } + + switch (oi.whence) { + case OI_CACHED: + base->cnt_cached++; + break; + case OI_LOOSE: + base->cnt_loose++; + break; + case OI_PACKED: + base->cnt_packed++; + break; + case OI_DBCACHED: + base->cnt_dbcached++; + break; + default: + break; } + + base->sum_size += object_length; + base->sum_disk_size += disk_sizep; + + hb = hbin(object_length); + incr_obj_hist_bin(&base->size_hbin[hb], object_length, disk_sizep); + } } @@ -729,7 +1438,7 @@ static void increment_object_totals(struct survey_context *ctx, struct survey_report_object_size_summary *total; struct survey_report_object_size_summary summary = { 0 }; - increment_totals(ctx, oids, &summary); + increment_totals(ctx, oids, &summary, path); switch (type) { case OBJ_COMMIT: @@ -861,6 
+1570,12 @@ static void survey_phase_objects(struct survey_context *ctx) release_revisions(&revs); trace2_region_leave("survey", "phase/objects", ctx->repo); + + /* every trace2 region that is entered must also be left */ + if (ctx->opts.show_name_rev) { + trace2_region_enter("survey", "phase/namerev", ctx->repo); + do_lookup_name_rev(ctx); + trace2_region_leave("survey", "phase/namerev", ctx->repo); + } } int cmd_survey(int argc, const char **argv, const char *prefix, struct repository *repo) @@ -869,7 +1584,7 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor .opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ - .top_nr = 100, + .top_nr = 10, .refs.want_all_refs = -1, @@ -885,6 +1600,7 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor static struct option survey_options[] = { OPT__VERBOSE(&ctx.opts.verbose, N_("verbose output")), OPT_BOOL(0, "progress", &ctx.opts.show_progress, N_("show progress")), + OPT_BOOL(0, "name-rev", &ctx.opts.show_name_rev, N_("run name-rev on each reported commit")), OPT_INTEGER('n', "top", &ctx.opts.top_nr, N_("number of entries to include in detail tables")), @@ -896,6 +1612,14 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor OPT_BOOL_F(0, "detached", &ctx.opts.refs.want_detached, N_("include detached HEAD"), PARSE_OPT_NONEG), OPT_BOOL_F(0, "other", &ctx.opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG), + OPT_INTEGER_F(0, "commit-parents", &ctx.opts.show_largest_commits_by_nr_parents, N_("show N largest commits by parent count"), PARSE_OPT_NONEG), + OPT_INTEGER_F(0, "commit-sizes", &ctx.opts.show_largest_commits_by_size_bytes, N_("show N largest commits by size in bytes"), PARSE_OPT_NONEG), + + OPT_INTEGER_F(0, "tree-entries", &ctx.opts.show_largest_trees_by_nr_entries, N_("show N largest trees by entry count"), PARSE_OPT_NONEG), + OPT_INTEGER_F(0, "tree-sizes", &ctx.opts.show_largest_trees_by_size_bytes, N_("show N largest trees by size in 
bytes"), PARSE_OPT_NONEG), + + OPT_INTEGER_F(0, "blob-sizes", &ctx.opts.show_largest_blobs_by_size_bytes, N_("show N largest blobs by size in bytes"), PARSE_OPT_NONEG), + OPT_END(), }; @@ -919,6 +1643,39 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor fixup_refs_wanted(&ctx); + if (ctx.opts.show_largest_commits_by_nr_parents) + ctx.report.reachable_objects.commits.vec_largest_by_nr_parents = + alloc_large_item_vec( + "largest_commits_by_nr_parents", + "nr_parents", + ctx.opts.show_largest_commits_by_nr_parents); + if (ctx.opts.show_largest_commits_by_size_bytes) + ctx.report.reachable_objects.commits.vec_largest_by_size_bytes = + alloc_large_item_vec( + "largest_commits_by_size_bytes", + "size", + ctx.opts.show_largest_commits_by_size_bytes); + + if (ctx.opts.show_largest_trees_by_nr_entries) + ctx.report.reachable_objects.trees.vec_largest_by_nr_entries = + alloc_large_item_vec( + "largest_trees_by_nr_entries", + "nr_entries", + ctx.opts.show_largest_trees_by_nr_entries); + if (ctx.opts.show_largest_trees_by_size_bytes) + ctx.report.reachable_objects.trees.vec_largest_by_size_bytes = + alloc_large_item_vec( + "largest_trees_by_size_bytes", + "size", + ctx.opts.show_largest_trees_by_size_bytes); + + if (ctx.opts.show_largest_blobs_by_size_bytes) + ctx.report.reachable_objects.blobs.vec_largest_by_size_bytes = + alloc_large_item_vec( + "largest_blobs_by_size_bytes", + "size", + ctx.opts.show_largest_blobs_by_size_bytes); + survey_phase_refs(&ctx); survey_phase_objects(&ctx); @@ -928,3 +1685,143 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor clear_survey_context(&ctx); return 0; } + +/* + * NEEDSWORK: So far, I only have iteration on the requested set of + * refs and treewalk/reachable objects on that set of refs. The + * following is a bit of a laundry list of things that I'd like to + * add. + * + * [] Dump stats on all of the packfiles. The number and size of each. 
+ * Whether each is in the .git directory or in an alternate. The + * state of the IDX or MIDX files, etc. Delta chain stats. All + * of this data is relative to the "lived-in" state of the + * repository. Stuff that may change after a GC or repack. + * + * [] Clone and Index stats. partial, shallow, sparse-checkout, + * sparse-index, etc. Hydration stats. + * + * [] Dump stats on each remote. When we fetch from a remote the size + * of the response is related to the set of haves on the server. + * You can see this in `GIT_TRACE_CURL=1 git fetch`. We get a + * `ls-refs` payload that lists all of the branches and tags on the + * server, so at a minimum the RefName and SHA for each. But for + * annotated tags we also get the peeled SHA. The size of this + * overhead on every fetch is proportional to the size of the `git + * ls-remote` response (roughly, although the latter repeats the + * RefName of the peeled tag). If, for example, you have 500K refs + * on a remote, you're going to have a long "haves" message, so + * every fetch will be slow just because of that overhead (not + * counting new objects to be downloaded). + * + * Note that the local set of tags in "refs/tags/" is a union over + * all remotes. However, since most people only have one remote, + * we can probably estimate the overhead value directly from the + * size of the set of "refs/tags/" that we visited while building + * the `ref_info` and `ref_array` and not need to ask the remote. + * + * [] Should the "string length of refnames / remote refs", for + * example, be sub-divided by remote so we can project the + * cost of the haves/wants overhead of a fetch? + * + * [] Can we examine the merge commits and classify them as clean or + * dirty? (i.e. ones with merge conflicts that needed to be + * addressed during the merge itself.) + * + * [] Do dirty merges affect performance of later operations? + * + * [] Dump info on the complexity of the DAG. Criss-cross merges. 
+ * The number of edges that must be touched to compute merge bases. + * Edge length. The number of parallel lanes in the history that + * must be navigated to get to the merge base. What affects the + * cost of the Ahead/Behind computation? How often do + * criss-crosses occur and do they cause various operations to slow + * down? + * + * [] If there are primary branches (like "main" or "master"), are they + * always on the left side of merges? Does the graph have a clean + * left edge? Or are there normal and "backwards" merges? Do + * these cause problems at scale? + * + * [] If we have a hierarchy of FI/RI branches like "L1", "L2", ..., + * can we learn anything about the shape of the repo around these + * FI and RI integrations? + * + * [] Do we need a no-PII flag to omit pathnames or branch/tag names + * in the various histograms? (This would turn off --name-rev + * too.) + * + * [] I have so far avoided adding opinions about individual fields + * (such as the way `git-sizer` prints a row of stars or bangs in + * the last column). + * + * I'm wondering if that is a job of this executable or if it + * should be done in a post-processing step using the JSON output. + * + * My problem with the `git-sizer` approach is that it doesn't give + * the (casual) user any information on why it has stars or bangs. + * And there isn't a good way to print detailed information in the + * ASCII-art tables that would be easy to understand. + * + * [] For example, a large number of refs does not define a cliff. + * Performance will drop off (linearly, quadratically, ... ??). + * The tool should refer them to article(s) talking about the + * different problems that it could cause. So should `git + * survey` just print the number and (implicitly) refer them to + * the man page (chapter/verse) or to a tool that will interpret + * the number and explain it? + * + * [] Alternatively, should `git survey` do that analysis too and + * just print footnotes for each large number? 
+ * + * [] The computation of the raw survey JSON data can take HOURS on + * a very large repo (like Windows), so I'm wondering if we + * want to keep the opinion portion separate. + * + * [] In addition to opinions based on the static data, I would like + * to dump the JSON results (or the Trace2 telemetry) into a DB and + * aggregate it with other users. + * + * Granted, they should all see the same DAG and the same set of + * reachable objects, but we could average across all datasets + * generated on a particular date and detect outlier users. + * + * [] Maybe someone cloned from the `_full` endpoint rather than + * the limited refs endpoint. + * + * [] Maybe that user is having problems with repacking / GC / + * maintenance without knowing it. + * + * [] I'd also like to use the DB to compare survey datasets over + * time. How fast is their repository growing and in what ways? + * + * [] I'd rather have the delta analysis NOT be inside `git + * survey`, so it makes sense to consider having all of it in a + * post-process step. + * + * [] Another reason to put the opinion analysis in a post-process + * is that it would be easier to generate plots on the data tables. + * Granted, we can get plots from telemetry, but a stand-alone user + * could run the JSON through python or jq or something and generate + * something nicer than ASCII-art and it could handle cross-referencing + * and hyperlinking to helpful information on each issue. + * + * [] I think there are several classes of data that we can report on: + * + * [] The "inherent repo properties", such as the shape and size of + * the DAG -- these should be universal in each enlistment. + * + * [] The "ODB lived-in properties", such as the efficiency + * of the repack and things like partial and shallow clone. + * These will vary, but indicate health of the ODB. + * + * [] The "index related properties", such as sparse-checkout, + * sparse-index, cache-tree, untracked-cache, fsmonitor, + * etc. 
These will also vary, but are more like knobs for + * the user to adjust. + * + * [] I want to compare these with Matt's "dimensions of scale" + * notes and see if there are other pieces of data that we + * could compute/consider. + * + */ diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 8c6edfcae0c6c2..0d35dfcf311827 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -59,13 +59,20 @@ test_expect_success 'git survey (default)' ' ----------------------------------------------------- REFERENCES SUMMARY - ======================== - , Ref Type | Count - -----------------+------ - , Branches | 1 - Remote refs | 0 - Tags (all) | 2 - Tags (annotated) | 2 + ================================== + , Ref Type | Count + ---------------------------+------ + , Branches | 1 + , Remote refs | 0 + , Tags (all) | 2 + , Tags (annotated) | 2 + , Symbolic refs | 1 + , Loose refs | 4 + , Packed refs | 0 + Max local refname length | 15 + Sum local refnames length | 46 + Max remote refname length | 0 + Sum remote refnames length | 0 REACHABLE OBJECT SUMMARY ========================