From 8752901aae39953ac07da3933ad296c44259bc97 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 10 Sep 2024 02:28:34 +0000 Subject: [PATCH 1/9] survey: stub in new experimental `git-survey` command Start work on a new `git survey` command to scan the repository for monorepo performance and scaling problems. The goal is to measure the various known "dimensions of scale" and serve as a foundation for adding additional measurements as we learn more about Git monorepo scaling problems. The initial goal is to complement the scanning and analysis performed by the GO-based `git-sizer` (https://github.com/github/git-sizer) tool. It is hoped that by creating a builtin command, we may be able to take advantage of internal Git data structures and code that is not accessible from GO to gain further insight into potential scaling problems. RFC TODO: Adapt this boilerplate to match the upcoming changes to builtin methods that include a 'struct repository' pointer. Co-authored-by: Derrick Stolee Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee --- .gitignore | 1 + Documentation/git-survey.adoc | 36 +++++++++++++++++++++++ Documentation/meson.build | 1 + Makefile | 1 + builtin.h | 1 + builtin/survey.c | 54 +++++++++++++++++++++++++++++++++++ command-list.txt | 1 + git.c | 1 + meson.build | 1 + t/meson.build | 1 + t/t8100-git-survey.sh | 18 ++++++++++++ 11 files changed, 116 insertions(+) create mode 100644 Documentation/git-survey.adoc create mode 100644 builtin/survey.c create mode 100755 t/t8100-git-survey.sh diff --git a/.gitignore b/.gitignore index 04c444404e4..078fc537292 100644 --- a/.gitignore +++ b/.gitignore @@ -166,6 +166,7 @@ /git-submodule /git-submodule--helper /git-subtree +/git-survey /git-svn /git-switch /git-symbolic-ref diff --git a/Documentation/git-survey.adoc b/Documentation/git-survey.adoc new file mode 100644 index 00000000000..fbeb630fc24 --- /dev/null +++ b/Documentation/git-survey.adoc @@ -0,0 +1,36 @@ +git-survey(1) +============= + 
+NAME +---- +git-survey - EXPERIMENTAL: Measure various repository dimensions of scale + +SYNOPSIS +-------- +[verse] +(EXPERIMENTAL!) git survey + +DESCRIPTION +----------- + +Survey the repository and measure various dimensions of scale. + +As repositories grow to "monorepo" size, certain data shapes can cause +performance problems. `git-survey` attempts to measure and report on +known problem areas. + +OPTIONS +------- + +--progress:: + Show progress. This is automatically enabled when interactive. + +OUTPUT +------ + +By default, `git survey` will print information about the repository in a +human-readable format that includes overviews and tables. + +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Documentation/meson.build b/Documentation/meson.build index 2fe1a1369d4..5baa68a6f00 100644 --- a/Documentation/meson.build +++ b/Documentation/meson.build @@ -141,6 +141,7 @@ manpages = { 'git-status.adoc' : 1, 'git-stripspace.adoc' : 1, 'git-submodule.adoc' : 1, + 'git-survey.adoc' : 1, 'git-svn.adoc' : 1, 'git-switch.adoc' : 1, 'git-symbolic-ref.adoc' : 1, diff --git a/Makefile b/Makefile index 70d1543b6b8..8fc7e2458a7 100644 --- a/Makefile +++ b/Makefile @@ -1323,6 +1323,7 @@ BUILTIN_OBJS += builtin/sparse-checkout.o BUILTIN_OBJS += builtin/stash.o BUILTIN_OBJS += builtin/stripspace.o BUILTIN_OBJS += builtin/submodule--helper.o +BUILTIN_OBJS += builtin/survey.o BUILTIN_OBJS += builtin/symbolic-ref.o BUILTIN_OBJS += builtin/tag.o BUILTIN_OBJS += builtin/unpack-file.o diff --git a/builtin.h b/builtin.h index bff13e3069b..b0f928dfd3b 100644 --- a/builtin.h +++ b/builtin.h @@ -233,6 +233,7 @@ int cmd_status(int argc, const char **argv, const char *prefix, struct repositor int cmd_stash(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_stripspace(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_submodule__helper(int argc, const char **argv, const char *prefix, struct repository *repo); +int 
cmd_survey(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_switch(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_symbolic_ref(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_tag(int argc, const char **argv, const char *prefix, struct repository *repo); diff --git a/builtin/survey.c b/builtin/survey.c new file mode 100644 index 00000000000..a9c06486a93 --- /dev/null +++ b/builtin/survey.c @@ -0,0 +1,54 @@ +#include "builtin.h" +#include "config.h" +#include "parse-options.h" + +static const char * const survey_usage[] = { + N_("(EXPERIMENTAL!) git survey "), + NULL, +}; + +struct survey_opts { + int verbose; + int show_progress; +}; + +static struct survey_opts survey_opts = { + .verbose = 0, + .show_progress = -1, /* defaults to isatty(2) */ +}; + +static struct option survey_options[] = { + OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")), + OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")), + OPT_END(), +}; + +static int survey_load_config_cb(const char *var, const char *value, + const struct config_context *ctx, void *pvoid) +{ + if (!strcmp(var, "survey.verbose")) { + survey_opts.verbose = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "survey.progress")) { + survey_opts.show_progress = git_config_bool(var, value); + return 0; + } + + return git_default_config(var, value, ctx, pvoid); +} + +int cmd_survey(int argc, const char **argv, const char *prefix, struct repository *repo) +{ + show_usage_with_options_if_asked(argc, argv, survey_usage, survey_options); + + prepare_repo_settings(repo); + repo_config(repo, survey_load_config_cb, NULL); + + argc = parse_options(argc, argv, prefix, survey_options, survey_usage, 0); + + if (survey_opts.show_progress < 0) + survey_opts.show_progress = isatty(2); + + return 0; +} diff --git a/command-list.txt b/command-list.txt index b7ade3ab9f3..8872958cf48 100644 --- 
a/command-list.txt +++ b/command-list.txt @@ -188,6 +188,7 @@ git-stash mainporcelain git-status mainporcelain info git-stripspace purehelpers git-submodule mainporcelain +git-survey mainporcelain git-svn foreignscminterface git-switch mainporcelain history git-symbolic-ref plumbingmanipulators diff --git a/git.c b/git.c index 07a5fe39fb6..d4a9ca63c95 100644 --- a/git.c +++ b/git.c @@ -630,6 +630,7 @@ static struct cmd_struct commands[] = { { "status", cmd_status, RUN_SETUP | NEED_WORK_TREE }, { "stripspace", cmd_stripspace }, { "submodule--helper", cmd_submodule__helper, RUN_SETUP }, + { "survey", cmd_survey, RUN_SETUP }, { "switch", cmd_switch, RUN_SETUP | NEED_WORK_TREE }, { "symbolic-ref", cmd_symbolic_ref, RUN_SETUP }, { "tag", cmd_tag, RUN_SETUP | DELAY_PAGER_CONFIG }, diff --git a/meson.build b/meson.build index 7fea4a34d68..8af29520c6f 100644 --- a/meson.build +++ b/meson.build @@ -660,6 +660,7 @@ builtin_sources = [ 'builtin/stash.c', 'builtin/stripspace.c', 'builtin/submodule--helper.c', + 'builtin/survey.c', 'builtin/symbolic-ref.c', 'builtin/tag.c', 'builtin/unpack-file.c', diff --git a/t/meson.build b/t/meson.build index 6d7fe6b117e..56fdcdb050f 100644 --- a/t/meson.build +++ b/t/meson.build @@ -961,6 +961,7 @@ integration_tests = [ 't8012-blame-colors.sh', 't8013-blame-ignore-revs.sh', 't8014-blame-ignore-fuzzy.sh', + 't8100-git-survey.sh', 't9001-send-email.sh', 't9002-column.sh', 't9003-help-autocorrect.sh', diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh new file mode 100755 index 00000000000..d9816419855 --- /dev/null +++ b/t/t8100-git-survey.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +test_description='git survey' + +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME + +TEST_PASSES_SANITIZE_LEAK=0 +export TEST_PASSES_SANITIZE_LEAK + +. ./test-lib.sh + +test_expect_success 'git survey -h shows experimental warning' ' + test_expect_code 129 git survey -h >usage && + grep "EXPERIMENTAL!" 
usage +' + +test_done -- GitLab From 6c10e4b14bbec4bef548a0f6a09ac7c9bc63f905 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 10 Sep 2024 02:28:35 +0000 Subject: [PATCH 2/9] survey: add command line opts to select references By default we will scan all references in "refs/heads/", "refs/tags/" and "refs/remotes/". Add command line opts to let the user ask for all refs or a subset of them and to include a detached HEAD. Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee --- Documentation/git-survey.adoc | 34 ++++++++++++ builtin/survey.c | 99 +++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) diff --git a/Documentation/git-survey.adoc b/Documentation/git-survey.adoc index fbeb630fc24..3c3dbef5b00 100644 --- a/Documentation/git-survey.adoc +++ b/Documentation/git-survey.adoc @@ -19,12 +19,46 @@ As repositories grow to "monorepo" size, certain data shapes can cause performance problems. `git-survey` attempts to measure and report on known problem areas. +Ref Selection and Reachable Objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this first analysis phase, `git survey` will iterate over the set of +requested branches, tags, and other refs and treewalk over all of the +reachable commits, trees, and blobs and generate various statistics. + OPTIONS ------- --progress:: Show progress. This is automatically enabled when interactive. +Ref Selection +~~~~~~~~~~~~~ + +The following options control the set of refs that `git survey` will examine. +By default, `git survey` will look at tags, local branches, and remote refs. +If any of the following options are given, the default set is cleared and +only refs for the given options are added. + +--all-refs:: + Use all refs. This includes local branches, tags, remote refs, + notes, and stashes. This option overrides all of the following. + +--branches:: + Add local branches (`refs/heads/`) to the set. + +--tags:: + Add tags (`refs/tags/`) to the set. 
+ +--remotes:: + Add remote branches (`refs/remote/`) to the set. + +--detached:: + Add HEAD to the set. + +--other:: + Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set. + OUTPUT ------ diff --git a/builtin/survey.c b/builtin/survey.c index a9c06486a93..42188af9b8b 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -7,19 +7,117 @@ static const char * const survey_usage[] = { NULL, }; +struct survey_refs_wanted { + int want_all_refs; /* special override */ + + int want_branches; + int want_tags; + int want_remotes; + int want_detached; + int want_other; /* see FILTER_REFS_OTHERS -- refs/notes/, refs/stash/ */ +}; + +/* + * The set of refs that we will search if the user doesn't select + * any on the command line. + */ +static struct survey_refs_wanted refs_if_unspecified = { + .want_all_refs = 0, + + .want_branches = 1, + .want_tags = 1, + .want_remotes = 1, + .want_detached = 0, + .want_other = 0, +}; + struct survey_opts { int verbose; int show_progress; + struct survey_refs_wanted refs; }; static struct survey_opts survey_opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ + + .refs.want_all_refs = -1, + + .refs.want_branches = -1, /* default these to undefined */ + .refs.want_tags = -1, + .refs.want_remotes = -1, + .refs.want_detached = -1, + .refs.want_other = -1, }; +/* + * After parsing the command line arguments, figure out which refs we + * should scan. + * + * If ANY were given in positive sense, then we ONLY include them and + * do not use the builtin values. + */ +static void fixup_refs_wanted(void) +{ + struct survey_refs_wanted *rw = &survey_opts.refs; + + /* + * `--all-refs` overrides and enables everything. + */ + if (rw->want_all_refs == 1) { + rw->want_branches = 1; + rw->want_tags = 1; + rw->want_remotes = 1; + rw->want_detached = 1; + rw->want_other = 1; + return; + } + + /* + * If none of the `--` were given, we assume all + * of the builtin unspecified values. 
+ */ + if (rw->want_branches == -1 && + rw->want_tags == -1 && + rw->want_remotes == -1 && + rw->want_detached == -1 && + rw->want_other == -1) { + *rw = refs_if_unspecified; + return; + } + + /* + * Since we only allow positive boolean values on the command + * line, we will only have true values where they specified + * a `--`. + * + * So anything that still has an unspecified value should be + * set to false. + */ + if (rw->want_branches == -1) + rw->want_branches = 0; + if (rw->want_tags == -1) + rw->want_tags = 0; + if (rw->want_remotes == -1) + rw->want_remotes = 0; + if (rw->want_detached == -1) + rw->want_detached = 0; + if (rw->want_other == -1) + rw->want_other = 0; +} + static struct option survey_options[] = { OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")), OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")), + + OPT_BOOL_F(0, "all-refs", &survey_opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), + + OPT_BOOL_F(0, "branches", &survey_opts.refs.want_branches, N_("include branches"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "tags", &survey_opts.refs.want_tags, N_("include tags"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "remotes", &survey_opts.refs.want_remotes, N_("include all remotes refs"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "detached", &survey_opts.refs.want_detached, N_("include detached HEAD"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "other", &survey_opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG), + OPT_END(), }; @@ -49,6 +147,7 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor if (survey_opts.show_progress < 0) survey_opts.show_progress = isatty(2); + fixup_refs_wanted(); return 0; } -- GitLab From 809b7a87a134b0d3c28ef53fc20beee446efdb2d Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 10 Sep 2024 02:28:36 +0000 Subject: [PATCH 3/9] survey: collect the set of requested refs Collect the set of requested branches, tags, etc. into a ref_array and collect 
the set of requested patterns into a strvec. RFC TODO: This patch has some changes that should be in the previous patch, to make the diff look a lot better. Co-authored-by: Derrick Stolee Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee --- builtin/survey.c | 254 ++++++++++++++++++++++++++++++++++-------- t/t8100-git-survey.sh | 9 ++ 2 files changed, 215 insertions(+), 48 deletions(-) diff --git a/builtin/survey.c b/builtin/survey.c index 42188af9b8b..e2d11897be5 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -1,6 +1,12 @@ #include "builtin.h" #include "config.h" +#include "object.h" +#include "object-store.h" #include "parse-options.h" +#include "progress.h" +#include "ref-filter.h" +#include "strvec.h" +#include "trace2.h" static const char * const survey_usage[] = { N_("(EXPERIMENTAL!) git survey "), @@ -17,18 +23,8 @@ struct survey_refs_wanted { int want_other; /* see FILTER_REFS_OTHERS -- refs/notes/, refs/stash/ */ }; -/* - * The set of refs that we will search if the user doesn't select - * any on the command line. - */ -static struct survey_refs_wanted refs_if_unspecified = { - .want_all_refs = 0, - - .want_branches = 1, - .want_tags = 1, - .want_remotes = 1, - .want_detached = 0, - .want_other = 0, +static struct survey_refs_wanted default_ref_options = { + .want_all_refs = 1, }; struct survey_opts { @@ -37,19 +33,51 @@ struct survey_opts { struct survey_refs_wanted refs; }; -static struct survey_opts survey_opts = { - .verbose = 0, - .show_progress = -1, /* defaults to isatty(2) */ +struct survey_report_ref_summary { + size_t refs_nr; + size_t branches_nr; + size_t remote_refs_nr; + size_t tags_nr; + size_t tags_annotated_nr; + size_t others_nr; + size_t unknown_nr; +}; + +/** + * This struct contains all of the information that needs to be printed + * at the end of the exploration of the repository and its references. 
+ */ +struct survey_report { + struct survey_report_ref_summary refs; +}; + +struct survey_context { + /* Options that control what is done. */ + struct survey_opts opts; + + /* Info for output only. */ + struct survey_report report; - .refs.want_all_refs = -1, + /* + * The rest of the members are about enabling the activity + * of the 'git survey' command, including ref listings, object + * pointers, and progress. + */ + + struct repository *repo; + + struct progress *progress; + size_t progress_nr; + size_t progress_total; - .refs.want_branches = -1, /* default these to undefined */ - .refs.want_tags = -1, - .refs.want_remotes = -1, - .refs.want_detached = -1, - .refs.want_other = -1, + struct strvec refs; }; +static void clear_survey_context(struct survey_context *ctx) +{ + strvec_clear(&ctx->refs); +} + /* * After parsing the command line arguments, figure out which refs we * should scan. @@ -57,9 +85,9 @@ static struct survey_opts survey_opts = { * If ANY were given in positive sense, then we ONLY include them and * do not use the builtin values. */ -static void fixup_refs_wanted(void) +static void fixup_refs_wanted(struct survey_context *ctx) { - struct survey_refs_wanted *rw = &survey_opts.refs; + struct survey_refs_wanted *rw = &ctx->opts.refs; /* * `--all-refs` overrides and enables everything. 
@@ -82,7 +110,7 @@ static void fixup_refs_wanted(void) rw->want_remotes == -1 && rw->want_detached == -1 && rw->want_other == -1) { - *rw = refs_if_unspecified; + *rw = default_ref_options; return; } @@ -106,48 +134,178 @@ static void fixup_refs_wanted(void) rw->want_other = 0; } -static struct option survey_options[] = { - OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")), - OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")), - - OPT_BOOL_F(0, "all-refs", &survey_opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), - - OPT_BOOL_F(0, "branches", &survey_opts.refs.want_branches, N_("include branches"), PARSE_OPT_NONEG), - OPT_BOOL_F(0, "tags", &survey_opts.refs.want_tags, N_("include tags"), PARSE_OPT_NONEG), - OPT_BOOL_F(0, "remotes", &survey_opts.refs.want_remotes, N_("include all remotes refs"), PARSE_OPT_NONEG), - OPT_BOOL_F(0, "detached", &survey_opts.refs.want_detached, N_("include detached HEAD"), PARSE_OPT_NONEG), - OPT_BOOL_F(0, "other", &survey_opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG), - - OPT_END(), -}; - static int survey_load_config_cb(const char *var, const char *value, - const struct config_context *ctx, void *pvoid) + const struct config_context *cctx, void *pvoid) { + struct survey_context *sctx = pvoid; if (!strcmp(var, "survey.verbose")) { - survey_opts.verbose = git_config_bool(var, value); + sctx->opts.verbose = git_config_bool(var, value); return 0; } if (!strcmp(var, "survey.progress")) { - survey_opts.show_progress = git_config_bool(var, value); + sctx->opts.show_progress = git_config_bool(var, value); return 0; } - return git_default_config(var, value, ctx, pvoid); + return git_default_config(var, value, cctx, pvoid); +} + +static void do_load_refs(struct survey_context *ctx, + struct ref_array *ref_array) +{ + struct ref_filter filter = REF_FILTER_INIT; + struct ref_sorting *sorting; + struct string_list sorting_options = STRING_LIST_INIT_DUP; + + 
string_list_append(&sorting_options, "objectname"); + sorting = ref_sorting_options(&sorting_options); + + if (ctx->opts.refs.want_detached) + strvec_push(&ctx->refs, "HEAD"); + + if (ctx->opts.refs.want_all_refs) { + strvec_push(&ctx->refs, "refs/"); + } else { + if (ctx->opts.refs.want_branches) + strvec_push(&ctx->refs, "refs/heads/"); + if (ctx->opts.refs.want_tags) + strvec_push(&ctx->refs, "refs/tags/"); + if (ctx->opts.refs.want_remotes) + strvec_push(&ctx->refs, "refs/remotes/"); + if (ctx->opts.refs.want_other) { + strvec_push(&ctx->refs, "refs/notes/"); + strvec_push(&ctx->refs, "refs/stash/"); + } + } + + filter.name_patterns = ctx->refs.v; + filter.ignore_case = 0; + filter.match_as_path = 1; + + if (ctx->opts.show_progress) { + ctx->progress_total = 0; + ctx->progress = start_progress(ctx->repo, _("Scanning refs..."), 0); + } + + filter_refs(ref_array, &filter, FILTER_REFS_KIND_MASK); + + if (ctx->opts.show_progress) { + ctx->progress_total = ref_array->nr; + display_progress(ctx->progress, ctx->progress_total); + } + + ref_array_sort(sorting, ref_array); + + stop_progress(&ctx->progress); + ref_filter_clear(&filter); + ref_sorting_release(sorting); +} + +/* + * The REFS phase: + * + * Load the set of requested refs and assess them for scalablity problems. + * Use that set to start a treewalk to all reachable objects and assess + * them. + * + * This data will give us insights into the repository itself (the number + * of refs, the size and shape of the DAG, the number and size of the + * objects). + * + * Theoretically, this data is independent of the on-disk representation + * (e.g. independent of packing concerns). 
+ */ +static void survey_phase_refs(struct survey_context *ctx) +{ + struct ref_array ref_array = { 0 }; + + trace2_region_enter("survey", "phase/refs", ctx->repo); + do_load_refs(ctx, &ref_array); + + ctx->report.refs.refs_nr = ref_array.nr; + for (int i = 0; i < ref_array.nr; i++) { + size_t size; + struct ref_array_item *item = ref_array.items[i]; + + switch (item->kind) { + case FILTER_REFS_TAGS: + ctx->report.refs.tags_nr++; + if (oid_object_info(ctx->repo, + &item->objectname, + &size) == OBJ_TAG) + ctx->report.refs.tags_annotated_nr++; + break; + + case FILTER_REFS_BRANCHES: + ctx->report.refs.branches_nr++; + break; + + case FILTER_REFS_REMOTES: + ctx->report.refs.remote_refs_nr++; + break; + + case FILTER_REFS_OTHERS: + ctx->report.refs.others_nr++; + break; + + default: + ctx->report.refs.unknown_nr++; + break; + } + } + + trace2_region_leave("survey", "phase/refs", ctx->repo); + + ref_array_clear(&ref_array); } int cmd_survey(int argc, const char **argv, const char *prefix, struct repository *repo) { + static struct survey_context ctx = { + .opts = { + .verbose = 0, + .show_progress = -1, /* defaults to isatty(2) */ + + .refs.want_all_refs = -1, + + .refs.want_branches = -1, /* default these to undefined */ + .refs.want_tags = -1, + .refs.want_remotes = -1, + .refs.want_detached = -1, + .refs.want_other = -1, + }, + .refs = STRVEC_INIT, + }; + + static struct option survey_options[] = { + OPT__VERBOSE(&ctx.opts.verbose, N_("verbose output")), + OPT_BOOL(0, "progress", &ctx.opts.show_progress, N_("show progress")), + + OPT_BOOL_F(0, "all-refs", &ctx.opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), + + OPT_BOOL_F(0, "branches", &ctx.opts.refs.want_branches, N_("include branches"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "tags", &ctx.opts.refs.want_tags, N_("include tags"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "remotes", &ctx.opts.refs.want_remotes, N_("include all remotes refs"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "detached", 
&ctx.opts.refs.want_detached, N_("include detached HEAD"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "other", &ctx.opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG), + + OPT_END(), + }; + show_usage_with_options_if_asked(argc, argv, survey_usage, survey_options); - prepare_repo_settings(repo); - repo_config(repo, survey_load_config_cb, NULL); + ctx.repo = repo; + prepare_repo_settings(ctx.repo); + repo_config(repo, survey_load_config_cb, &ctx); argc = parse_options(argc, argv, prefix, survey_options, survey_usage, 0); - if (survey_opts.show_progress < 0) - survey_opts.show_progress = isatty(2); - fixup_refs_wanted(); + if (ctx.opts.show_progress < 0) + ctx.opts.show_progress = isatty(2); + fixup_refs_wanted(&ctx); + + survey_phase_refs(&ctx); + clear_survey_context(&ctx); return 0; } diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index d9816419855..30713d8f2e4 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -15,4 +15,13 @@ test_expect_success 'git survey -h shows experimental warning' ' grep "EXPERIMENTAL!" usage ' +test_expect_success 'creat a semi-interesting repo' ' + test_commit_bulk 10 +' + +test_expect_success 'git survey (default)' ' + git survey >out 2>err && + test_line_count = 0 err +' + test_done -- GitLab From 1649f70b856bce94af9c2d990837ad1d4c9554c5 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 10 Sep 2024 02:28:37 +0000 Subject: [PATCH 4/9] survey: start pretty printing data in table form When 'git survey' provides information to the user, this will be presented in one of two formats: plaintext and JSON. The JSON implementation will be delayed until the functionality is complete for the plaintext format. The most important parts of the plaintext format are headers specifying the different sections of the report and tables providing concrete data. Create a custom table data structure that allows specifying a list of strings for the row values. 
When printing the table, check each column for the maximum width so we can create a table of the correct size from the start. The table structure is designed to be flexible to the different kinds of output that will be implemented in future changes. Signed-off-by: Derrick Stolee --- builtin/survey.c | 175 ++++++++++++++++++++++++++++++++++++++++++ t/t8100-git-survey.sh | 17 +++- 2 files changed, 191 insertions(+), 1 deletion(-) diff --git a/builtin/survey.c b/builtin/survey.c index e2d11897be5..e52e3109412 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -5,6 +5,7 @@ #include "parse-options.h" #include "progress.h" #include "ref-filter.h" +#include "strbuf.h" #include "strvec.h" #include "trace2.h" @@ -27,10 +28,16 @@ static struct survey_refs_wanted default_ref_options = { .want_all_refs = 1, }; +enum survey_format { + SURVEY_PLAINTEXT = 0, + SURVEY_JSON = 1, +}; + struct survey_opts { int verbose; int show_progress; struct survey_refs_wanted refs; + enum survey_format format; }; struct survey_report_ref_summary { @@ -78,6 +85,161 @@ static void clear_survey_context(struct survey_context *ctx) strvec_clear(&ctx->refs); } +struct survey_table { + const char *table_name; + struct strvec header; + struct strvec *rows; + size_t rows_nr; + size_t rows_alloc; +}; + +#define SURVEY_TABLE_INIT { \ + .header = STRVEC_INIT, \ +} + +static void clear_table(struct survey_table *table) +{ + strvec_clear(&table->header); + for (size_t i = 0; i < table->rows_nr; i++) + strvec_clear(&table->rows[i]); + free(table->rows); +} + +static void insert_table_rowv(struct survey_table *table, ...) 
+{ + va_list ap; + char *arg; + ALLOC_GROW(table->rows, table->rows_nr + 1, table->rows_alloc); + + memset(&table->rows[table->rows_nr], 0, sizeof(struct strvec)); + + va_start(ap, table); + while ((arg = va_arg(ap, char *))) + strvec_push(&table->rows[table->rows_nr], arg); + va_end(ap); + + table->rows_nr++; +} + +static void print_table_title(const char *name, size_t *widths, size_t nr) +{ + static struct strbuf lines = STRBUF_INIT; + size_t width = 0; + strbuf_setlen(&lines, 0); + + strbuf_addch(&lines, ' '); + strbuf_addstr(&lines, name); + strbuf_addch(&lines, '\n'); + + for (size_t i = 0; i < nr; i++) { + if (i) + width += 3; + width += widths[i]; + } + strbuf_addchars(&lines, '=', width); + printf("%s\n", lines.buf); +} + +static void print_row_plaintext(struct strvec *row, size_t *widths) +{ + static struct strbuf line = STRBUF_INIT; + strbuf_setlen(&line, 0); + + for (size_t i = 0; i < row->nr; i++) { + const char *str = row->v[i]; + size_t len = strlen(str); + if (i) + strbuf_add(&line, " | ", 3); + strbuf_addchars(&line, ' ', widths[i] - len); + strbuf_add(&line, str, len); + } + printf("%s\n", line.buf); +} + +static void print_divider_plaintext(size_t *widths, size_t nr) +{ + static struct strbuf line = STRBUF_INIT; + strbuf_setlen(&line, 0); + + for (size_t i = 0; i < nr; i++) { + if (i) + strbuf_add(&line, "-+-", 3); + strbuf_addchars(&line, '-', widths[i]); + } + printf("%s\n", line.buf); +} + +static void print_table_plaintext(struct survey_table *table) +{ + size_t *column_widths; + size_t columns_nr = table->header.nr; + CALLOC_ARRAY(column_widths, columns_nr); + + for (size_t i = 0; i < columns_nr; i++) { + column_widths[i] = strlen(table->header.v[i]); + + for (size_t j = 0; j < table->rows_nr; j++) { + size_t rowlen = strlen(table->rows[j].v[i]); + if (column_widths[i] < rowlen) + column_widths[i] = rowlen; + } + } + + print_table_title(table->table_name, column_widths, columns_nr); + print_row_plaintext(&table->header, column_widths); + 
print_divider_plaintext(column_widths, columns_nr); + + for (size_t j = 0; j < table->rows_nr; j++) + print_row_plaintext(&table->rows[j], column_widths); +} + +static void survey_report_plaintext_refs(struct survey_context *ctx) +{ + struct survey_report_ref_summary *refs = &ctx->report.refs; + struct survey_table table = SURVEY_TABLE_INIT; + + table.table_name = _("REFERENCES SUMMARY"); + + strvec_push(&table.header, _("Ref Type")); + strvec_push(&table.header, _("Count")); + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_branches) { + char *fmt = xstrfmt("%"PRIuMAX"", refs->branches_nr); + insert_table_rowv(&table, _("Branches"), fmt, NULL); + free(fmt); + } + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_remotes) { + char *fmt = xstrfmt("%"PRIuMAX"", refs->remote_refs_nr); + insert_table_rowv(&table, _("Remote refs"), fmt, NULL); + free(fmt); + } + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_tags) { + char *fmt = xstrfmt("%"PRIuMAX"", refs->tags_nr); + insert_table_rowv(&table, _("Tags (all)"), fmt, NULL); + free(fmt); + fmt = xstrfmt("%"PRIuMAX"", refs->tags_annotated_nr); + insert_table_rowv(&table, _("Tags (annotated)"), fmt, NULL); + free(fmt); + } + + print_table_plaintext(&table); + clear_table(&table); +} + +static void survey_report_plaintext(struct survey_context *ctx) +{ + printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); + printf("-----------------------------------------------------\n"); + survey_report_plaintext_refs(ctx); +} + +static void survey_report_json(struct survey_context *ctx UNUSED) +{ + /* TODO. */ +} + /* * After parsing the command line arguments, figure out which refs we * should scan. 
@@ -306,6 +468,19 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor survey_phase_refs(&ctx); + switch (ctx.opts.format) { + case SURVEY_PLAINTEXT: + survey_report_plaintext(&ctx); + break; + + case SURVEY_JSON: + survey_report_json(&ctx); + break; + + default: + BUG("Undefined format"); + } + clear_survey_context(&ctx); return 0; } diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 30713d8f2e4..7da4ad52787 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -21,7 +21,22 @@ test_expect_success 'creat a semi-interesting repo' ' test_expect_success 'git survey (default)' ' git survey >out 2>err && - test_line_count = 0 err + test_line_count = 0 err && + + cat >expect <<-EOF && + GIT SURVEY for "$(pwd)" + ----------------------------------------------------- + REFERENCES SUMMARY + ======================== + Ref Type | Count + -----------------+------ + Branches | 1 + Remote refs | 0 + Tags (all) | 0 + Tags (annotated) | 0 + EOF + + test_cmp expect out ' test_done -- GitLab From 446db15cc4ae5d52328f4bf2d6ac10c73c3ecbb9 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 10 Sep 2024 02:28:38 +0000 Subject: [PATCH 5/9] survey: add object count summary At the moment, nothing is obvious about the reason for the use of the path-walk API, but this will become more prevalent in future iterations. For now, use the path-walk API to sum up the counts of each kind of object. For example, this is the reachable object summary output for my local repo: REACHABLE OBJECT SUMMARY ======================== Object Type | Count ------------+------- Tags | 0 Commits | 178573 Trees | 312745 Blobs | 183035 (Note: the "Tags" are zero right now because the path-walk API has not been integrated to walk tags yet. This will be fixed in a later change.) RFC TODO: make sure tags are walked before this change. 
Signed-off-by: Derrick Stolee --- builtin/survey.c | 196 ++++++++++++++++++++++++++++++++++++++++-- t/t8100-git-survey.sh | 26 ++++-- 2 files changed, 209 insertions(+), 13 deletions(-) diff --git a/builtin/survey.c b/builtin/survey.c index e52e3109412..225c2568c47 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -1,12 +1,19 @@ #include "builtin.h" #include "config.h" +#include "environment.h" +#include "hex.h" #include "object.h" +#include "object-name.h" #include "object-store.h" #include "parse-options.h" +#include "path-walk.h" #include "progress.h" #include "ref-filter.h" +#include "refs.h" +#include "revision.h" #include "strbuf.h" #include "strvec.h" +#include "tag.h" #include "trace2.h" static const char * const survey_usage[] = { @@ -50,12 +57,20 @@ struct survey_report_ref_summary { size_t unknown_nr; }; +struct survey_report_object_summary { + size_t commits_nr; + size_t tags_nr; + size_t trees_nr; + size_t blobs_nr; +}; + /** * This struct contains all of the information that needs to be printed * at the end of the exploration of the repository and its references. 
*/ struct survey_report { struct survey_report_ref_summary refs; + struct survey_report_object_summary reachable_objects; }; struct survey_context { @@ -78,10 +93,12 @@ struct survey_context { size_t progress_total; struct strvec refs; + struct ref_array ref_array; }; static void clear_survey_context(struct survey_context *ctx) { + ref_array_clear(&ctx->ref_array); strvec_clear(&ctx->refs); } @@ -125,10 +142,12 @@ static void print_table_title(const char *name, size_t *widths, size_t nr) { static struct strbuf lines = STRBUF_INIT; size_t width = 0; + size_t min_width; strbuf_setlen(&lines, 0); - strbuf_addch(&lines, ' '); + strbuf_addch(&lines, '\n'); strbuf_addstr(&lines, name); + min_width = lines.len - 1; strbuf_addch(&lines, '\n'); for (size_t i = 0; i < nr; i++) { @@ -136,6 +155,10 @@ static void print_table_title(const char *name, size_t *widths, size_t nr) width += 3; width += widths[i]; } + + if (width < min_width) + width = min_width; + strbuf_addchars(&lines, '=', width); printf("%s\n", lines.buf); } @@ -228,11 +251,43 @@ static void survey_report_plaintext_refs(struct survey_context *ctx) clear_table(&table); } +static void survey_report_plaintext_reachable_object_summary(struct survey_context *ctx) +{ + struct survey_report_object_summary *objs = &ctx->report.reachable_objects; + struct survey_table table = SURVEY_TABLE_INIT; + char *fmt; + + table.table_name = _("REACHABLE OBJECT SUMMARY"); + + strvec_push(&table.header, _("Object Type")); + strvec_push(&table.header, _("Count")); + + fmt = xstrfmt("%"PRIuMAX"", objs->tags_nr); + insert_table_rowv(&table, _("Tags"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", objs->commits_nr); + insert_table_rowv(&table, _("Commits"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", objs->trees_nr); + insert_table_rowv(&table, _("Trees"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", objs->blobs_nr); + insert_table_rowv(&table, _("Blobs"), fmt, NULL); + free(fmt); + + 
print_table_plaintext(&table); + clear_table(&table); +} + static void survey_report_plaintext(struct survey_context *ctx) { printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); printf("-----------------------------------------------------\n"); survey_report_plaintext_refs(ctx); + survey_report_plaintext_reachable_object_summary(ctx); } static void survey_report_json(struct survey_context *ctx UNUSED) @@ -379,15 +434,13 @@ static void do_load_refs(struct survey_context *ctx, */ static void survey_phase_refs(struct survey_context *ctx) { - struct ref_array ref_array = { 0 }; - trace2_region_enter("survey", "phase/refs", ctx->repo); - do_load_refs(ctx, &ref_array); + do_load_refs(ctx, &ctx->ref_array); - ctx->report.refs.refs_nr = ref_array.nr; - for (int i = 0; i < ref_array.nr; i++) { + ctx->report.refs.refs_nr = ctx->ref_array.nr; + for (int i = 0; i < ctx->ref_array.nr; i++) { size_t size; - struct ref_array_item *item = ref_array.items[i]; + struct ref_array_item *item = ctx->ref_array.items[i]; switch (item->kind) { case FILTER_REFS_TAGS: @@ -417,8 +470,133 @@ static void survey_phase_refs(struct survey_context *ctx) } trace2_region_leave("survey", "phase/refs", ctx->repo); +} + +static void increment_object_counts( + struct survey_report_object_summary *summary, + enum object_type type, + size_t nr) +{ + switch (type) { + case OBJ_COMMIT: + summary->commits_nr += nr; + break; + + case OBJ_TREE: + summary->trees_nr += nr; + break; + + case OBJ_BLOB: + summary->blobs_nr += nr; + break; + + default: + break; + } +} + +static int survey_objects_path_walk_fn(const char *path UNUSED, + struct oid_array *oids, + enum object_type type, + void *data) +{ + struct survey_context *ctx = data; + + increment_object_counts(&ctx->report.reachable_objects, + type, oids->nr); + + return 0; +} + +static int iterate_tag_chain(struct survey_context *ctx, + struct object_id *oid, + struct object_id *peeled) +{ + struct object *o = lookup_unknown_object(ctx->repo, oid); + struct 
tag *t; + + if (o->type != OBJ_TAG) { + oidcpy(peeled, &o->oid); + return o->type != OBJ_COMMIT; + } + + t = lookup_tag(ctx->repo, oid); + while (t) { + parse_tag(t); + ctx->report.reachable_objects.tags_nr++; + + if (!t->tagged) + break; + + o = lookup_unknown_object(ctx->repo, &t->tagged->oid); + if (o && o->type == OBJ_TAG) + t = lookup_tag(ctx->repo, &t->tagged->oid); + else + break; + } + + if (!t || !t->tagged) + return -1; - ref_array_clear(&ref_array); + oidcpy(peeled, &t->tagged->oid); + o = lookup_unknown_object(ctx->repo, peeled); + if (o && o->type == OBJ_COMMIT) + return 0; + return -1; +} + +static void survey_phase_objects(struct survey_context *ctx) +{ + struct rev_info revs; + struct path_walk_info info = PATH_WALK_INFO_INIT; + unsigned int add_flags = 0; + + trace2_region_enter("survey", "phase/objects", ctx->repo); + + info.revs = &revs; + info.path_fn = survey_objects_path_walk_fn; + info.path_fn_data = ctx; + + info.commits = 1; + info.trees = 1; + info.blobs = 1; + info.tags = 1; + + repo_init_revisions(ctx->repo, &revs, ""); + + for (int i = 0; i < ctx->ref_array.nr; i++) { + struct ref_array_item *item = ctx->ref_array.items[i]; + struct object_id peeled; + + switch (item->kind) { + case FILTER_REFS_TAGS: + if (!iterate_tag_chain(ctx, &item->objectname, &peeled)) + add_pending_oid(&revs, NULL, &peeled, add_flags); + break; + case FILTER_REFS_BRANCHES: + add_pending_oid(&revs, NULL, &item->objectname, add_flags); + break; + case FILTER_REFS_REMOTES: + add_pending_oid(&revs, NULL, &item->objectname, add_flags); + break; + case FILTER_REFS_OTHERS: + /* + * This may be a note, stash, or custom namespace branch. 
+ */ + add_pending_oid(&revs, NULL, &item->objectname, add_flags); + break; + case FILTER_REFS_DETACHED_HEAD: + add_pending_oid(&revs, NULL, &item->objectname, add_flags); + break; + default: + break; + } + } + + walk_objects_by_path(&info); + + release_revisions(&revs); + trace2_region_leave("survey", "phase/objects", ctx->repo); } int cmd_survey(int argc, const char **argv, const char *prefix, struct repository *repo) @@ -468,6 +646,8 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor survey_phase_refs(&ctx); + survey_phase_objects(&ctx); + switch (ctx.opts.format) { case SURVEY_PLAINTEXT: survey_report_plaintext(&ctx); diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 7da4ad52787..4b4a35d8494 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -16,24 +16,40 @@ test_expect_success 'git survey -h shows experimental warning' ' ' test_expect_success 'creat a semi-interesting repo' ' - test_commit_bulk 10 + test_commit_bulk 10 && + git tag -a -m one one HEAD~5 && + git tag -a -m two two HEAD~3 && + git tag -a -m three three two && + git tag -a -m four four three && + git update-ref -d refs/tags/three && + git update-ref -d refs/tags/two ' test_expect_success 'git survey (default)' ' - git survey >out 2>err && + git survey --all-refs >out 2>err && test_line_count = 0 err && cat >expect <<-EOF && GIT SURVEY for "$(pwd)" ----------------------------------------------------- - REFERENCES SUMMARY + + REFERENCES SUMMARY ======================== Ref Type | Count -----------------+------ Branches | 1 Remote refs | 0 - Tags (all) | 0 - Tags (annotated) | 0 + Tags (all) | 2 + Tags (annotated) | 2 + + REACHABLE OBJECT SUMMARY + ======================== + Object Type | Count + ------------+------ + Tags | 0 + Commits | 10 + Trees | 10 + Blobs | 10 EOF test_cmp expect out -- GitLab From 3af5fd6ae1535587f3d0c14113fa60a29ed992f9 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 10 Sep 2024 02:28:39 +0000 Subject: 
[PATCH 6/9] survey: summarize total sizes by object type Now that we have explored objects by count, we can expand that a bit more to summarize the data for the on-disk and inflated size of those objects. This information is helpful for diagnosing both why disk space (and perhaps clone or fetch times) is growing but also why certain operations are slow because the inflated size of the abstract objects that must be processed is so large. Signed-off-by: Derrick Stolee --- builtin/survey.c | 113 ++++++++++++++++++++++++++++++++++++++++++ t/t8100-git-survey.sh | 8 +++ 2 files changed, 121 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index 225c2568c47..009577c3a45 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -64,6 +64,19 @@ struct survey_report_object_summary { size_t blobs_nr; }; +/** + * For some category given by 'label', count the number of objects + * that match that label along with the on-disk size and the size + * after decompressing (both with delta bases and zlib). + */ +struct survey_report_object_size_summary { + char *label; + size_t nr; + size_t disk_size; + size_t inflated_size; + size_t num_missing; +}; + /** * This struct contains all of the information that needs to be printed * at the end of the exploration of the repository and its references. @@ -71,8 +84,15 @@ struct survey_report_object_summary { struct survey_report { struct survey_report_ref_summary refs; struct survey_report_object_summary reachable_objects; + + struct survey_report_object_size_summary *by_type; }; +#define REPORT_TYPE_COMMIT 0 +#define REPORT_TYPE_TREE 1 +#define REPORT_TYPE_BLOB 2 +#define REPORT_TYPE_COUNT 3 + struct survey_context { /* Options that control what is done. 
*/ struct survey_opts opts; @@ -282,12 +302,41 @@ static void survey_report_plaintext_reachable_object_summary(struct survey_conte clear_table(&table); } +static void survey_report_object_sizes(const char *title, + const char *categories, + struct survey_report_object_size_summary *summary, + size_t summary_nr) +{ + struct survey_table table = SURVEY_TABLE_INIT; + table.table_name = title; + + strvec_push(&table.header, xstrdup(categories)); + strvec_push(&table.header, xstrdup(_("Count"))); + strvec_push(&table.header, xstrdup(_("Disk Size"))); + strvec_push(&table.header, xstrdup(_("Inflated Size"))); + + for (size_t i = 0; i < summary_nr; i++) { + insert_table_rowv(&table, xstrdup(summary[i].label), + xstrfmt("%"PRIuMAX, summary[i].nr), + xstrfmt("%"PRIuMAX, summary[i].disk_size), + xstrfmt("%"PRIuMAX, summary[i].inflated_size), + NULL); + } + + print_table_plaintext(&table); + clear_table(&table); +} + static void survey_report_plaintext(struct survey_context *ctx) { printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); printf("-----------------------------------------------------\n"); survey_report_plaintext_refs(ctx); survey_report_plaintext_reachable_object_summary(ctx); + survey_report_object_sizes(_("TOTAL OBJECT SIZES BY TYPE"), + _("Object Type"), + ctx->report.by_type, + REPORT_TYPE_COUNT); } static void survey_report_json(struct survey_context *ctx UNUSED) @@ -495,6 +544,64 @@ static void increment_object_counts( } } +static void increment_totals(struct survey_context *ctx, + struct oid_array *oids, + struct survey_report_object_size_summary *summary) +{ + for (size_t i = 0; i < oids->nr; i++) { + struct object_info oi = OBJECT_INFO_INIT; + unsigned oi_flags = OBJECT_INFO_FOR_PREFETCH; + unsigned long object_length = 0; + off_t disk_sizep = 0; + enum object_type type; + + oi.typep = &type; + oi.sizep = &object_length; + oi.disk_sizep = &disk_sizep; + + if (oid_object_info_extended(ctx->repo, &oids->oid[i], + &oi, oi_flags) < 0) { + 
summary->num_missing++; + } else { + summary->nr++; + summary->disk_size += disk_sizep; + summary->inflated_size += object_length; + } + } +} + +static void increment_object_totals(struct survey_context *ctx, + struct oid_array *oids, + enum object_type type) +{ + struct survey_report_object_size_summary *total; + struct survey_report_object_size_summary summary = { 0 }; + + increment_totals(ctx, oids, &summary); + + switch (type) { + case OBJ_COMMIT: + total = &ctx->report.by_type[REPORT_TYPE_COMMIT]; + break; + + case OBJ_TREE: + total = &ctx->report.by_type[REPORT_TYPE_TREE]; + break; + + case OBJ_BLOB: + total = &ctx->report.by_type[REPORT_TYPE_BLOB]; + break; + + default: + BUG("No other type allowed"); + } + + total->nr += summary.nr; + total->disk_size += summary.disk_size; + total->inflated_size += summary.inflated_size; + total->num_missing += summary.num_missing; +} + static int survey_objects_path_walk_fn(const char *path UNUSED, struct oid_array *oids, enum object_type type, @@ -504,6 +611,7 @@ static int survey_objects_path_walk_fn(const char *path UNUSED, increment_object_counts(&ctx->report.reachable_objects, type, oids->nr); + increment_object_totals(ctx, oids, type); return 0; } @@ -562,6 +670,11 @@ static void survey_phase_objects(struct survey_context *ctx) info.blobs = 1; info.tags = 1; + CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); + ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); + ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); + ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs")); + repo_init_revisions(ctx->repo, &revs, ""); for (int i = 0; i < ctx->ref_array.nr; i++) { diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 4b4a35d8494..9b743b4fe5c 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -50,6 +50,14 @@ test_expect_success 'git survey (default)' ' Commits | 10 Trees | 10 Blobs | 10 + + TOTAL OBJECT SIZES BY TYPE + 
=============================================== + Object Type | Count | Disk Size | Inflated Size + ------------+-------+-----------+-------------- + Commits | 10 | 1523 | 2153 + Trees | 10 | 495 | 1706 + Blobs | 10 | 191 | 101 EOF test_cmp expect out -- GitLab From 0695c0d1a6639b0480ede6d5598fef102194c864 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 10 Sep 2024 02:28:40 +0000 Subject: [PATCH 7/9] survey: show progress during object walk Signed-off-by: Derrick Stolee --- builtin/survey.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index 009577c3a45..618b4c9d223 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -613,6 +613,9 @@ static int survey_objects_path_walk_fn(const char *path UNUSED, type, oids->nr); increment_object_totals(ctx, oids, type); + ctx->progress_nr += oids->nr; + display_progress(ctx->progress, ctx->progress_nr); + return 0; } @@ -677,6 +680,11 @@ static void survey_phase_objects(struct survey_context *ctx) repo_init_revisions(ctx->repo, &revs, ""); + ctx->progress_nr = 0; + ctx->progress_total = ctx->ref_array.nr; + if (ctx->opts.show_progress) + ctx->progress = start_progress(ctx->repo, _("Preparing object walk"), + ctx->progress_total); for (int i = 0; i < ctx->ref_array.nr; i++) { struct ref_array_item *item = ctx->ref_array.items[i]; struct object_id peeled; @@ -704,9 +712,17 @@ static void survey_phase_objects(struct survey_context *ctx) default: break; } + + display_progress(ctx->progress, ++(ctx->progress_nr)); } + stop_progress(&ctx->progress); + ctx->progress_nr = 0; + ctx->progress_total = 0; + if (ctx->opts.show_progress) + ctx->progress = start_progress(ctx->repo, _("Walking objects"), 0); walk_objects_by_path(&info); + stop_progress(&ctx->progress); release_revisions(&revs); trace2_region_leave("survey", "phase/objects", ctx->repo); -- GitLab From 666e7cf917d87495cfbf7f9528d0c04716e619e8 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 10 Sep 2024 
02:28:41 +0000 Subject: [PATCH 8/9] survey: add ability to track prioritized lists In future changes, we will make use of these methods. The intention is to keep track of the top contributors according to some metric. We don't want to store all of the entries and do a sort at the end, so track a constant-size table and remove rows that get pushed out depending on the chosen sorting algorithm. Co-authored-by: Jeff Hostetler Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee --- builtin/survey.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index 618b4c9d223..9414d0a8fb0 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -77,6 +77,102 @@ struct survey_report_object_size_summary { size_t num_missing; }; +typedef int (*survey_top_size_cmp)(struct survey_report_object_size_summary *s1, + struct survey_report_object_size_summary *s2); + +MAYBE_UNUSED +static int cmp_by_nr(struct survey_report_object_size_summary *s1, + struct survey_report_object_size_summary *s2) +{ + if (s1->nr < s2->nr) + return -1; + if (s1->nr > s2->nr) + return 1; + return 0; +} + +MAYBE_UNUSED +static int cmp_by_disk_size(struct survey_report_object_size_summary *s1, + struct survey_report_object_size_summary *s2) +{ + if (s1->disk_size < s2->disk_size) + return -1; + if (s1->disk_size > s2->disk_size) + return 1; + return 0; +} + +MAYBE_UNUSED +static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1, + struct survey_report_object_size_summary *s2) +{ + if (s1->inflated_size < s2->inflated_size) + return -1; + if (s1->inflated_size > s2->inflated_size) + return 1; + return 0; +} + +/** + * Store a list of "top" categories by some sorting function. When + * inserting a new category, reorder the list and free the one that + * got ejected (if any). 
+ */ +struct survey_report_top_sizes { + const char *name; + survey_top_size_cmp cmp_fn; + struct survey_report_object_size_summary *data; + size_t nr; + size_t alloc; +}; + +MAYBE_UNUSED +static void init_top_sizes(struct survey_report_top_sizes *top, + size_t limit, const char *name, + survey_top_size_cmp cmp) +{ + top->name = name; + top->alloc = limit; + top->nr = 0; + CALLOC_ARRAY(top->data, limit); + top->cmp_fn = cmp; +} + +MAYBE_UNUSED +static void clear_top_sizes(struct survey_report_top_sizes *top) +{ + for (size_t i = 0; i < top->nr; i++) + free(top->data[i].label); + free(top->data); +} + +MAYBE_UNUSED +static void maybe_insert_into_top_size(struct survey_report_top_sizes *top, + struct survey_report_object_size_summary *summary) +{ + size_t pos = top->nr; + + /* Compare against list from the bottom. */ + while (pos > 0 && top->cmp_fn(&top->data[pos - 1], summary) < 0) + pos--; + + /* Not big enough! */ + if (pos >= top->alloc) + return; + + /* We need to shift the data. */ + if (top->nr == top->alloc) + free(top->data[top->nr - 1].label); + else + top->nr++; + + for (size_t i = top->nr - 1; i > pos; i--) + memcpy(&top->data[i], &top->data[i - 1], sizeof(*top->data)); + + memcpy(&top->data[pos], summary, sizeof(*summary)); + top->data[pos].label = xstrdup(summary->label); +} + /** * This struct contains all of the information that needs to be printed * at the end of the exploration of the repository and its references. -- GitLab From 1817dc08b8ea00fce4cd1fb6bc75713ad00a74d3 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 10 Sep 2024 02:28:42 +0000 Subject: [PATCH 9/9] survey: add report of "largest" paths Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about * Number of versions. * Total size on disk. * Total inflated size (no delta or zlib compression). 
This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'. Since the on-disk size is likely to be fragile, stop testing the exact output of 'git survey' and check that the correct set of headers is output. Signed-off-by: Derrick Stolee --- builtin/survey.c | 92 +++++++++++++++++++++++++++++++++++++------ t/t8100-git-survey.sh | 12 +++++- 2 files changed, 91 insertions(+), 13 deletions(-) diff --git a/builtin/survey.c b/builtin/survey.c index 9414d0a8fb0..b2b7e3783f1 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -80,7 +80,6 @@ struct survey_report_object_size_summary { typedef int (*survey_top_size_cmp)(struct survey_report_object_size_summary *s1, struct survey_report_object_size_summary *s2); -MAYBE_UNUSED static int cmp_by_nr(struct survey_report_object_size_summary *s1, struct survey_report_object_size_summary *s2) { @@ -91,7 +90,6 @@ static int cmp_by_nr(struct survey_report_object_size_summary *s1, return 0; } -MAYBE_UNUSED static int cmp_by_disk_size(struct survey_report_object_size_summary *s1, struct survey_report_object_size_summary *s2) { @@ -102,7 +100,6 @@ static int cmp_by_disk_size(struct survey_report_object_size_summary *s1, return 0; } -MAYBE_UNUSED static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1, struct survey_report_object_size_summary *s2) { @@ -126,7 +123,6 @@ struct survey_report_top_sizes { size_t alloc; }; -MAYBE_UNUSED static void init_top_sizes(struct survey_report_top_sizes *top, size_t limit, const char *name, survey_top_size_cmp cmp) @@ -146,7 +142,6 @@ static void clear_top_sizes(struct survey_report_top_sizes *top) 
free(top->data); } -MAYBE_UNUSED static void maybe_insert_into_top_size(struct survey_report_top_sizes *top, struct survey_report_object_size_summary *summary) { @@ -182,6 +177,10 @@ struct survey_report { struct survey_report_object_summary reachable_objects; struct survey_report_object_size_summary *by_type; + + struct survey_report_top_sizes *top_paths_by_count; + struct survey_report_top_sizes *top_paths_by_disk; + struct survey_report_top_sizes *top_paths_by_inflate; }; #define REPORT_TYPE_COMMIT 0 @@ -423,6 +422,13 @@ static void survey_report_object_sizes(const char *title, clear_table(&table); } +static void survey_report_plaintext_sorted_size( + struct survey_report_top_sizes *top) +{ + survey_report_object_sizes(top->name, _("Path"), + top->data, top->nr); +} + static void survey_report_plaintext(struct survey_context *ctx) { printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); @@ -433,6 +439,21 @@ static void survey_report_plaintext(struct survey_context *ctx) _("Object Type"), ctx->report.by_type, REPORT_TYPE_COUNT); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_count[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]); } static void survey_report_json(struct survey_context *ctx UNUSED) @@ -668,7 +689,8 @@ static void increment_totals(struct survey_context *ctx, static void increment_object_totals(struct survey_context *ctx, struct oid_array *oids, - enum object_type type) + enum object_type type, + const char *path) { struct survey_report_object_size_summary *total; struct 
survey_report_object_size_summary summary = { 0 }; @@ -696,9 +718,30 @@ static void increment_object_totals(struct survey_context *ctx, total->disk_size += summary.disk_size; total->inflated_size += summary.inflated_size; total->num_missing += summary.num_missing; + + if (type == OBJ_TREE || type == OBJ_BLOB) { + int index = type == OBJ_TREE ? + REPORT_TYPE_TREE : REPORT_TYPE_BLOB; + struct survey_report_top_sizes *top; + + /* + * Temporarily store (const char *) here, but it will + * be duped if inserted and will not be freed. + */ + summary.label = (char *)path; + + top = ctx->report.top_paths_by_count; + maybe_insert_into_top_size(&top[index], &summary); + + top = ctx->report.top_paths_by_disk; + maybe_insert_into_top_size(&top[index], &summary); + + top = ctx->report.top_paths_by_inflate; + maybe_insert_into_top_size(&top[index], &summary); + } } -static int survey_objects_path_walk_fn(const char *path UNUSED, +static int survey_objects_path_walk_fn(const char *path, struct oid_array *oids, enum object_type type, void *data) @@ -707,7 +750,7 @@ static int survey_objects_path_walk_fn(const char *path UNUSED, increment_object_counts(&ctx->report.reachable_objects, type, oids->nr); - increment_object_totals(ctx, oids, type); + increment_object_totals(ctx, oids, type, path); ctx->progress_nr += oids->nr; display_progress(ctx->progress, ctx->progress_nr); @@ -752,6 +795,34 @@ static int iterate_tag_chain(struct survey_context *ctx, return -1; } +static void initialize_report(struct survey_context *ctx) +{ + const int top_limit = 100; + + CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); + ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); + ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); + ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs")); + + CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY 
COUNT"), cmp_by_nr); + init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY COUNT"), cmp_by_nr); + + CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size); + init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size); + + CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size); + init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size); +} + static void survey_phase_objects(struct survey_context *ctx) { struct rev_info revs; @@ -769,10 +840,7 @@ static void survey_phase_objects(struct survey_context *ctx) info.blobs = 1; info.tags = 1; - CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); - ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); - ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); - ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs")); + initialize_report(ctx); repo_init_revisions(ctx->repo, &revs, ""); diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 9b743b4fe5c..8d30bea17ef 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -60,7 +60,17 @@ test_expect_success 'git survey (default)' ' Blobs | 10 | 191 | 101 EOF - test_cmp expect out + lines=$(wc -l out-trimmed && + test_cmp expect out-trimmed && + + for type in "DIRECTORIES" "FILES" + do + for metric in "COUNT" "DISK SIZE" "INFLATED SIZE" + do + grep "TOP $type BY $metric" out || return 1 + done || return 1 + done ' test_done -- GitLab