Files
git/t/helper/test-path-walk.c
Derrick Stolee bff4555767 backfill: add --sparse option
One way to significantly reduce the cost of a Git clone and later fetches is
to use a blobless partial clone and combine that with a sparse-checkout that
reduces the paths that need to be populated in the working directory. Not
only does this reduce the cost of clones and fetches, the sparse-checkout
reduces the number of objects needed to download from a promisor remote.

However, history investigations can be expensive as computing blob diffs
will trigger promisor remote requests for one object at a time. This can be
avoided by downloading the blobs needed for the given sparse-checkout using
'git backfill' and its new '--sparse' mode, at a time that the user is
willing to pay that extra cost.

Note that this is distinctly different from the '--filter=sparse:<oid>'
option, as this assumes that the partial clone has all reachable trees and
we are using client-side logic to avoid downloading blobs outside of the
sparse-checkout cone. This avoids the server-side cost of walking trees
while also achieving a similar goal. It also downloads in batches based on
similar path names, presenting a resumable download if things are
interrupted.

This augments the path-walk API to have a possibly-NULL 'pl' member that may
point to a 'struct pattern_list'. This could be more general than the
sparse-checkout definition at HEAD, but 'git backfill --sparse' is currently
the only consumer.

Be sure to test this in both cone mode and not cone mode. Cone mode has the
benefit that the path-walk can skip certain paths once they would expand
beyond the sparse-checkout. Non-cone mode can describe the included files
using both positive and negative patterns, which changes the possible return
values of path_matches_pattern_list(). Test both kinds of matches for
increased coverage.

To test this, we can create a blobless sparse clone, expand the
sparse-checkout slightly, and then run 'git backfill --sparse' to see
how much data is downloaded. The general steps are

 1. git clone --filter=blob:none --sparse <url>
 2. git sparse-checkout set <dir1> ... <dirN>
 3. git backfill --sparse

For the Git repository with the 'builtin' directory in the
sparse-checkout, we get these results for various batch sizes:

| Batch Size      | Pack Count | Pack Size | Time  |
|-----------------|------------|-----------|-------|
| (Initial clone) | 3          | 110 MB    |       |
| 10K             | 12         | 192 MB    | 17.2s |
| 15K             | 9          | 192 MB    | 15.5s |
| 20K             | 8          | 192 MB    | 15.5s |
| 25K             | 7          | 192 MB    | 14.7s |

This case matters less because a full clone of the Git repository from
GitHub is currently at 277 MB.

Using a copy of the Linux repository with the 'kernel/' directory in the
sparse-checkout, we get these results:

| Batch Size      | Pack Count | Pack Size | Time |
|-----------------|------------|-----------|------|
| (Initial clone) | 2          | 1,876 MB  |      |
| 10K             | 11         | 2,187 MB  | 46s  |
| 25K             | 7          | 2,188 MB  | 43s  |
| 50K             | 5          | 2,194 MB  | 44s  |
| 100K            | 4          | 2,194 MB  | 48s  |

This case is more meaningful because a full clone of the Linux
repository is currently over 6 GB, so this is a valuable way to download
a fraction of the repository and no longer need network access for all
reachable objects within the sparse-checkout.

Choosing a batch size will depend on a lot of factors, including the
user's network speed or reliability, the repository's file structure,
and how many versions there are of the file within the sparse-checkout
scope. There will not be a one-size-fits-all solution.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2025-02-03 16:12:42 -08:00

133 lines
3.2 KiB
C

#define USE_THE_REPOSITORY_VARIABLE
#include "test-tool.h"
#include "dir.h"
#include "environment.h"
#include "hex.h"
#include "object-name.h"
#include "object.h"
#include "pretty.h"
#include "revision.h"
#include "setup.h"
#include "parse-options.h"
#include "strbuf.h"
#include "path-walk.h"
#include "oid-array.h"
static const char * const path_walk_usage[] = {
N_("test-tool path-walk <options> -- <revision-options>"),
NULL
};
struct path_walk_test_data {
uintmax_t batch_nr;
uintmax_t commit_nr;
uintmax_t tree_nr;
uintmax_t blob_nr;
uintmax_t tag_nr;
};
static int emit_block(const char *path, struct oid_array *oids,
enum object_type type, void *data)
{
struct path_walk_test_data *tdata = data;
const char *typestr;
if (type == OBJ_TREE)
tdata->tree_nr += oids->nr;
else if (type == OBJ_BLOB)
tdata->blob_nr += oids->nr;
else if (type == OBJ_COMMIT)
tdata->commit_nr += oids->nr;
else if (type == OBJ_TAG)
tdata->tag_nr += oids->nr;
else
BUG("we do not understand this type");
typestr = type_name(type);
/* This should never be output during tests. */
if (!oids->nr)
printf("%"PRIuMAX":%s:%s:EMPTY\n",
tdata->batch_nr, typestr, path);
for (size_t i = 0; i < oids->nr; i++) {
struct object *o = lookup_unknown_object(the_repository,
&oids->oid[i]);
printf("%"PRIuMAX":%s:%s:%s%s\n",
tdata->batch_nr, typestr, path,
oid_to_hex(&oids->oid[i]),
o->flags & UNINTERESTING ? ":UNINTERESTING" : "");
}
tdata->batch_nr++;
return 0;
}
int cmd__path_walk(int argc, const char **argv)
{
int res, stdin_pl = 0;
struct rev_info revs = REV_INFO_INIT;
struct path_walk_info info = PATH_WALK_INFO_INIT;
struct path_walk_test_data data = { 0 };
struct option options[] = {
OPT_BOOL(0, "blobs", &info.blobs,
N_("toggle inclusion of blob objects")),
OPT_BOOL(0, "commits", &info.commits,
N_("toggle inclusion of commit objects")),
OPT_BOOL(0, "tags", &info.tags,
N_("toggle inclusion of tag objects")),
OPT_BOOL(0, "trees", &info.trees,
N_("toggle inclusion of tree objects")),
OPT_BOOL(0, "prune", &info.prune_all_uninteresting,
N_("toggle pruning of uninteresting paths")),
OPT_BOOL(0, "stdin-pl", &stdin_pl,
N_("read a pattern list over stdin")),
OPT_END(),
};
setup_git_directory();
revs.repo = the_repository;
argc = parse_options(argc, argv, NULL,
options, path_walk_usage,
PARSE_OPT_KEEP_UNKNOWN_OPT | PARSE_OPT_KEEP_ARGV0);
if (argc > 1)
setup_revisions(argc, argv, &revs, NULL);
else
usage(path_walk_usage[0]);
info.revs = &revs;
info.path_fn = emit_block;
info.path_fn_data = &data;
if (stdin_pl) {
struct strbuf in = STRBUF_INIT;
CALLOC_ARRAY(info.pl, 1);
info.pl->use_cone_patterns = 1;
strbuf_fread(&in, 2048, stdin);
add_patterns_from_buffer(in.buf, in.len, "", 0, info.pl);
strbuf_release(&in);
}
res = walk_objects_by_path(&info);
printf("commits:%" PRIuMAX "\n"
"trees:%" PRIuMAX "\n"
"blobs:%" PRIuMAX "\n"
"tags:%" PRIuMAX "\n",
data.commit_nr, data.tree_nr, data.blob_nr, data.tag_nr);
if (info.pl) {
clear_pattern_list(info.pl);
free(info.pl);
}
release_revisions(&revs);
return res;
}