backfill: basic functionality and tests
The default behavior of 'git backfill' is to fetch all missing blobs that are reachable from HEAD. Document and test this behavior.

The implementation is a very simple use of the path-walk API, initializing the revision walk at HEAD to start the path-walk from all commits reachable from HEAD. Ignore the object arrays that correspond to tree entries, assuming that they are all present already.

The path-walk API provides lists of objects in batches according to a common path, but that list could be very small. We want to balance the number of requests to the server with the ability to have the process interrupted with minimal repeated work to catch up in the next run. Based on some experiments (detailed in the next change) a minimum batch size of 50,000 is selected for the default.

This batch size is a _minimum_. As the path-walk API emits lists of blob IDs, they are collected into a list of objects for a request to the server. When that list is at least the minimum batch size, then the request is sent to the server for the new objects. However, the list of blob IDs from the path-walk API could be much longer than the batch size. At this moment, it is unclear whether there is a benefit to splitting the list when there are too many objects at the same path.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:

committed by
Junio C Hamano

parent
a3f79e9abd
commit
1e72e889e7
@ -1,16 +1,112 @@
|
||||
#include "builtin.h"
|
||||
#include "git-compat-util.h"
|
||||
#include "config.h"
|
||||
#include "parse-options.h"
|
||||
#include "repository.h"
|
||||
#include "commit.h"
|
||||
#include "hex.h"
|
||||
#include "tree.h"
|
||||
#include "tree-walk.h"
|
||||
#include "object.h"
|
||||
#include "object-store-ll.h"
|
||||
#include "oid-array.h"
|
||||
#include "oidset.h"
|
||||
#include "promisor-remote.h"
|
||||
#include "strmap.h"
|
||||
#include "string-list.h"
|
||||
#include "revision.h"
|
||||
#include "trace2.h"
|
||||
#include "progress.h"
|
||||
#include "packfile.h"
|
||||
#include "path-walk.h"
|
||||
|
||||
static const char * const builtin_backfill_usage[] = {
|
||||
N_("git backfill [<options>]"),
|
||||
NULL
|
||||
};
|
||||
|
||||
struct backfill_context {
|
||||
struct repository *repo;
|
||||
struct oid_array current_batch;
|
||||
size_t min_batch_size;
|
||||
};
|
||||
|
||||
static void backfill_context_clear(struct backfill_context *ctx)
|
||||
{
|
||||
oid_array_clear(&ctx->current_batch);
|
||||
}
|
||||
|
||||
static void download_batch(struct backfill_context *ctx)
|
||||
{
|
||||
promisor_remote_get_direct(ctx->repo,
|
||||
ctx->current_batch.oid,
|
||||
ctx->current_batch.nr);
|
||||
oid_array_clear(&ctx->current_batch);
|
||||
|
||||
/*
|
||||
* We likely have a new packfile. Add it to the packed list to
|
||||
* avoid possible duplicate downloads of the same objects.
|
||||
*/
|
||||
reprepare_packed_git(ctx->repo);
|
||||
}
|
||||
|
||||
/*
 * Path-walk callback: queue the blobs from 'list' that has_object()
 * reports as absent, and send the queue off once it has reached the
 * configured minimum size.
 *
 * path: unused.
 * list: object IDs emitted by the walk for one path.
 * type: object type of every entry in 'list'; anything other than
 *       OBJ_BLOB is ignored.
 * data: the struct backfill_context for this run.
 *
 * Always returns 0.
 */
static int fill_missing_blobs(const char *path UNUSED,
			      struct oid_array *list,
			      enum object_type type,
			      void *data)
{
	struct backfill_context *ctx = data;
	size_t pos = 0;

	if (type == OBJ_BLOB) {
		while (pos < list->nr) {
			const struct object_id *candidate = &list->oid[pos++];

			/* Already have it: nothing to request. */
			if (has_object(ctx->repo, candidate,
				       OBJECT_INFO_FOR_PREFETCH))
				continue;

			oid_array_append(&ctx->current_batch, candidate);
		}

		/*
		 * The threshold is checked once per list, after the whole
		 * list was scanned, so a batch can exceed the minimum.
		 */
		if (ctx->current_batch.nr >= ctx->min_batch_size)
			download_batch(ctx);
	}

	return 0;
}
|
||||
|
||||
static int do_backfill(struct backfill_context *ctx)
|
||||
{
|
||||
struct rev_info revs;
|
||||
struct path_walk_info info = PATH_WALK_INFO_INIT;
|
||||
int ret;
|
||||
|
||||
repo_init_revisions(ctx->repo, &revs, "");
|
||||
handle_revision_arg("HEAD", &revs, 0, 0);
|
||||
|
||||
info.blobs = 1;
|
||||
info.tags = info.commits = info.trees = 0;
|
||||
|
||||
info.revs = &revs;
|
||||
info.path_fn = fill_missing_blobs;
|
||||
info.path_fn_data = ctx;
|
||||
|
||||
ret = walk_objects_by_path(&info);
|
||||
|
||||
/* Download the objects that did not fill a batch. */
|
||||
if (!ret)
|
||||
download_batch(ctx);
|
||||
|
||||
path_walk_info_clear(&info);
|
||||
release_revisions(&revs);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int cmd_backfill(int argc, const char **argv, const char *prefix, struct repository *repo)
|
||||
{
|
||||
int result;
|
||||
struct backfill_context ctx = {
|
||||
.repo = repo,
|
||||
.current_batch = OID_ARRAY_INIT,
|
||||
.min_batch_size = 50000,
|
||||
};
|
||||
struct option options[] = {
|
||||
OPT_END(),
|
||||
};
|
||||
@ -22,7 +118,7 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit
|
||||
|
||||
repo_config(repo, git_default_config, NULL);
|
||||
|
||||
die(_("not implemented"));
|
||||
|
||||
return 0;
|
||||
result = do_backfill(&ctx);
|
||||
backfill_context_clear(&ctx);
|
||||
return result;
|
||||
}
|
||||
|
Reference in New Issue
Block a user