Merge branch 'ds/push-sparse-tree-walk'

"git pack-objects" learned another algorithm to compute the set of
objects to send, which trades a possibly larger resulting packfile
for a lower traversal cost, favoring small pushes.

* ds/push-sparse-tree-walk:
  pack-objects: create GIT_TEST_PACK_SPARSE
  pack-objects: create pack.useSparse setting
  revision: implement sparse algorithm
  list-objects: consume sparse tree walk
  revision: add mark_tree_uninteresting_sparse
This commit is contained in:
Junio C Hamano
2019-02-06 22:05:24 -08:00
12 changed files with 379 additions and 18 deletions

View File

@ -105,6 +105,15 @@ pack.useBitmaps::
true. You should not generally need to turn this off unless true. You should not generally need to turn this off unless
you are debugging pack bitmaps. you are debugging pack bitmaps.
pack.useSparse::
When true, git will default to using the '--sparse' option in
'git pack-objects' when the '--revs' option is present. This
algorithm only walks trees that appear in paths that introduce new
objects. This can have significant performance benefits when
computing a pack to send a small change. However, it is possible
that extra objects are added to the pack-file if the included
commits contain certain types of direct renames.
pack.writeBitmaps (deprecated):: pack.writeBitmaps (deprecated)::
This is a deprecated synonym for `repack.writeBitmaps`. This is a deprecated synonym for `repack.writeBitmaps`.

View File

@ -14,7 +14,7 @@ SYNOPSIS
[--local] [--incremental] [--window=<n>] [--depth=<n>] [--local] [--incremental] [--window=<n>] [--depth=<n>]
[--revs [--unpacked | --all]] [--keep-pack=<pack-name>] [--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
[--stdout [--filter=<filter-spec>] | base-name] [--stdout [--filter=<filter-spec>] | base-name]
[--shallow] [--keep-true-parents] < object-list [--shallow] [--keep-true-parents] [--sparse] < object-list
DESCRIPTION DESCRIPTION
@ -196,6 +196,15 @@ depth is 4095.
Add --no-reuse-object if you want to force a uniform compression Add --no-reuse-object if you want to force a uniform compression
level on all data no matter the source. level on all data no matter the source.
--sparse::
Use the "sparse" algorithm to determine which objects to include in
the pack, when combined with the "--revs" option. This algorithm
only walks trees that appear in paths that introduce new objects.
This can have significant performance benefits when computing
a pack to send a small change. However, it is possible that extra
objects are added to the pack-file if the included commits contain
certain types of direct renames.
--thin:: --thin::
Create a "thin" pack by omitting the common objects between a Create a "thin" pack by omitting the common objects between a
sender and a receiver in order to reduce network transfer. This sender and a receiver in order to reduce network transfer. This

View File

@ -658,7 +658,7 @@ static void bisect_common(struct rev_info *revs)
if (prepare_revision_walk(revs)) if (prepare_revision_walk(revs))
die("revision walk setup failed"); die("revision walk setup failed");
if (revs->tree_objects) if (revs->tree_objects)
mark_edges_uninteresting(revs, NULL); mark_edges_uninteresting(revs, NULL, 0);
} }
static void exit_if_skipped_commits(struct commit_list *tried, static void exit_if_skipped_commits(struct commit_list *tried,

View File

@ -84,6 +84,7 @@ static unsigned long pack_size_limit;
static int depth = 50; static int depth = 50;
static int delta_search_threads; static int delta_search_threads;
static int pack_to_stdout; static int pack_to_stdout;
static int sparse;
static int thin; static int thin;
static int num_preferred_base; static int num_preferred_base;
static struct progress *progress_state; static struct progress *progress_state;
@ -2703,6 +2704,10 @@ static int git_pack_config(const char *k, const char *v, void *cb)
use_bitmap_index_default = git_config_bool(k, v); use_bitmap_index_default = git_config_bool(k, v);
return 0; return 0;
} }
if (!strcmp(k, "pack.usesparse")) {
sparse = git_config_bool(k, v);
return 0;
}
if (!strcmp(k, "pack.threads")) { if (!strcmp(k, "pack.threads")) {
delta_search_threads = git_config_int(k, v); delta_search_threads = git_config_int(k, v);
if (delta_search_threads < 0) if (delta_search_threads < 0)
@ -3130,7 +3135,7 @@ static void get_object_list(int ac, const char **av)
if (prepare_revision_walk(&revs)) if (prepare_revision_walk(&revs))
die(_("revision walk setup failed")); die(_("revision walk setup failed"));
mark_edges_uninteresting(&revs, show_edge); mark_edges_uninteresting(&revs, show_edge, sparse);
if (!fn_show_object) if (!fn_show_object)
fn_show_object = show_object; fn_show_object = show_object;
@ -3287,6 +3292,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
{ OPTION_CALLBACK, 0, "unpack-unreachable", NULL, N_("time"), { OPTION_CALLBACK, 0, "unpack-unreachable", NULL, N_("time"),
N_("unpack unreachable objects newer than <time>"), N_("unpack unreachable objects newer than <time>"),
PARSE_OPT_OPTARG, option_parse_unpack_unreachable }, PARSE_OPT_OPTARG, option_parse_unpack_unreachable },
OPT_BOOL(0, "sparse", &sparse,
N_("use the sparse reachability algorithm")),
OPT_BOOL(0, "thin", &thin, OPT_BOOL(0, "thin", &thin,
N_("create thin packs")), N_("create thin packs")),
OPT_BOOL(0, "shallow", &shallow, OPT_BOOL(0, "shallow", &shallow,
@ -3319,6 +3326,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
read_replace_refs = 0; read_replace_refs = 0;
sparse = git_env_bool("GIT_TEST_PACK_SPARSE", 0);
reset_pack_idx_option(&pack_idx_opts); reset_pack_idx_option(&pack_idx_opts);
git_config(git_pack_config, NULL); git_config(git_pack_config, NULL);

View File

@ -546,7 +546,7 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
if (prepare_revision_walk(&revs)) if (prepare_revision_walk(&revs))
die("revision walk setup failed"); die("revision walk setup failed");
if (revs.tree_objects) if (revs.tree_objects)
mark_edges_uninteresting(&revs, show_edge); mark_edges_uninteresting(&revs, show_edge, 0);
if (bisect_list) { if (bisect_list) {
int reaches, all; int reaches, all;

View File

@ -1933,7 +1933,7 @@ int cmd_main(int argc, const char **argv)
pushing = 0; pushing = 0;
if (prepare_revision_walk(&revs)) if (prepare_revision_walk(&revs))
die("revision walk setup failed"); die("revision walk setup failed");
mark_edges_uninteresting(&revs, NULL); mark_edges_uninteresting(&revs, NULL, 0);
objects_to_send = get_delta(&revs, ref_lock); objects_to_send = get_delta(&revs, ref_lock);
finish_all_active_slots(); finish_all_active_slots();

View File

@ -226,14 +226,60 @@ static void mark_edge_parents_uninteresting(struct commit *commit,
} }
} }
void mark_edges_uninteresting(struct rev_info *revs, show_edge_fn show_edge) static void add_edge_parents(struct commit *commit,
struct rev_info *revs,
show_edge_fn show_edge,
struct oidset *set)
{
/*
 * add_edge_parents: for every parent of `commit`, insert the parent's
 * root tree OID into `set`.  When the parent is UNINTERESTING its root
 * tree is flagged UNINTERESTING as well, and, if revs->edge_hint is set
 * and the parent has not been SHOWN yet, the parent is flagged SHOWN
 * and passed to show_edge().
 *
 * NOTE(review): show_edge is invoked without a NULL check; callers that
 * reach this path with revs->edge_hint set must supply a callback.
 */
struct commit_list *parents;
for (parents = commit->parents; parents; parents = parents->next) {
struct commit *parent = parents->item;
struct tree *tree = get_commit_tree(parent);
/* no root tree available for this parent: nothing to record */
if (!tree)
continue;
/* every parent tree joins the set, interesting or not */
oidset_insert(set, &tree->object.oid);
if (!(parent->object.flags & UNINTERESTING))
continue;
tree->object.flags |= UNINTERESTING;
/* report each edge commit at most once */
if (revs->edge_hint && !(parent->object.flags & SHOWN)) {
parent->object.flags |= SHOWN;
show_edge(parent);
}
}
}
void mark_edges_uninteresting(struct rev_info *revs,
show_edge_fn show_edge,
int sparse)
{ {
struct commit_list *list; struct commit_list *list;
int i; int i;
if (sparse) {
struct oidset set;
oidset_init(&set, 16);
for (list = revs->commits; list; list = list->next) { for (list = revs->commits; list; list = list->next) {
struct commit *commit = list->item; struct commit *commit = list->item;
struct tree *tree = get_commit_tree(commit);
if (commit->object.flags & UNINTERESTING)
tree->object.flags |= UNINTERESTING;
oidset_insert(&set, &tree->object.oid);
add_edge_parents(commit, revs, show_edge, &set);
}
mark_trees_uninteresting_sparse(revs->repo, &set);
oidset_clear(&set);
} else {
for (list = revs->commits; list; list = list->next) {
struct commit *commit = list->item;
if (commit->object.flags & UNINTERESTING) { if (commit->object.flags & UNINTERESTING) {
mark_tree_uninteresting(revs->repo, mark_tree_uninteresting(revs->repo,
get_commit_tree(commit)); get_commit_tree(commit));
@ -245,6 +291,8 @@ void mark_edges_uninteresting(struct rev_info *revs, show_edge_fn show_edge)
} }
mark_edge_parents_uninteresting(commit, revs, show_edge); mark_edge_parents_uninteresting(commit, revs, show_edge);
} }
}
if (revs->edge_hint_aggressive) { if (revs->edge_hint_aggressive) {
for (i = 0; i < revs->cmdline.nr; i++) { for (i = 0; i < revs->cmdline.nr; i++) {
struct object *obj = revs->cmdline.rev[i].item; struct object *obj = revs->cmdline.rev[i].item;

View File

@ -10,7 +10,9 @@ typedef void (*show_object_fn)(struct object *, const char *, void *);
void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, void *); void traverse_commit_list(struct rev_info *, show_commit_fn, show_object_fn, void *);
typedef void (*show_edge_fn)(struct commit *); typedef void (*show_edge_fn)(struct commit *);
void mark_edges_uninteresting(struct rev_info *, show_edge_fn); void mark_edges_uninteresting(struct rev_info *revs,
show_edge_fn show_edge,
int sparse);
struct oidset; struct oidset;
struct list_objects_filter_options; struct list_objects_filter_options;

View File

@ -27,6 +27,7 @@
#include "commit-reach.h" #include "commit-reach.h"
#include "commit-graph.h" #include "commit-graph.h"
#include "prio-queue.h" #include "prio-queue.h"
#include "hashmap.h"
volatile show_early_output_fn_t show_early_output; volatile show_early_output_fn_t show_early_output;
@ -99,6 +100,148 @@ void mark_tree_uninteresting(struct repository *r, struct tree *tree)
mark_tree_contents_uninteresting(r, tree); mark_tree_contents_uninteresting(r, tree);
} }
/*
 * Hashmap entry that groups tree object IDs by the path string under
 * which they were inserted (see paths_and_oids_insert()).
 */
struct path_and_oids_entry {
/* kept first: pointers to this struct are passed to and cast from the hashmap API */
struct hashmap_entry ent;
/* owned copy of the path (xstrdup'ed on insert, freed in paths_and_oids_clear()) */
char *path;
/* tree OIDs recorded for this path */
struct oidset trees;
};
/*
 * Comparison callback for the path -> tree-OIDs hashmap.
 *
 * Two entries are considered equal when their path strings are equal.
 * Returns the raw strcmp() result (0 on a match).  The callback data
 * and keydata arguments are not used.
 */
static int path_and_oids_cmp(const void *hashmap_cmp_fn_data,
			     const struct path_and_oids_entry *e1,
			     const struct path_and_oids_entry *e2,
			     const void *keydata)
{
	const char *lhs = e1->path;
	const char *rhs = e2->path;

	return strcmp(lhs, rhs);
}
static void paths_and_oids_init(struct hashmap *map)
{
hashmap_init(map, (hashmap_cmp_fn) path_and_oids_cmp, NULL, 0);
}
static void paths_and_oids_clear(struct hashmap *map)
{
struct hashmap_iter iter;
struct path_and_oids_entry *entry;
hashmap_iter_init(map, &iter);
while ((entry = (struct path_and_oids_entry *)hashmap_iter_next(&iter))) {
oidset_clear(&entry->trees);
free(entry->path);
}
hashmap_free(map, 1);
}
/*
 * Record that tree `oid` was seen under `path`.
 *
 * Looks up the entry for `path` in `map`, creating it (with its own
 * copy of the path string and an empty OID set) when it does not exist
 * yet, and then adds `oid` to that entry's set.
 */
static void paths_and_oids_insert(struct hashmap *map,
				  const char *path,
				  const struct object_id *oid)
{
	/* strhash() yields an unsigned hash; keep it unsigned end to end */
	unsigned int hash = strhash(path);
	struct path_and_oids_entry key;
	struct path_and_oids_entry *entry;

	hashmap_entry_init(&key, hash);

	/* use a shallow copy for the lookup */
	key.path = (char *)path;
	oidset_init(&key.trees, 0);

	entry = hashmap_get(map, &key, NULL);
	if (!entry) {
		entry = xcalloc(1, sizeof(*entry));
		hashmap_entry_init(entry, hash);
		/* the stored entry must own its path; the caller's buffer may go away */
		entry->path = xstrdup(path);
		oidset_init(&entry->trees, 16);
		hashmap_put(map, entry);
	}

	oidset_insert(&entry->trees, oid);
}
static void add_children_by_path(struct repository *r,
struct tree *tree,
struct hashmap *map)
{
struct tree_desc desc;
struct name_entry entry;
if (!tree)
return;
if (parse_tree_gently(tree, 1) < 0)
return;
init_tree_desc(&desc, tree->buffer, tree->size);
while (tree_entry(&desc, &entry)) {
switch (object_type(entry.mode)) {
case OBJ_TREE:
paths_and_oids_insert(map, entry.path, &entry.oid);
if (tree->object.flags & UNINTERESTING) {
struct tree *child = lookup_tree(r, &entry.oid);
if (child)
child->object.flags |= UNINTERESTING;
}
break;
case OBJ_BLOB:
if (tree->object.flags & UNINTERESTING) {
struct blob *child = lookup_blob(r, &entry.oid);
if (child)
child->object.flags |= UNINTERESTING;
}
break;
default:
/* Subproject commit - not in this repository */
break;
}
}
free_tree_buffer(tree);
}
/*
 * Propagate UNINTERESTING flags downwards from the trees in `trees`.
 *
 * If the set does not contain at least one UNINTERESTING tree and at
 * least one tree without that flag, nothing is walked.  Otherwise the
 * children of every tree in the set are grouped by entry path (marking
 * the children of UNINTERESTING trees along the way), and the function
 * recurses into each per-path group of subtrees.
 */
void mark_trees_uninteresting_sparse(struct repository *r,
				     struct oidset *trees)
{
	unsigned has_interesting = 0, has_uninteresting = 0;
	struct hashmap map;
	struct hashmap_iter map_iter;
	struct path_and_oids_entry *entry;
	struct object_id *oid;
	struct oidset_iter iter;

	/* Stop scanning as soon as one tree of each kind has been seen. */
	oidset_iter_init(trees, &iter);
	while (!(has_interesting && has_uninteresting)) {
		struct tree *tree;

		oid = oidset_iter_next(&iter);
		if (!oid)
			break;

		tree = lookup_tree(r, oid);
		if (!tree)
			continue;

		if (tree->object.flags & UNINTERESTING)
			has_uninteresting = 1;
		else
			has_interesting = 1;
	}

	/* Do not walk unless we have both types of trees. */
	if (!(has_uninteresting && has_interesting))
		return;

	paths_and_oids_init(&map);

	/* Group the children of every tree in the set by their path. */
	oidset_iter_init(trees, &iter);
	for (oid = oidset_iter_next(&iter); oid; oid = oidset_iter_next(&iter))
		add_children_by_path(r, lookup_tree(r, oid), &map);

	/* Descend into each path's group of subtrees. */
	hashmap_iter_init(&map, &map_iter);
	for (entry = hashmap_iter_next(&map_iter);
	     entry;
	     entry = hashmap_iter_next(&map_iter))
		mark_trees_uninteresting_sparse(r, &entry->trees);

	paths_and_oids_clear(&map);
}
struct commit_stack { struct commit_stack {
struct commit **items; struct commit **items;
size_t nr, alloc; size_t nr, alloc;

View File

@ -67,6 +67,7 @@ struct rev_cmdline_info {
#define REVISION_WALK_NO_WALK_SORTED 1 #define REVISION_WALK_NO_WALK_SORTED 1
#define REVISION_WALK_NO_WALK_UNSORTED 2 #define REVISION_WALK_NO_WALK_UNSORTED 2
struct oidset;
struct topo_walk_info; struct topo_walk_info;
struct rev_info { struct rev_info {
@ -327,6 +328,7 @@ void put_revision_mark(const struct rev_info *revs,
void mark_parents_uninteresting(struct commit *commit); void mark_parents_uninteresting(struct commit *commit);
void mark_tree_uninteresting(struct repository *r, struct tree *tree); void mark_tree_uninteresting(struct repository *r, struct tree *tree);
void mark_trees_uninteresting_sparse(struct repository *r, struct oidset *trees);
void show_object_with_name(FILE *, struct object *, const char *); void show_object_with_name(FILE *, struct object *, const char *);

View File

@ -358,6 +358,10 @@ GIT_TEST_INDEX_VERSION=<n> exercises the index read/write code path
for the index version specified. Can be set to any valid version for the index version specified. Can be set to any valid version
(currently 2, 3, or 4). (currently 2, 3, or 4).
GIT_TEST_PACK_SPARSE=<boolean> if enabled will default the pack-objects
builtin to use the sparse object walk. This can still be overridden by
the --no-sparse command-line argument.
GIT_TEST_PRELOAD_INDEX=<boolean> exercises the preload-index code path GIT_TEST_PRELOAD_INDEX=<boolean> exercises the preload-index code path
by overriding the minimum number of cache entries required per thread. by overriding the minimum number of cache entries required per thread.

136
t/t5322-pack-objects-sparse.sh Executable file
View File

@ -0,0 +1,136 @@
#!/bin/sh
test_description='pack-objects object selection using sparse algorithm'
. ./test-lib.sh
# Build the fixture: directories f1..f3, each with subdirectories f1..f3
# holding one data.txt, committed on master; then branches topic1..topic3
# where topic$i changes only f$i/f$i/data.txt.  Finally write the
# pack-objects input (topic1, excluding topic2 and topic3) and the sorted
# list of objects that pack is expected to contain.
#
# Loops end in "|| return 1" so that a failure in any iteration fails the
# test, and git output goes through a file rather than a pipe so that
# git's exit status is not lost.
test_expect_success 'setup repo' '
	test_commit initial &&
	for i in $(test_seq 1 3)
	do
		mkdir f$i &&
		for j in $(test_seq 1 3)
		do
			mkdir f$i/f$j &&
			echo $j >f$i/f$j/data.txt || return 1
		done || return 1
	done &&
	git add . &&
	git commit -m "Initialized trees" &&
	for i in $(test_seq 1 3)
	do
		git checkout -b topic$i master &&
		echo change-$i >f$i/f$i/data.txt &&
		git commit -a -m "Changed f$i/f$i/data.txt" || return 1
	done &&
	cat >packinput.txt <<-EOF &&
	topic1
	^topic2
	^topic3
	EOF
	git rev-parse \
		topic1 \
		topic1^{tree} \
		topic1:f1 \
		topic1:f1/f1 \
		topic1:f1/f1/data.txt >expect_objects.raw &&
	sort <expect_objects.raw >expect_objects.txt
'
# Baseline: with --no-sparse the pack must contain exactly the objects
# listed in expect_objects.txt.  show-index output is written to a file
# first so that a failing git command is not hidden behind a pipe.
test_expect_success 'non-sparse pack-objects' '
	git pack-objects --stdout --revs --no-sparse <packinput.txt >nonsparse.pack &&
	git index-pack -o nonsparse.idx nonsparse.pack &&
	git show-index <nonsparse.idx >nonsparse_index.txt &&
	awk "{print \$2}" <nonsparse_index.txt >nonsparse_objects.txt &&
	test_cmp expect_objects.txt nonsparse_objects.txt
'
# With --sparse the pack for the same input must contain exactly the
# same objects as expect_objects.txt.  show-index output is written to a
# file first so that a failing git command is not hidden behind a pipe.
test_expect_success 'sparse pack-objects' '
	git pack-objects --stdout --revs --sparse <packinput.txt >sparse.pack &&
	git index-pack -o sparse.idx sparse.pack &&
	git show-index <sparse.idx >sparse_index.txt &&
	awk "{print \$2}" <sparse_index.txt >sparse_objects.txt &&
	test_cmp expect_objects.txt sparse_objects.txt
'
# On topic1, give f3/f3/data.txt the same content ("change-3") that the
# setup committed on topic3, then record the sorted list of objects that
# any pack for packinput.txt must at least contain.  rev-parse output is
# written to a file before sorting so that git's exit status is checked.
test_expect_success 'duplicate a folder from f3 and commit to topic1' '
	git checkout topic1 &&
	echo change-3 >f3/f3/data.txt &&
	git commit -a -m "Changed f3/f3/data.txt" &&
	git rev-parse \
		topic1~1 \
		topic1~1^{tree} \
		topic1^{tree} \
		topic1 \
		topic1:f1 \
		topic1:f1/f1 \
		topic1:f1/f1/data.txt >required_objects.raw &&
	sort <required_objects.raw >required_objects.txt
'
# The --no-sparse pack must contain every object in required_objects.txt:
# "comm -1 -2" keeps only lines common to both files, and that
# intersection must equal the required list.  comm needs sorted input;
# presumably show-index lists objects in sorted order - verify.
# show-index output goes through a file so git's exit status is checked.
test_expect_success 'non-sparse pack-objects' '
	git pack-objects --stdout --revs --no-sparse <packinput.txt >nonsparse.pack &&
	git index-pack -o nonsparse.idx nonsparse.pack &&
	git show-index <nonsparse.idx >nonsparse_index.txt &&
	awk "{print \$2}" <nonsparse_index.txt >nonsparse_objects.txt &&
	comm -1 -2 required_objects.txt nonsparse_objects.txt >nonsparse_required_objects.txt &&
	test_cmp required_objects.txt nonsparse_required_objects.txt
'
# The --sparse pack must also contain every object in
# required_objects.txt (extra objects are tolerated): the intersection
# computed by "comm -1 -2" must equal the required list.
# show-index output goes through a file so git's exit status is checked.
test_expect_success 'sparse pack-objects' '
	git pack-objects --stdout --revs --sparse <packinput.txt >sparse.pack &&
	git index-pack -o sparse.idx sparse.pack &&
	git show-index <sparse.idx >sparse_index.txt &&
	awk "{print \$2}" <sparse_index.txt >sparse_objects.txt &&
	comm -1 -2 required_objects.txt sparse_objects.txt >sparse_required_objects.txt &&
	test_cmp required_objects.txt sparse_required_objects.txt
'
# Demonstrate that the algorithms differ when we copy a tree wholesale
# from one folder to another.
#
# Copies f1/f1 to the new path f3/f4 on topic1, switches the pack input
# to "topic1 minus its parent", and records the objects every pack for
# that input must contain.  rev-parse output is written to a file before
# sorting so that git's exit status is checked.
test_expect_success 'duplicate a folder from f1 into f3' '
	mkdir f3/f4 &&
	cp -r f1/f1/* f3/f4 &&
	git add f3/f4 &&
	git commit -m "Copied f1/f1 to f3/f4" &&
	cat >packinput.txt <<-EOF &&
	topic1
	^topic1~1
	EOF
	git rev-parse \
		topic1 \
		topic1^{tree} \
		topic1:f3 >required_objects.raw &&
	sort <required_objects.raw >required_objects.txt
'
# After the wholesale copy, the --no-sparse pack must still contain
# every object in required_objects.txt.
# show-index output goes through a file so git's exit status is checked.
test_expect_success 'non-sparse pack-objects' '
	git pack-objects --stdout --revs --no-sparse <packinput.txt >nonsparse.pack &&
	git index-pack -o nonsparse.idx nonsparse.pack &&
	git show-index <nonsparse.idx >nonsparse_index.txt &&
	awk "{print \$2}" <nonsparse_index.txt >nonsparse_objects.txt &&
	comm -1 -2 required_objects.txt nonsparse_objects.txt >nonsparse_required_objects.txt &&
	test_cmp required_objects.txt nonsparse_required_objects.txt
'
# After the wholesale copy, the --sparse pack is expected to contain
# exactly the required objects plus the copied tree f3/f4 and its
# data.txt blob.  Output of git commands is written to files rather than
# piped so that git's exit status is checked.
test_expect_success 'sparse pack-objects' '
	git rev-parse \
		topic1 \
		topic1^{tree} \
		topic1:f3 \
		topic1:f3/f4 \
		topic1:f3/f4/data.txt >expect_sparse_objects.raw &&
	sort <expect_sparse_objects.raw >expect_sparse_objects.txt &&
	git pack-objects --stdout --revs --sparse <packinput.txt >sparse.pack &&
	git index-pack -o sparse.idx sparse.pack &&
	git show-index <sparse.idx >sparse_index.txt &&
	awk "{print \$2}" <sparse_index.txt >sparse_objects.txt &&
	test_cmp expect_sparse_objects.txt sparse_objects.txt
'
# With pack.useSparse=true and no --sparse on the command line, the pack
# must match the object list expected from the sparse algorithm.
# show-index output goes through a file so git's exit status is checked.
test_expect_success 'pack.useSparse enables algorithm' '
	git config pack.useSparse true &&
	git pack-objects --stdout --revs <packinput.txt >sparse.pack &&
	git index-pack -o sparse.idx sparse.pack &&
	git show-index <sparse.idx >sparse_index.txt &&
	awk "{print \$2}" <sparse_index.txt >sparse_objects.txt &&
	test_cmp expect_sparse_objects.txt sparse_objects.txt
'
# --no-sparse on the command line must win over the pack.useSparse
# setting left enabled by the previous test: the pack must contain
# exactly the objects in required_objects.txt.
# show-index output goes through a file so git's exit status is checked.
test_expect_success 'pack.useSparse overridden' '
	git pack-objects --stdout --revs --no-sparse <packinput.txt >sparse.pack &&
	git index-pack -o sparse.idx sparse.pack &&
	git show-index <sparse.idx >sparse_index.txt &&
	awk "{print \$2}" <sparse_index.txt >sparse_objects.txt &&
	test_cmp required_objects.txt sparse_objects.txt
'
test_done