From 202e7f1e161b5bce6587d1a696843ead10a8b477 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 10 Aug 2018 19:09:06 -0400 Subject: [PATCH 01/11] for_each_*_object: store flag definitions in a single location These flags were split between cache.h and packfile.h, because some of the flags apply only to packs. However, they share a single numeric namespace, since both are respected for the packed variant. Let's make sure they're defined together so that nobody accidentally adds a new flag in one location that duplicates the other. While we're here, let's also put them in an enum (which helps debugger visibility) and use "(1< Signed-off-by: Junio C Hamano --- cache.h | 13 ++++++++++++- packfile.h | 8 ++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/cache.h b/cache.h index 8dc7134f00..4187238ecf 100644 --- a/cache.h +++ b/cache.h @@ -1623,12 +1623,23 @@ int for_each_loose_file_in_objdir_buf(struct strbuf *path, each_loose_subdir_fn subdir_cb, void *data); +/* + * Flags for for_each_*_object(), including for_each_loose below and + * for_each_packed in packfile.h. + */ +enum for_each_object_flags { + /* Iterate only over local objects, not alternates. */ + FOR_EACH_OBJECT_LOCAL_ONLY = (1<<0), + + /* Only iterate over packs obtained from the promisor remote. */ + FOR_EACH_OBJECT_PROMISOR_ONLY = (1<<1), +}; + /* * Iterate over loose objects in both the local * repository and any alternates repositories (unless the * LOCAL_ONLY flag is set). */ -#define FOR_EACH_OBJECT_LOCAL_ONLY 0x1 extern int for_each_loose_object(each_loose_object_fn, void *, unsigned flags); /* diff --git a/packfile.h b/packfile.h index cc7eaffe1b..6ddc6a2e91 100644 --- a/packfile.h +++ b/packfile.h @@ -148,15 +148,11 @@ extern int has_object_pack(const struct object_id *oid); extern int has_pack_index(const unsigned char *sha1); -/* - * Only iterate over packs obtained from the promisor remote. - */ -#define FOR_EACH_OBJECT_PROMISOR_ONLY 2 - /* * Iterate over packed objects in both the local * repository and any alternates repositories (unless the - * FOR_EACH_OBJECT_LOCAL_ONLY flag, defined in cache.h, is set). + * FOR_EACH_OBJECT_LOCAL_ONLY flag is set). See cache.h for the complete list + * of flags. */ typedef int each_packed_object_fn(const struct object_id *oid, struct packed_git *pack, From a7ff6f5a0f310406aa4973e8d7ec25815554bcb5 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 10 Aug 2018 19:09:44 -0400 Subject: [PATCH 02/11] for_each_*_object: take flag arguments as enum It's not wrong to pass our flags in an "unsigned", as we know it will be at least as large as the enum. However, using the enum in the declaration makes it more obvious where to find the list of flags. While we're here, let's also drop the "extern" noise-words from the declarations, per our modern coding style. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- cache.h | 3 ++- packfile.c | 3 ++- packfile.h | 5 +++-- sha1-file.c | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cache.h b/cache.h index 4187238ecf..9e02fc494e 100644 --- a/cache.h +++ b/cache.h @@ -1640,7 +1640,8 @@ enum for_each_object_flags { * repository and any alternates repositories (unless the * LOCAL_ONLY flag is set). */ -extern int for_each_loose_object(each_loose_object_fn, void *, unsigned flags); +int for_each_loose_object(each_loose_object_fn, void *, + enum for_each_object_flags flags); /* * Set this to 0 to prevent sha1_object_info_extended() from fetching missing diff --git a/packfile.c b/packfile.c index 6974903e58..9da8f6d728 100644 --- a/packfile.c +++ b/packfile.c @@ -1904,7 +1904,8 @@ int for_each_object_in_pack(struct packed_git *p, each_packed_object_fn cb, void return r; } -int for_each_packed_object(each_packed_object_fn cb, void *data, unsigned flags) +int for_each_packed_object(each_packed_object_fn cb, void *data, + enum for_each_object_flags flags) { struct packed_git *p; int r = 0; diff --git a/packfile.h b/packfile.h index 6ddc6a2e91..9861728514 100644 --- a/packfile.h +++ b/packfile.h @@ -158,8 +158,9 @@ typedef int each_packed_object_fn(const struct object_id *oid, struct packed_git *pack, uint32_t pos, void *data); -extern int for_each_object_in_pack(struct packed_git *p, each_packed_object_fn, void *data); -extern int for_each_packed_object(each_packed_object_fn, void *, unsigned flags); +int for_each_object_in_pack(struct packed_git *p, each_packed_object_fn, void *data); +int for_each_packed_object(each_packed_object_fn, void *, + enum for_each_object_flags flags); /* * Return 1 if an object in a promisor packfile is or refers to the given diff --git a/sha1-file.c b/sha1-file.c index dfa8a35d68..cc0b57a751 100644 --- a/sha1-file.c +++ b/sha1-file.c @@ -2146,7 +2146,8 @@ static int loose_from_alt_odb(struct alternate_object_database *alt, return r; } -int for_each_loose_object(each_loose_object_fn cb, void *data, unsigned flags) +int for_each_loose_object(each_loose_object_fn cb, void *data, + enum for_each_object_flags flags) { struct loose_alt_odb_data alt; int r; From 8b361551900be9bedd946386362f2d0e2a506845 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 10 Aug 2018 19:11:14 -0400 Subject: [PATCH 03/11] for_each_*_object: give more comprehensive docstrings We already mention the local/alternate behavior of these functions, but we can help clarify a few other behaviors: - there's no need to mention LOCAL_ONLY specifically, since we already reference the flags by type (and as we add more flags, we don't want to have to mention each) - clarify that reachability doesn't matter here; this is all accessible objects - what ordering/uniqueness guarantees we give - how pack-specific flags are handled for the loose case Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- cache.h | 8 +++++--- packfile.h | 12 ++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cache.h b/cache.h index 9e02fc494e..f438540f9b 100644 --- a/cache.h +++ b/cache.h @@ -1636,9 +1636,11 @@ enum for_each_object_flags { }; /* - * Iterate over loose objects in both the local - * repository and any alternates repositories (unless the - * LOCAL_ONLY flag is set). + * Iterate over all accessible loose objects without respect to + * reachability. By default, this includes both local and alternate objects. + * The order in which objects are visited is unspecified. + * + * Any flags specific to packs are ignored. */ int for_each_loose_object(each_loose_object_fn, void *, enum for_each_object_flags flags); diff --git a/packfile.h b/packfile.h index 9861728514..c86a8c2716 100644 --- a/packfile.h +++ b/packfile.h @@ -149,10 +149,14 @@ extern int has_object_pack(const struct object_id *oid); extern int has_pack_index(const unsigned char *sha1); /* - * Iterate over packed objects in both the local - * repository and any alternates repositories (unless the - * FOR_EACH_OBJECT_LOCAL_ONLY flag is set). See cache.h for the complete list - * of flags. + * Iterate over all accessible packed objects without respect to reachability. + * By default, this includes both local and alternate packs. + * + * Note that some objects may appear twice if they are found in multiple packs. + * Each pack is visited in an unspecified order. Objects within a pack are + * visited in pack-idx order (i.e., sorted by oid). + * + * The list of flags can be found in cache.h. */ typedef int each_packed_object_fn(const struct object_id *oid, struct packed_git *pack, From 736eb88fdc8a2dea4302114d2f74b580d0f83cfe Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 10 Aug 2018 19:15:49 -0400 Subject: [PATCH 04/11] for_each_packed_object: support iterating in pack-order We currently iterate over objects within a pack in .idx order, which uses the object hashes. That means that it is effectively random with respect to the location of the object within the pack. If you're going to access the actual object data, there are two reasons to move linearly through the pack itself: 1. It improves the locality of access in the packfile. In the cold-cache case, this may mean fewer disk seeks, or better usage of disk cache. 2. We store related deltas together in the packfile. Which means that the delta base cache can operate much more efficiently if we visit all of those related deltas in sequence, as the earlier items are likely to still be in the cache. Whereas if we visit the objects in random order, our cache entries are much more likely to have been evicted by unrelated deltas in the meantime. So in general, if you're going to access the object contents pack order is generally going to end up more efficient. But if you're simply generating a list of object names, or if you're going to end up sorting the result anyway, you're better off just using the .idx order, as finding the pack order means generating the in-memory pack-revindex. According to the numbers in 8b8dfd5132 (pack-revindex: radix-sort the revindex, 2013-07-11), that takes about 200ms for linux.git, and 20ms for git.git (those numbers are a few years old but are still a good ballpark). That makes it a good optimization for some cases (we can save tens of seconds in git.git by having good locality of delta access, for a 20ms cost), but a bad one for others (e.g., right now "cat-file --batch-all-objects --batch-check="%(objectname)" is 170ms in git.git, so adding 20ms to that is noticeable). Hence this patch makes it an optional flag. You can't actually do any interesting timings yet, as it's not plumbed through to any user-facing tools like cat-file. That will come in a later patch. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- cache.h | 5 +++++ commit-graph.c | 2 +- packfile.c | 23 +++++++++++++++++------ packfile.h | 8 +++++--- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/cache.h b/cache.h index f438540f9b..6d14702df2 100644 --- a/cache.h +++ b/cache.h @@ -1633,6 +1633,11 @@ enum for_each_object_flags { /* Only iterate over packs obtained from the promisor remote. */ FOR_EACH_OBJECT_PROMISOR_ONLY = (1<<1), + + /* + * Visit objects within a pack in packfile order rather than .idx order + */ + FOR_EACH_OBJECT_PACK_ORDER = (1<<2), }; /* diff --git a/commit-graph.c b/commit-graph.c index b0a55ad128..69a0d1c203 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -730,7 +730,7 @@ void write_commit_graph(const char *obj_dir, die("error adding pack %s", packname.buf); if (open_pack_index(p)) die("error opening index for %s", packname.buf); - for_each_object_in_pack(p, add_packed_commits, &oids); + for_each_object_in_pack(p, add_packed_commits, &oids, 0); close_pack(p); } strbuf_release(&packname); diff --git a/packfile.c b/packfile.c index 9da8f6d728..ebcb5742ec 100644 --- a/packfile.c +++ b/packfile.c @@ -1885,19 +1885,30 @@ int has_pack_index(const unsigned char *sha1) return 1; } -int for_each_object_in_pack(struct packed_git *p, each_packed_object_fn cb, void *data) +int for_each_object_in_pack(struct packed_git *p, + each_packed_object_fn cb, void *data, + enum for_each_object_flags flags) { uint32_t i; int r = 0; + if (flags & FOR_EACH_OBJECT_PACK_ORDER) + load_pack_revindex(p); + for (i = 0; i < p->num_objects; i++) { + uint32_t pos; struct object_id oid; - if (!nth_packed_object_oid(&oid, p, i)) - return error("unable to get sha1 of object %u in %s", - i, p->pack_name); + if (flags & FOR_EACH_OBJECT_PACK_ORDER) + pos = p->revindex[i].nr; + else + pos = i; - r = cb(&oid, p, i, data); + if (!nth_packed_object_oid(&oid, p, pos)) + return error("unable to get sha1 of object %u in %s", + pos, p->pack_name); + + r = cb(&oid, p, pos, data); if (r) break; } @@ -1922,7 +1933,7 @@ int for_each_packed_object(each_packed_object_fn cb, void *data, pack_errors = 1; continue; } - r = for_each_object_in_pack(p, cb, data); + r = for_each_object_in_pack(p, cb, data, flags); if (r) break; } diff --git a/packfile.h b/packfile.h index c86a8c2716..99411bdd85 100644 --- a/packfile.h +++ b/packfile.h @@ -153,8 +153,8 @@ extern int has_pack_index(const unsigned char *sha1); * By default, this includes both local and alternate packs. * * Note that some objects may appear twice if they are found in multiple packs. - * Each pack is visited in an unspecified order. Objects within a pack are - * visited in pack-idx order (i.e., sorted by oid). + * Each pack is visited in an unspecified order. By default, objects within a + * pack are visited in pack-idx order (i.e., sorted by oid). * * The list of flags can be found in cache.h. */ @@ -162,7 +162,9 @@ typedef int each_packed_object_fn(const struct object_id *oid, struct packed_git *pack, uint32_t pos, void *data); -int for_each_object_in_pack(struct packed_git *p, each_packed_object_fn, void *data); +int for_each_object_in_pack(struct packed_git *p, + each_packed_object_fn, void *data, + enum for_each_object_flags flags); int for_each_packed_object(each_packed_object_fn, void *, enum for_each_object_flags flags); From aa2f5ef5004704cd01282a1f1a9f99459c9dc021 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 10 Aug 2018 19:16:40 -0400 Subject: [PATCH 05/11] t1006: test cat-file --batch-all-objects with duplicates The test for --batch-all-objects in t1006 covers a variety of object storage situations, but one thing it doesn't cover is that we avoid mentioning duplicate objects. We won't have any because running "git repack -ad" will have packed them all and deleted the loose ones. This does work (because we sort and de-dup the output list), but it's good to include it in our test. And doubly so for when we add an unordered mode which has to de-dup in a different way. Note that we cannot just re-create one of the objects, as Git will omit the write of an object that is already present. However, we can create a new pack with one of the objects, which forces the duplication. One alternative would be to just use "git repack -a" instead of "-ad". But then _every_ object would be duplicated as loose and packed, and we might miss a bug that omits packed objects (because we'd show their loose counterparts). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- t/t1006-cat-file.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh index 13dd510b2e..4fb5e098f2 100755 --- a/t/t1006-cat-file.sh +++ b/t/t1006-cat-file.sh @@ -550,8 +550,8 @@ test_expect_success 'git cat-file --batch --follow-symlink returns correct sha a test_expect_success 'cat-file --batch-all-objects shows all objects' ' # make new repos so we know the full set of objects; we will # also make sure that there are some packed and some loose - # objects, some referenced and some not, and that there are - # some available only via alternates. + # objects, some referenced and some not, some duplicates, and that + # there are some available only via alternates. git init all-one && ( cd all-one && @@ -567,6 +567,8 @@ test_expect_success 'cat-file --batch-all-objects shows all objects' ' cd all-two && echo local-unref | git hash-object -w --stdin ) >>expect.unsorted && + git -C all-two rev-parse HEAD:file | + git -C all-two pack-objects .git/objects/pack/pack && sort expect && git -C all-two cat-file --batch-all-objects \ --batch-check="%(objectname)" >actual && From b1adb38458679af98a057bcdc988a7f6ce1247d6 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 10 Aug 2018 19:17:14 -0400 Subject: [PATCH 06/11] cat-file: rename batch_{loose,packed}_object callbacks We're not really doing the batch-show operation in these callbacks, but just collecting the set of objects. That distinction will become more important in a future patch, so let's rename them now to avoid cluttering that diff. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 4a44b2404f..2d34f3b867 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -420,18 +420,18 @@ static int batch_object_cb(const struct object_id *oid, void *vdata) return 0; } -static int batch_loose_object(const struct object_id *oid, - const char *path, - void *data) +static int collect_loose_object(const struct object_id *oid, + const char *path, + void *data) { oid_array_append(data, oid); return 0; } -static int batch_packed_object(const struct object_id *oid, - struct packed_git *pack, - uint32_t pos, - void *data) +static int collect_packed_object(const struct object_id *oid, + struct packed_git *pack, + uint32_t pos, + void *data) { oid_array_append(data, oid); return 0; @@ -476,8 +476,8 @@ static int batch_objects(struct batch_options *opt) struct oid_array sa = OID_ARRAY_INIT; struct object_cb_data cb; - for_each_loose_object(batch_loose_object, &sa, 0); - for_each_packed_object(batch_packed_object, &sa, 0); + for_each_loose_object(collect_loose_object, &sa, 0); + for_each_packed_object(collect_packed_object, &sa, 0); if (repository_format_partial_clone) warning("This repository has extensions.partialClone set. Some objects may not be loaded."); From 0750bb5b51f021ecad6f33b7ec88cdfc2a8cdff4 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 10 Aug 2018 19:24:57 -0400 Subject: [PATCH 07/11] cat-file: support "unordered" output for --batch-all-objects If you're going to access the contents of every object in a packfile, it's generally much more efficient to do so in pack order, rather than in hash order. That increases the locality of access within the packfile, which in turn is friendlier to the delta base cache, since the packfile puts related deltas next to each other. By contrast, hash order is effectively random, since the sha1 has no discernible relationship to the content. This patch introduces an "--unordered" option to cat-file which iterates over packs in pack-order under the hood. You can see the results when dumping all of the file content: $ time ./git cat-file --batch-all-objects --buffer --batch | wc -c 6883195596 real 0m44.491s user 0m42.902s sys 0m5.230s $ time ./git cat-file --unordered \ --batch-all-objects --buffer --batch | wc -c 6883195596 real 0m6.075s user 0m4.774s sys 0m3.548s Same output, different order, way faster. The same speed-up applies even if you end up accessing the object content in a different process, like: git cat-file --batch-all-objects --buffer --batch-check | grep blob | git cat-file --batch='%(objectname) %(rest)' | wc -c Adding "--unordered" to the first command drops the runtime in git.git from 24s to 3.5s. Side note: there are actually further speedups available for doing it all in-process now. Since we are outputting the object content during the actual pack iteration, we know where to find the object and could skip the extra lookup done by oid_object_info(). This patch stops short of that optimization since the underlying API isn't ready for us to make those sorts of direct requests. So if --unordered is so much better, why not make it the default? Two reasons: 1. We've promised in the documentation that --batch-all-objects outputs in hash order. Since cat-file is plumbing, people may be relying on that default, and we can't change it. 2. It's actually _slower_ for some cases. We have to compute the pack revindex to walk in pack order. And our de-duplication step uses an oidset, rather than a sort-and-dedup, which can end up being more expensive. If we're just accessing the type and size of each object, for example, like: git cat-file --batch-all-objects --buffer --batch-check my best-of-five warm cache timings go from 900ms to 1100ms using --unordered. Though it's possible in a cold-cache or under memory pressure that we could do better, since we'd have better locality within the packfile. And one final question: why is it "--unordered" and not "--pack-order"? The answer is again two-fold: 1. "pack order" isn't a well-defined thing across the whole set of objects. We're hitting loose objects, as well as objects in multiple packs, and the only ordering we're promising is _within_ a single pack. The rest is apparently random. 2. The point here is optimization. So we don't want to promise any particular ordering, but only to say that we will choose an ordering which is likely to be efficient for accessing the object content. That leaves the door open for further changes in the future without having to add another compatibility option. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- Documentation/git-cat-file.txt | 10 ++++++ builtin/cat-file.c | 56 +++++++++++++++++++++++++++++++--- t/t1006-cat-file.sh | 11 +++++++ 3 files changed, 72 insertions(+), 5 deletions(-) diff --git a/Documentation/git-cat-file.txt b/Documentation/git-cat-file.txt index f90f09b03f..74013335a1 100644 --- a/Documentation/git-cat-file.txt +++ b/Documentation/git-cat-file.txt @@ -104,6 +104,16 @@ OPTIONS buffering; this is much more efficient when invoking `--batch-check` on a large number of objects. +--unordered:: + When `--batch-all-objects` is in use, visit objects in an + order which may be more efficient for accessing the object + contents than hash order. The exact details of the order are + unspecified, but if you do not require a specific order, this + should generally result in faster output, especially with + `--batch`. Note that `cat-file` will still show each object + only once, even if it is stored multiple times in the + repository. + --allow-unknown-type:: Allow -s or -t to query broken/corrupt objects of unknown type. diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 2d34f3b867..45992c9be9 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -21,6 +21,7 @@ struct batch_options { int print_contents; int buffer_output; int all_objects; + int unordered; int cmdmode; /* may be 'w' or 'c' for --filters or --textconv */ const char *format; }; @@ -410,6 +411,7 @@ static void batch_one_object(const char *obj_name, struct batch_options *opt, struct object_cb_data { struct batch_options *opt; struct expand_data *expand; + struct oidset *seen; }; static int batch_object_cb(const struct object_id *oid, void *vdata) @@ -437,6 +439,32 @@ static int collect_packed_object(const struct object_id *oid, return 0; } +static int batch_unordered_object(const struct object_id *oid, void *vdata) +{ + struct object_cb_data *data = vdata; + + if (oidset_contains(data->seen, oid)) + return 0; + oidset_insert(data->seen, oid); + + return batch_object_cb(oid, data); +} + +static int batch_unordered_loose(const struct object_id *oid, + const char *path, + void *data) +{ + return batch_unordered_object(oid, data); +} + +static int batch_unordered_packed(const struct object_id *oid, + struct packed_git *pack, + uint32_t pos, + void *data) +{ + return batch_unordered_object(oid, data); +} + static int batch_objects(struct batch_options *opt) { struct strbuf buf = STRBUF_INIT; @@ -473,19 +501,35 @@ static int batch_objects(struct batch_options *opt) data.info.typep = &data.type; if (opt->all_objects) { - struct oid_array sa = OID_ARRAY_INIT; struct object_cb_data cb; - for_each_loose_object(collect_loose_object, &sa, 0); - for_each_packed_object(collect_packed_object, &sa, 0); if (repository_format_partial_clone) warning("This repository has extensions.partialClone set. Some objects may not be loaded."); cb.opt = opt; cb.expand = &data; - oid_array_for_each_unique(&sa, batch_object_cb, &cb); - oid_array_clear(&sa); + if (opt->unordered) { + struct oidset seen = OIDSET_INIT; + + cb.seen = &seen; + + for_each_loose_object(batch_unordered_loose, &cb, 0); + for_each_packed_object(batch_unordered_packed, &cb, + FOR_EACH_OBJECT_PACK_ORDER); + + oidset_clear(&seen); + } else { + struct oid_array sa = OID_ARRAY_INIT; + + for_each_loose_object(collect_loose_object, &sa, 0); + for_each_packed_object(collect_packed_object, &sa, 0); + + oid_array_for_each_unique(&sa, batch_object_cb, &cb); + + oid_array_clear(&sa); + } + return 0; } @@ -586,6 +630,8 @@ int cmd_cat_file(int argc, const char **argv, const char *prefix) N_("follow in-tree symlinks (used with --batch or --batch-check)")), OPT_BOOL(0, "batch-all-objects", &batch.all_objects, N_("show all objects with --batch or --batch-check")), + OPT_BOOL(0, "unordered", &batch.unordered, + N_("do not order --batch-all-objects output")), OPT_END() }; diff --git a/t/t1006-cat-file.sh b/t/t1006-cat-file.sh index 4fb5e098f2..7f19d591f2 100755 --- a/t/t1006-cat-file.sh +++ b/t/t1006-cat-file.sh @@ -575,4 +575,15 @@ test_expect_success 'cat-file --batch-all-objects shows all objects' ' test_cmp expect actual ' +# The only user-visible difference is that the objects are no longer sorted, +# and the resulting sort order is undefined. So we can only check that it +# produces the same objects as the ordered case, but that at least exercises +# the code. +test_expect_success 'cat-file --unordered works' ' + git -C all-two cat-file --batch-all-objects --unordered \ + --batch-check="%(objectname)" >actual.unsorted && + sort actual && + test_cmp expect actual +' + test_done From ced9fff75dad2578d7583ba3085970b03c66c57b Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 14 Aug 2018 14:14:27 -0400 Subject: [PATCH 08/11] cat-file: use oidset check-and-insert We don't need to check if the oidset has our object before we insert it; that's done as part of the insertion. We can just rely on the return value from oidset_insert(), which saves one hash lookup per object. This measurable speedup is tiny and within the run-to-run noise, but the result is simpler to read, too. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 45992c9be9..04b5cda191 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -443,9 +443,8 @@ static int batch_unordered_object(const struct object_id *oid, void *vdata) { struct object_cb_data *data = vdata; - if (oidset_contains(data->seen, oid)) + if (oidset_insert(data->seen, oid)) return 0; - oidset_insert(data->seen, oid); return batch_object_cb(oid, data); } From 54d2f0d945abac2d8a8a1bcc258db937e597189e Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 14 Aug 2018 14:18:06 -0400 Subject: [PATCH 09/11] cat-file: split batch "buf" into two variables We use the "buf" strbuf for two things: to read incoming lines, and as a scratch space for test-expanding the user-provided format. Let's split this into two variables with descriptive names, which makes their purpose and lifetime more clear. It will also help in a future patch when we start using the "output" buffer for more expansions. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 04b5cda191..3ed1d0be80 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -466,7 +466,8 @@ static int batch_unordered_packed(const struct object_id *oid, static int batch_objects(struct batch_options *opt) { - struct strbuf buf = STRBUF_INIT; + struct strbuf input = STRBUF_INIT; + struct strbuf output = STRBUF_INIT; struct expand_data data; int save_warning; int retval = 0; @@ -481,8 +482,9 @@ static int batch_objects(struct batch_options *opt) */ memset(&data, 0, sizeof(data)); data.mark_query = 1; - strbuf_expand(&buf, opt->format, expand_format, &data); + strbuf_expand(&output, opt->format, expand_format, &data); data.mark_query = 0; + strbuf_release(&output); if (opt->cmdmode) data.split_on_whitespace = 1; @@ -542,14 +544,14 @@ static int batch_objects(struct batch_options *opt) save_warning = warn_on_object_refname_ambiguity; warn_on_object_refname_ambiguity = 0; - while (strbuf_getline(&buf, stdin) != EOF) { + while (strbuf_getline(&input, stdin) != EOF) { if (data.split_on_whitespace) { /* * Split at first whitespace, tying off the beginning * of the string and saving the remainder (or NULL) in * data.rest. */ - char *p = strpbrk(buf.buf, " \t"); + char *p = strpbrk(input.buf, " \t"); if (p) { while (*p && strchr(" \t", *p)) *p++ = '\0'; @@ -557,10 +559,10 @@ static int batch_objects(struct batch_options *opt) data.rest = p; } - batch_one_object(buf.buf, opt, &data); + batch_one_object(input.buf, opt, &data); } - strbuf_release(&buf); + strbuf_release(&input); warn_on_object_refname_ambiguity = save_warning; return retval; } From 79ed0a5e2627a0e1eab0448e6f32d781e80bfafa Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 14 Aug 2018 14:20:22 -0400 Subject: [PATCH 10/11] cat-file: use a single strbuf for all output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we're in batch mode, we end up in batch_object_write() for each object, which allocates its own strbuf for each call. Instead, we can provide a single "scratch" buffer that gets reused for each output. When running: git cat-file --batch-all-objects --batch-check='%(objectname)' on git.git, my best-of-five time drops from: real 0m0.171s user 0m0.159s sys 0m0.012s to: real 0m0.133s user 0m0.121s sys 0m0.012s Note that we could do this just by putting the "scratch" pointer into "struct expand_data", but I chose instead to add an extra parameter to the callstack. That's more verbose, but it makes it a bit more obvious what is going on, which in turn makes it easy to see where we need to be releasing the string in the caller (right after the loop which uses it in each case). Based-on-a-patch-by: René Scharfe Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 3ed1d0be80..08dced2614 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -338,11 +338,11 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d } } -static void batch_object_write(const char *obj_name, struct batch_options *opt, +static void batch_object_write(const char *obj_name, + struct strbuf *scratch, + struct batch_options *opt, struct expand_data *data) { - struct strbuf buf = STRBUF_INIT; - if (!data->skip_object_info && oid_object_info_extended(the_repository, &data->oid, &data->info, OBJECT_INFO_LOOKUP_REPLACE) < 0) { @@ -352,10 +352,10 @@ static void batch_object_write(const char *obj_name, struct batch_options *opt, return; } - strbuf_expand(&buf, opt->format, expand_format, data); - strbuf_addch(&buf, '\n'); - batch_write(opt, buf.buf, buf.len); - strbuf_release(&buf); + strbuf_reset(scratch); + strbuf_expand(scratch, opt->format, expand_format, data); + strbuf_addch(scratch, '\n'); + batch_write(opt, scratch->buf, scratch->len); if (opt->print_contents) { print_object_or_die(opt, data); @@ -363,7 +363,9 @@ static void batch_object_write(const char *obj_name, struct batch_options *opt, } } -static void batch_one_object(const char *obj_name, struct batch_options *opt, +static void batch_one_object(const char *obj_name, + struct strbuf *scratch, + struct batch_options *opt, struct expand_data *data) { struct object_context ctx; @@ -405,20 +407,21 @@ static void batch_one_object(const char *obj_name, struct batch_options *opt, return; } - batch_object_write(obj_name, opt, data); + batch_object_write(obj_name, scratch, opt, data); } struct object_cb_data { struct batch_options *opt; struct expand_data *expand; struct oidset *seen; + struct strbuf *scratch; }; static int batch_object_cb(const struct object_id *oid, void *vdata) { struct object_cb_data *data = vdata; oidcpy(&data->expand->oid, oid); - batch_object_write(NULL, data->opt, data->expand); + batch_object_write(NULL, data->scratch, data->opt, data->expand); return 0; } @@ -509,6 +512,7 @@ static int batch_objects(struct batch_options *opt) cb.opt = opt; cb.expand = &data; + cb.scratch = &output; if (opt->unordered) { struct oidset seen = OIDSET_INIT; @@ -531,6 +535,7 @@ static int batch_objects(struct batch_options *opt) oid_array_clear(&sa); } + strbuf_release(&output); return 0; } @@ -559,10 +564,11 @@ static int batch_objects(struct batch_options *opt) data.rest = p; } - batch_one_object(input.buf, opt, &data); + batch_one_object(input.buf, &output, opt, &data); } strbuf_release(&input); + strbuf_release(&output); warn_on_object_refname_ambiguity = save_warning; return retval; } From 0889aae1cd18c1804ba01c1a4229e516dfb9fe9b Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 14 Aug 2018 14:21:18 -0400 Subject: [PATCH 11/11] for_each_*_object: move declarations to object-store.h The for_each_loose_object() and for_each_packed_object() functions are meant to be part of a unified interface: they use the same set of for_each_object_flags, and it's not inconceivable that we might one day add a single for_each_object() wrapper around them. Let's put them together in a single file, so we can avoid awkwardness like saying "the flags for this function are over in cache.h". Moving the loose functions to packfile.h is silly. Moving the packed functions to cache.h works, but makes the "cache.h is a kitchen sink" problem worse. The best place is the recently-created object-store.h, since these are quite obviously related to object storage. The for_each_*_in_objdir() functions do not use the same flags, but they are logically part of the same interface as for_each_loose_object(), and share callback signatures. So we'll move those, as well, as they also make sense in object-store.h. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/prune-packed.c | 1 + cache.h | 75 ----------------------------------- object-store.h | 90 ++++++++++++++++++++++++++++++++++++++++++ packfile.h | 20 ---------- 4 files changed, 91 insertions(+), 95 deletions(-) diff --git a/builtin/prune-packed.c b/builtin/prune-packed.c index 4ff525e50f..a9e7b552b9 100644 --- a/builtin/prune-packed.c +++ b/builtin/prune-packed.c @@ -3,6 +3,7 @@ #include "progress.h" #include "parse-options.h" #include "packfile.h" +#include "object-store.h" static const char * const prune_packed_usage[] = { N_("git prune-packed [-n | --dry-run] [-q | --quiet]"), diff --git a/cache.h b/cache.h index 6d14702df2..aee36afa54 100644 --- a/cache.h +++ b/cache.h @@ -1575,81 +1575,6 @@ extern int odb_mkstemp(struct strbuf *temp_filename, const char *pattern); */ extern int odb_pack_keep(const char *name); -/* - * Iterate over the files in the loose-object parts of the object - * directory "path", triggering the following callbacks: - * - * - loose_object is called for each loose object we find. - * - * - loose_cruft is called for any files that do not appear to be - * loose objects. Note that we only look in the loose object - * directories "objects/[0-9a-f]{2}/", so we will not report - * "objects/foobar" as cruft. - * - * - loose_subdir is called for each top-level hashed subdirectory - * of the object directory (e.g., "$OBJDIR/f0"). It is called - * after the objects in the directory are processed. - * - * Any callback that is NULL will be ignored. Callbacks returning non-zero - * will end the iteration. - * - * In the "buf" variant, "path" is a strbuf which will also be used as a - * scratch buffer, but restored to its original contents before - * the function returns. - */ -typedef int each_loose_object_fn(const struct object_id *oid, - const char *path, - void *data); -typedef int each_loose_cruft_fn(const char *basename, - const char *path, - void *data); -typedef int each_loose_subdir_fn(unsigned int nr, - const char *path, - void *data); -int for_each_file_in_obj_subdir(unsigned int subdir_nr, - struct strbuf *path, - each_loose_object_fn obj_cb, - each_loose_cruft_fn cruft_cb, - each_loose_subdir_fn subdir_cb, - void *data); -int for_each_loose_file_in_objdir(const char *path, - each_loose_object_fn obj_cb, - each_loose_cruft_fn cruft_cb, - each_loose_subdir_fn subdir_cb, - void *data); -int for_each_loose_file_in_objdir_buf(struct strbuf *path, - each_loose_object_fn obj_cb, - each_loose_cruft_fn cruft_cb, - each_loose_subdir_fn subdir_cb, - void *data); - -/* - * Flags for for_each_*_object(), including for_each_loose below and - * for_each_packed in packfile.h. - */ -enum for_each_object_flags { - /* Iterate only over local objects, not alternates. */ - FOR_EACH_OBJECT_LOCAL_ONLY = (1<<0), - - /* Only iterate over packs obtained from the promisor remote. */ - FOR_EACH_OBJECT_PROMISOR_ONLY = (1<<1), - - /* - * Visit objects within a pack in packfile order rather than .idx order - */ - FOR_EACH_OBJECT_PACK_ORDER = (1<<2), -}; - -/* - * Iterate over all accessible loose objects without respect to - * reachability. By default, this includes both local and alternate objects. - * The order in which objects are visited is unspecified. - * - * Any flags specific to packs are ignored. - */ -int for_each_loose_object(each_loose_object_fn, void *, - enum for_each_object_flags flags); - /* * Set this to 0 to prevent sha1_object_info_extended() from fetching missing * blobs. This has a difference only if extensions.partialClone is set. diff --git a/object-store.h b/object-store.h index e481f7ad41..162156f5dc 100644 --- a/object-store.h +++ b/object-store.h @@ -262,4 +262,94 @@ int oid_object_info_extended(struct repository *r, const struct object_id *, struct object_info *, unsigned flags); +/* + * Iterate over the files in the loose-object parts of the object + * directory "path", triggering the following callbacks: + * + * - loose_object is called for each loose object we find. + * + * - loose_cruft is called for any files that do not appear to be + * loose objects. Note that we only look in the loose object + * directories "objects/[0-9a-f]{2}/", so we will not report + * "objects/foobar" as cruft. + * + * - loose_subdir is called for each top-level hashed subdirectory + * of the object directory (e.g., "$OBJDIR/f0"). It is called + * after the objects in the directory are processed. + * + * Any callback that is NULL will be ignored. Callbacks returning non-zero + * will end the iteration. + * + * In the "buf" variant, "path" is a strbuf which will also be used as a + * scratch buffer, but restored to its original contents before + * the function returns. + */ +typedef int each_loose_object_fn(const struct object_id *oid, + const char *path, + void *data); +typedef int each_loose_cruft_fn(const char *basename, + const char *path, + void *data); +typedef int each_loose_subdir_fn(unsigned int nr, + const char *path, + void *data); +int for_each_file_in_obj_subdir(unsigned int subdir_nr, + struct strbuf *path, + each_loose_object_fn obj_cb, + each_loose_cruft_fn cruft_cb, + each_loose_subdir_fn subdir_cb, + void *data); +int for_each_loose_file_in_objdir(const char *path, + each_loose_object_fn obj_cb, + each_loose_cruft_fn cruft_cb, + each_loose_subdir_fn subdir_cb, + void *data); +int for_each_loose_file_in_objdir_buf(struct strbuf *path, + each_loose_object_fn obj_cb, + each_loose_cruft_fn cruft_cb, + each_loose_subdir_fn subdir_cb, + void *data); + +/* Flags for for_each_*_object() below. */ +enum for_each_object_flags { + /* Iterate only over local objects, not alternates. */ + FOR_EACH_OBJECT_LOCAL_ONLY = (1<<0), + + /* Only iterate over packs obtained from the promisor remote. */ + FOR_EACH_OBJECT_PROMISOR_ONLY = (1<<1), + + /* + * Visit objects within a pack in packfile order rather than .idx order + */ + FOR_EACH_OBJECT_PACK_ORDER = (1<<2), +}; + +/* + * Iterate over all accessible loose objects without respect to + * reachability. By default, this includes both local and alternate objects. + * The order in which objects are visited is unspecified. + * + * Any flags specific to packs are ignored. + */ +int for_each_loose_object(each_loose_object_fn, void *, + enum for_each_object_flags flags); + +/* + * Iterate over all accessible packed objects without respect to reachability. + * By default, this includes both local and alternate packs. + * + * Note that some objects may appear twice if they are found in multiple packs. + * Each pack is visited in an unspecified order. By default, objects within a + * pack are visited in pack-idx order (i.e., sorted by oid). + */ +typedef int each_packed_object_fn(const struct object_id *oid, + struct packed_git *pack, + uint32_t pos, + void *data); +int for_each_object_in_pack(struct packed_git *p, + each_packed_object_fn, void *data, + enum for_each_object_flags flags); +int for_each_packed_object(each_packed_object_fn, void *, + enum for_each_object_flags flags); + #endif /* OBJECT_STORE_H */ diff --git a/packfile.h b/packfile.h index 99411bdd85..d91e43fe73 100644 --- a/packfile.h +++ b/packfile.h @@ -148,26 +148,6 @@ extern int has_object_pack(const struct object_id *oid); extern int has_pack_index(const unsigned char *sha1); -/* - * Iterate over all accessible packed objects without respect to reachability. - * By default, this includes both local and alternate packs. - * - * Note that some objects may appear twice if they are found in multiple packs. - * Each pack is visited in an unspecified order. By default, objects within a - * pack are visited in pack-idx order (i.e., sorted by oid). - * - * The list of flags can be found in cache.h. - */ -typedef int each_packed_object_fn(const struct object_id *oid, - struct packed_git *pack, - uint32_t pos, - void *data); -int for_each_object_in_pack(struct packed_git *p, - each_packed_object_fn, void *data, - enum for_each_object_flags flags); -int for_each_packed_object(each_packed_object_fn, void *, - enum for_each_object_flags flags); - /* * Return 1 if an object in a promisor packfile is or refers to the given * object, 0 otherwise.