From 35a9f1e99c5d31635bb78a4f2d498e72c04fc471 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:23 +0000
Subject: [PATCH 01/18] tree-walk.c: don't match submodule entries for
 'submod/anything'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Submodules should be handled the same as regular directories with
respect to the presence of a trailing slash, i.e. commands like:

  git diff rev1 rev2 -- $path
  git rev-list HEAD -- $path

should produce the same output whether $path is 'submod' or 'submod/'.
This has been fixed in commit 74b4f7f277 (tree-walk.c: ignore trailing
slash on submodule in tree_entry_interesting(), 2014-01-23).

Unfortunately, that commit had the unintended side effect to handle
'submod/anything' the same as 'submod' and 'submod/' as well, e.g.:

  $ git log --oneline --name-only -- sha1collisiondetection/whatever
  4125f78222 sha1dc: update from upstream
  sha1collisiondetection
  07a20f569b Makefile: fix unaligned loads in sha1dc with UBSan
  sha1collisiondetection
  23e37f8e9d sha1dc: update from upstream
  sha1collisiondetection
  86cfd61e6b sha1dc: optionally use sha1collisiondetection as a submodule
  sha1collisiondetection

Fix this by rejecting submodules as partial pathnames when their
trailing slash is followed by anything.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 t/t4010-diff-pathspec.sh | 4 +++-
 tree-walk.c              | 9 ++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/t/t4010-diff-pathspec.sh b/t/t4010-diff-pathspec.sh
index e5ca359edf..65cc703c65 100755
--- a/t/t4010-diff-pathspec.sh
+++ b/t/t4010-diff-pathspec.sh
@@ -125,7 +125,9 @@ test_expect_success 'setup submodules' '
 test_expect_success 'diff-tree ignores trailing slash on submodule path' '
 	git diff --name-only HEAD^ HEAD submod >expect &&
 	git diff --name-only HEAD^ HEAD submod/ >actual &&
-	test_cmp expect actual
+	test_cmp expect actual &&
+	git diff --name-only HEAD^ HEAD -- submod/whatever >actual &&
+	test_must_be_empty actual
 '
 
 test_expect_success 'diff multiple wildcard pathspecs' '
diff --git a/tree-walk.c b/tree-walk.c
index bb0ad34c54..0160294712 100644
--- a/tree-walk.c
+++ b/tree-walk.c
@@ -851,7 +851,14 @@ static int match_entry(const struct pathspec_item *item,
 	if (matchlen > pathlen) {
 		if (match[pathlen] != '/')
 			return 0;
-		if (!S_ISDIR(entry->mode) && !S_ISGITLINK(entry->mode))
+		/*
+		 * Reject non-directories as partial pathnames, except
+		 * when match is a submodule with a trailing slash and
+		 * nothing else (to handle 'submod/' and 'submod'
+		 * uniformly).
+		 */
+		if (!S_ISDIR(entry->mode) &&
+		    (!S_ISGITLINK(entry->mode) || matchlen > pathlen + 1))
 			return 0;
 	}
 

From cb9daf16db0e9935179fd995785e94c996267268 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:24 +0000
Subject: [PATCH 02/18] commit-graph: fix parsing the Chunk Lookup table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The commit-graph file format specifies that the chunks may be in any
order.  However, if the OID Lookup chunk happens to be the last one in
the file, then any command attempting to access the commit-graph data
will fail with:

  fatal: invalid commit position. commit-graph is likely corrupt

In this case the error is wrong, the commit-graph file does conform to
the specification, but the parsing of the Chunk Lookup table is a bit
buggy, and leaves the field holding the number of commits in the
commit-graph zero-initialized.

The number of commits in the commit-graph is determined while parsing
the Chunk Lookup table, by dividing the size of the OID Lookup chunk
with the hash size.  However, the Chunk Lookup table doesn't actually
store the size of the chunks, but it stores their starting offset.
Consequently, the size of a chunk can only be calculated by
subtracting the starting offsets of that chunk from the offset of the
subsequent chunk, or in case of the last chunk from the offset
recorded in the terminating label.  This is currenly implemented in a
bit complicated way: as we iterate over the entries of the Chunk
Lookup table, we check the ID of each chunk and store its starting
offset, then we check the ID of the last seen chunk and calculate its
size using its previously saved offset if necessary (at the moment
it's only necessary for the OID Lookup chunk).  Alas, while parsing
the Chunk Lookup table we only interate through the "real" chunks, but
never look at the terminating label, thus don't even check whether
it's necessary to calulate the size of the last chunk.  Consequently,
if the OID Lookup chunk is the last one, then we don't calculate its
size and turn don't run the piece of code determining the number of
commits in the commit graph, leaving the field holding that number
unchanged (i.e. zero-initialized), eventually triggering the sanity
check in load_oid_from_graph().

Fix this by iterating through all entries in the Chunk Lookup table,
including the terminating label.

Note that this is the minimal fix, suitable for the maintenance track.
A better fix would be to simplify how the chunk sizes are calculated,
but that is a more invasive change, less suitable for 'maint', so that
will be done in later patches.

This additional flexibility of scanning more chunks breaks a test for
"git commit-graph verify" so alter that test to mutate the commit-graph
to have an even lower chunk count.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c          | 2 +-
 t/t5318-commit-graph.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 196e817a84..7807d94562 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -277,7 +277,7 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 	last_chunk_id = 0;
 	last_chunk_offset = 8;
 	chunk_lookup = data + 8;
-	for (i = 0; i < graph->num_chunks; i++) {
+	for (i = 0; i <= graph->num_chunks; i++) {
 		uint32_t chunk_id;
 		uint64_t chunk_offset;
 		int chunk_repeated = 0;
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 18304a65e4..79e7fbcd40 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -488,7 +488,7 @@ test_expect_success 'detect bad hash version' '
 '
 
 test_expect_success 'detect low chunk count' '
-	corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\02" \
+	corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\01" \
 		"missing the .* chunk"
 '
 

From 6141cdfdcbe9e3edf25467582c5f32658ae79f40 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:25 +0000
Subject: [PATCH 03/18] commit-graph-format.txt: all multi-byte numbers are in
 network byte order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The commit-graph format specifies that "All 4-byte numbers are in
network order", but the commit-graph contains 8-byte integers as well
(file offsets in the Chunk Lookup table), and their byte order is
unspecified.

Clarify that all multi-byte integers are in network byte order.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Documentation/technical/commit-graph-format.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/technical/commit-graph-format.txt b/Documentation/technical/commit-graph-format.txt
index 1beef17182..440541045d 100644
--- a/Documentation/technical/commit-graph-format.txt
+++ b/Documentation/technical/commit-graph-format.txt
@@ -32,7 +32,7 @@ the body into "chunks" and provide a binary lookup table at the beginning
 of the body. The header includes certain values, such as number of chunks
 and hash type.
 
-All 4-byte numbers are in network order.
+All multi-byte numbers are in network byte order.
 
 HEADER:
 

From 1df15f8dee5c637c9013d11d5b8e72e189a04f06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:26 +0000
Subject: [PATCH 04/18] commit-slab: add a function to deep free entries on the
 slab
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

clear_##slabname() frees only the memory allocated for a commit slab
itself, but entries in the commit slab might own additional memory
outside the slab that should be freed as well.  We already have (at
least) one such commit slab, and this patch series is about to add one
more.

To free all additional memory owned by entries on the commit slab the
user of such a slab could iterate over all commits it knows about,
peek whether there is a valid entry associated with each commit, and
free the additional memory, if any.  Or it could rely on intimate
knowledge about the internals of the commit slab implementation, and
could itself iterate directly through all entries in the slab, and
free the additional memory.  Or it could just leak the additional
memory...

Introduce deep_clear_##slabname() to allow releasing memory owned by
commit slab entries by invoking the 'void free_fn(elemtype *ptr)'
function specified as parameter for each entry in the slab.

Use it in get_shallow_commits() in 'shallow.c' to replace an
open-coded iteration over a commit slab's entries.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-slab-decl.h |  1 +
 commit-slab-impl.h | 13 +++++++++++++
 commit-slab.h      | 10 ++++++++++
 shallow.c          | 14 +++++---------
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/commit-slab-decl.h b/commit-slab-decl.h
index adc7b46c83..286164b7e2 100644
--- a/commit-slab-decl.h
+++ b/commit-slab-decl.h
@@ -32,6 +32,7 @@ struct slabname {							\
 void init_ ##slabname## _with_stride(struct slabname *s, unsigned stride); \
 void init_ ##slabname(struct slabname *s);				\
 void clear_ ##slabname(struct slabname *s);				\
+void deep_clear_ ##slabname(struct slabname *s, void (*free_fn)(elemtype *ptr)); \
 elemtype *slabname## _at_peek(struct slabname *s, const struct commit *c, int add_if_missing); \
 elemtype *slabname## _at(struct slabname *s, const struct commit *c);	\
 elemtype *slabname## _peek(struct slabname *s, const struct commit *c)
diff --git a/commit-slab-impl.h b/commit-slab-impl.h
index 5c0eb91a5d..557738df27 100644
--- a/commit-slab-impl.h
+++ b/commit-slab-impl.h
@@ -38,6 +38,19 @@ scope void clear_ ##slabname(struct slabname *s)			\
 	FREE_AND_NULL(s->slab);						\
 }									\
 									\
+scope void deep_clear_ ##slabname(struct slabname *s, void (*free_fn)(elemtype *)) \
+{									\
+	unsigned int i;							\
+	for (i = 0; i < s->slab_count; i++) {				\
+		unsigned int j;						\
+		if (!s->slab[i])					\
+			continue;					\
+		for (j = 0; j < s->slab_size; j++)			\
+			free_fn(&s->slab[i][j * s->stride]);		\
+	}								\
+	clear_ ##slabname(s);						\
+}									\
+									\
 scope elemtype *slabname## _at_peek(struct slabname *s,			\
 						  const struct commit *c, \
 						  int add_if_missing)   \
diff --git a/commit-slab.h b/commit-slab.h
index 05b3f2804e..8e72a30536 100644
--- a/commit-slab.h
+++ b/commit-slab.h
@@ -47,6 +47,16 @@
  *
  *   Call this function before the slab falls out of scope to avoid
  *   leaking memory.
+ *
+ * - void deep_clear_indegree(struct indegree *, void (*free_fn)(int*))
+ *
+ *   Empties the slab, similar to clear_indegree(), but in addition it
+ *   calls the given 'free_fn' for each slab entry to release any
+ *   additional memory that might be owned by the entry (but not the
+ *   entry itself!).
+ *   Note that 'free_fn' might be called even for entries for which no
+ *   indegree_at() call has been made; in this case 'free_fn' is invoked
+ *   with a pointer to a zero-initialized location.
  */
 
 #define define_commit_slab(slabname, elemtype) \
diff --git a/shallow.c b/shallow.c
index 7fd04afed1..c4ac8a7327 100644
--- a/shallow.c
+++ b/shallow.c
@@ -84,6 +84,10 @@ int is_repository_shallow(struct repository *r)
  * supports a "valid" flag.
  */
 define_commit_slab(commit_depth, int *);
+static void free_depth_in_slab(int **ptr)
+{
+	FREE_AND_NULL(*ptr);
+}
 struct commit_list *get_shallow_commits(struct object_array *heads, int depth,
 		int shallow_flag, int not_shallow_flag)
 {
@@ -150,15 +154,7 @@ struct commit_list *get_shallow_commits(struct object_array *heads, int depth,
 			}
 		}
 	}
-	for (i = 0; i < depths.slab_count; i++) {
-		int j;
-
-		if (!depths.slab[i])
-			continue;
-		for (j = 0; j < depths.slab_size; j++)
-			free(depths.slab[i][j]);
-	}
-	clear_commit_depth(&depths);
+	deep_clear_commit_depth(&depths, free_depth_in_slab);
 
 	return result;
 }

From 0ee3cb888dc4da900b2e701ef8106205d8963ffb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:27 +0000
Subject: [PATCH 05/18] diff.h: drop diff_tree_oid() & friends' return value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ll_diff_tree_oid() has only ever returned 0 [1], so it's return value
is basically useless.  It's only caller diff_tree_oid() has only ever
returned the return value of ll_diff_tree_oid() as-is [2], so its
return value is just as useless.  Most of diff_tree_oid()'s callers
simply ignore its return value, except:

  - diff_root_tree_oid() is a thin wrapper around diff_tree_oid() and
    returns with its return value, but all of diff_root_tree_oid()'s
    callers ignore its return value.

  - rev_compare_tree() and rev_same_tree_as_empty() do look at the
    return value in a condition, but, since the return value is always
    0, the former's < 0 condition is never fulfilled, while the
    latter's >= 0 condition is always fulfilled.

So let's drop the return value of ll_diff_tree_oid(), diff_tree_oid()
and diff_root_tree_oid(), and drop those conditions from
rev_compare_tree() and rev_same_tree_as_empty() as well.

[1] ll_diff_tree_oid() and its ancestors have been returning only 0
    ever since it was introduced as diff_tree() in 9174026cfe (Add
    "diff-tree" program to show which files have changed between two
    trees., 2005-04-09).
[2] diff_tree_oid() traces back to diff-tree.c:main() in 9174026cfe as
    well.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 diff.h      | 10 +++++-----
 revision.c  |  9 +++------
 tree-diff.c | 30 ++++++++++++++----------------
 3 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/diff.h b/diff.h
index 9443dc1b00..e0c0af6286 100644
--- a/diff.h
+++ b/diff.h
@@ -431,11 +431,11 @@ struct combine_diff_path *diff_tree_paths(
 	struct combine_diff_path *p, const struct object_id *oid,
 	const struct object_id **parents_oid, int nparent,
 	struct strbuf *base, struct diff_options *opt);
-int diff_tree_oid(const struct object_id *old_oid,
-		  const struct object_id *new_oid,
-		  const char *base, struct diff_options *opt);
-int diff_root_tree_oid(const struct object_id *new_oid, const char *base,
-		       struct diff_options *opt);
+void diff_tree_oid(const struct object_id *old_oid,
+		   const struct object_id *new_oid,
+		   const char *base, struct diff_options *opt);
+void diff_root_tree_oid(const struct object_id *new_oid, const char *base,
+			struct diff_options *opt);
 
 struct combine_diff_path {
 	struct combine_diff_path *next;
diff --git a/revision.c b/revision.c
index cbf4b61aa6..c644c66091 100644
--- a/revision.c
+++ b/revision.c
@@ -791,9 +791,7 @@ static int rev_compare_tree(struct rev_info *revs,
 
 	tree_difference = REV_TREE_SAME;
 	revs->pruning.flags.has_changes = 0;
-	if (diff_tree_oid(&t1->object.oid, &t2->object.oid, "",
-			   &revs->pruning) < 0)
-		return REV_TREE_DIFFERENT;
+	diff_tree_oid(&t1->object.oid, &t2->object.oid, "", &revs->pruning);
 
 	if (!nth_parent)
 		if (bloom_ret == 1 && tree_difference == REV_TREE_SAME)
@@ -804,7 +802,6 @@ static int rev_compare_tree(struct rev_info *revs,
 
 static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit)
 {
-	int retval;
 	struct tree *t1 = get_commit_tree(commit);
 
 	if (!t1)
@@ -812,9 +809,9 @@ static int rev_same_tree_as_empty(struct rev_info *revs, struct commit *commit)
 
 	tree_difference = REV_TREE_SAME;
 	revs->pruning.flags.has_changes = 0;
-	retval = diff_tree_oid(NULL, &t1->object.oid, "", &revs->pruning);
+	diff_tree_oid(NULL, &t1->object.oid, "", &revs->pruning);
 
-	return retval >= 0 && (tree_difference == REV_TREE_SAME);
+	return tree_difference == REV_TREE_SAME;
 }
 
 struct treesame_state {
diff --git a/tree-diff.c b/tree-diff.c
index f3d303c6e5..6ebad1a46f 100644
--- a/tree-diff.c
+++ b/tree-diff.c
@@ -29,9 +29,9 @@ static struct combine_diff_path *ll_diff_tree_paths(
 	struct combine_diff_path *p, const struct object_id *oid,
 	const struct object_id **parents_oid, int nparent,
 	struct strbuf *base, struct diff_options *opt);
-static int ll_diff_tree_oid(const struct object_id *old_oid,
-			    const struct object_id *new_oid,
-			    struct strbuf *base, struct diff_options *opt);
+static void ll_diff_tree_oid(const struct object_id *old_oid,
+			     const struct object_id *new_oid,
+			     struct strbuf *base, struct diff_options *opt);
 
 /*
  * Compare two tree entries, taking into account only path/S_ISDIR(mode),
@@ -679,9 +679,9 @@ static void try_to_follow_renames(const struct object_id *old_oid,
 	q->nr = 1;
 }
 
-static int ll_diff_tree_oid(const struct object_id *old_oid,
-			    const struct object_id *new_oid,
-			    struct strbuf *base, struct diff_options *opt)
+static void ll_diff_tree_oid(const struct object_id *old_oid,
+			     const struct object_id *new_oid,
+			     struct strbuf *base, struct diff_options *opt)
 {
 	struct combine_diff_path phead, *p;
 	pathchange_fn_t pathchange_old = opt->pathchange;
@@ -697,29 +697,27 @@ static int ll_diff_tree_oid(const struct object_id *old_oid,
 	}
 
 	opt->pathchange = pathchange_old;
-	return 0;
 }
 
-int diff_tree_oid(const struct object_id *old_oid,
-		  const struct object_id *new_oid,
-		  const char *base_str, struct diff_options *opt)
+void diff_tree_oid(const struct object_id *old_oid,
+		   const struct object_id *new_oid,
+		   const char *base_str, struct diff_options *opt)
 {
 	struct strbuf base;
-	int retval;
 
 	strbuf_init(&base, PATH_MAX);
 	strbuf_addstr(&base, base_str);
 
-	retval = ll_diff_tree_oid(old_oid, new_oid, &base, opt);
+	ll_diff_tree_oid(old_oid, new_oid, &base, opt);
 	if (!*base_str && opt->flags.follow_renames && diff_might_be_rename())
 		try_to_follow_renames(old_oid, new_oid, &base, opt);
 
 	strbuf_release(&base);
-
-	return retval;
 }
 
-int diff_root_tree_oid(const struct object_id *new_oid, const char *base, struct diff_options *opt)
+void diff_root_tree_oid(const struct object_id *new_oid,
+			const char *base,
+			struct diff_options *opt)
 {
-	return diff_tree_oid(NULL, new_oid, base, opt);
+	diff_tree_oid(NULL, new_oid, base, opt);
 }

From fa7965309e709e6dcd59c38fc6743d17d82d538b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:28 +0000
Subject: [PATCH 06/18] commit-graph: clean up #includes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our CodingGuidelines says that it's sufficient to include one of
'git-compat-util.h' and 'cache.h', but both 'commit-graph.c' and
'commit-graph.h' include both.  Let's include only 'git-compat-util.h'
to loose a bunch of unnecessary dependencies; but include 'hash.h',
because 'commit-graph.h' does require the definition of 'struct
object_id'.

'commit-graph.h' explicitly includes 'repository.h' and
'string-list.h', but only needs the declaration of a few structs from
them.  Drop these includes and forward-declare the necessary structs
instead.

'commit-graph.c' includes 'dir.h', but doesn't actually use anything
from there, so let's drop that #include as well.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 4 +---
 commit-graph.h | 6 +++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 7807d94562..6ed649388d 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1,7 +1,5 @@
-#include "cache.h"
-#include "config.h"
-#include "dir.h"
 #include "git-compat-util.h"
+#include "config.h"
 #include "lockfile.h"
 #include "pack.h"
 #include "packfile.h"
diff --git a/commit-graph.h b/commit-graph.h
index 39484482cc..881c9b46e5 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -2,9 +2,6 @@
 #define COMMIT_GRAPH_H
 
 #include "git-compat-util.h"
-#include "repository.h"
-#include "string-list.h"
-#include "cache.h"
 #include "object-store.h"
 
 #define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH"
@@ -22,6 +19,9 @@ void git_test_write_commit_graph_or_die(void);
 
 struct commit;
 struct bloom_filter_settings;
+struct repository;
+struct raw_object_store;
+struct string_list;
 
 char *get_commit_graph_filename(struct object_directory *odb);
 int open_commit_graph(const char *graph_file, int *fd, struct stat *st);

From 2ad4f1a7c43cac4ba52cff56f9090d3322c67224 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:29 +0000
Subject: [PATCH 07/18] commit-graph: simplify parse_commit_graph() #1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While we iterate over all entries of the Chunk Lookup table we make
sure that we don't attempt to read past the end of the mmap-ed
commit-graph file, and check in each iteration that the chunk ID and
offset we are about to read is still within the mmap-ed memory region.
However, these checks in each iteration are not really necessary,
because the number of chunks in the commit-graph file is already known
before this loop from the just parsed commit-graph header.

So let's check that the commit-graph file is large enough for all
entries in the Chunk Lookup table before we start iterating over those
entries, and drop those per-iteration checks.  While at it, take into
account the size of everything that is necessary to have a valid
commit-graph file, i.e. the size of the header, the size of the
mandatory OID Fanout chunk, and the size of the signature in the
trailer as well.

Note that this necessitates the change of the error message as well,
and, consequently, have to update the 'detect incorrect chunk count'
test in 't5318-commit-graph.sh' as well.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c          | 16 +++++++++-------
 t/t5318-commit-graph.sh |  3 ++-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 6ed649388d..9927762f18 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -272,6 +272,15 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 	graph->data = graph_map;
 	graph->data_len = graph_size;
 
+	if (graph_size < GRAPH_HEADER_SIZE +
+			 (graph->num_chunks + 1) * GRAPH_CHUNKLOOKUP_WIDTH +
+			 GRAPH_FANOUT_SIZE + the_hash_algo->rawsz) {
+		error(_("commit-graph file is too small to hold %u chunks"),
+		      graph->num_chunks);
+		free(graph);
+		return NULL;
+	}
+
 	last_chunk_id = 0;
 	last_chunk_offset = 8;
 	chunk_lookup = data + 8;
@@ -280,13 +289,6 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 		uint64_t chunk_offset;
 		int chunk_repeated = 0;
 
-		if (data + graph_size - chunk_lookup <
-		    GRAPH_CHUNKLOOKUP_WIDTH) {
-			error(_("commit-graph chunk lookup table entry missing; file may be incomplete"));
-			free(graph);
-			return NULL;
-		}
-
 		chunk_id = get_be32(chunk_lookup + 0);
 		chunk_offset = get_be64(chunk_lookup + 4);
 
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 79e7fbcd40..1073f9e3cf 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -574,7 +574,8 @@ test_expect_success 'detect invalid checksum hash' '
 
 test_expect_success 'detect incorrect chunk count' '
 	corrupt_graph_and_verify $GRAPH_BYTE_CHUNK_COUNT "\377" \
-		"chunk lookup table entry missing" $GRAPH_CHUNK_LOOKUP_OFFSET
+		"commit-graph file is too small to hold [0-9]* chunks" \
+		$GRAPH_CHUNK_LOOKUP_OFFSET
 '
 
 test_expect_success 'git fsck (checks commit-graph)' '

From 5cfa438a76f7f67f34f6f758b62943261ded1fb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:30 +0000
Subject: [PATCH 08/18] commit-graph: simplify parse_commit_graph() #2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Chunk Lookup table stores the chunks' starting offset in the
commit-graph file, not their sizes.  Consequently, the size of a chunk
can only be calculated by subtracting its offset from the offset of
the subsequent chunk (or that of the terminating label).  This is
currenly implemented in a bit complicated way: as we iterate over the
entries of the Chunk Lookup table, we check the id of each chunk and
store its starting offset, then we check the id of the last seen chunk
and calculate its size using its previously saved offset.  At the
moment there is only one chunk for which we calculate its size, but
this patch series will add more, and the repeated chunk id checks are
not that pretty.

Instead let's read ahead the offset of the next chunk on each
iteration, so we can calculate the size of each chunk right away,
right where we store its starting offset.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 9927762f18..84206f0f51 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -230,8 +230,7 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 	const unsigned char *data, *chunk_lookup;
 	uint32_t i;
 	struct commit_graph *graph;
-	uint64_t last_chunk_offset;
-	uint32_t last_chunk_id;
+	uint64_t next_chunk_offset;
 	uint32_t graph_signature;
 	unsigned char graph_version, hash_version;
 
@@ -281,18 +280,17 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 		return NULL;
 	}
 
-	last_chunk_id = 0;
-	last_chunk_offset = 8;
 	chunk_lookup = data + 8;
-	for (i = 0; i <= graph->num_chunks; i++) {
+	next_chunk_offset = get_be64(chunk_lookup + 4);
+	for (i = 0; i < graph->num_chunks; i++) {
 		uint32_t chunk_id;
-		uint64_t chunk_offset;
+		uint64_t chunk_offset = next_chunk_offset;
 		int chunk_repeated = 0;
 
 		chunk_id = get_be32(chunk_lookup + 0);
-		chunk_offset = get_be64(chunk_lookup + 4);
 
 		chunk_lookup += GRAPH_CHUNKLOOKUP_WIDTH;
+		next_chunk_offset = get_be64(chunk_lookup + 4);
 
 		if (chunk_offset > graph_size - the_hash_algo->rawsz) {
 			error(_("commit-graph improper chunk offset %08x%08x"), (uint32_t)(chunk_offset >> 32),
@@ -312,8 +310,11 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 		case GRAPH_CHUNKID_OIDLOOKUP:
 			if (graph->chunk_oid_lookup)
 				chunk_repeated = 1;
-			else
+			else {
 				graph->chunk_oid_lookup = data + chunk_offset;
+				graph->num_commits = (next_chunk_offset - chunk_offset)
+						     / graph->hash_len;
+			}
 			break;
 
 		case GRAPH_CHUNKID_DATA:
@@ -368,15 +369,6 @@ struct commit_graph *parse_commit_graph(void *graph_map, int fd,
 			free(graph);
 			return NULL;
 		}
-
-		if (last_chunk_id == GRAPH_CHUNKID_OIDLOOKUP)
-		{
-			graph->num_commits = (chunk_offset - last_chunk_offset)
-					     / graph->hash_len;
-		}
-
-		last_chunk_id = chunk_id;
-		last_chunk_offset = chunk_offset;
 	}
 
 	if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) {

From bb4d60e5d536009e0c97bfa7f464416c29c04c81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:31 +0000
Subject: [PATCH 09/18] commit-graph: simplify write_commit_graph_file() #1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In write_commit_graph_file() one block of code fills the array of
chunk IDs, another block of code fills the array of chunk offsets,
then the chunk IDs and offsets are written to the Chunk Lookup table,
and finally a third block of code writes the actual chunks.  In case
of optional chunks like Extra Edge List and Base Graphs List there is
also a condition checking whether that chunk is necessary/desired, and
that same condition is repeated in all those three blocks of code.
This patch series is about to add more optional chunks, so there would
be even more repeated conditions.

Those chunk offsets are relative to the beginning of the file, so they
inherently depend on the size of the Chunk Lookup table, which in turn
depends on the number of chunks that are to be written to the
commit-graph file.  IOW at the time we set the first chunk's ID we
can't yet know its offset, because we don't yet know how many chunks
there are.

Simplify this by initially filling an array of chunk sizes, not
offsets, and calculate the offsets based on the chunk sizes only
later, while we are writing the Chunk Lookup table.  This way we can
fill the arrays of chunk IDs and sizes in one go, eliminating one set
of repeated conditions.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 46 +++++++++++++++++-----------------------------
 1 file changed, 17 insertions(+), 29 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 84206f0f51..79cddabcd1 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1529,10 +1529,11 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	struct hashfile *f;
 	struct lock_file lk = LOCK_INIT;
 	uint32_t chunk_ids[MAX_NUM_CHUNKS + 1];
-	uint64_t chunk_offsets[MAX_NUM_CHUNKS + 1];
+	uint64_t chunk_sizes[MAX_NUM_CHUNKS + 1];
 	const unsigned hashsz = the_hash_algo->rawsz;
 	struct strbuf progress_title = STRBUF_INIT;
 	int num_chunks = 3;
+	uint64_t chunk_offset;
 	struct object_id file_hash;
 	const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 
@@ -1573,50 +1574,34 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	}
 
 	chunk_ids[0] = GRAPH_CHUNKID_OIDFANOUT;
+	chunk_sizes[0] = GRAPH_FANOUT_SIZE;
 	chunk_ids[1] = GRAPH_CHUNKID_OIDLOOKUP;
+	chunk_sizes[1] = hashsz * ctx->commits.nr;
 	chunk_ids[2] = GRAPH_CHUNKID_DATA;
+	chunk_sizes[2] = (hashsz + 16) * ctx->commits.nr;
+
 	if (ctx->num_extra_edges) {
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
+		chunk_sizes[num_chunks] = 4 * ctx->num_extra_edges;
 		num_chunks++;
 	}
 	if (ctx->changed_paths) {
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
+		chunk_sizes[num_chunks] = sizeof(uint32_t) * ctx->commits.nr;
 		num_chunks++;
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
+		chunk_sizes[num_chunks] = sizeof(uint32_t) * 3
+					  + ctx->total_bloom_filter_data_size;
 		num_chunks++;
 	}
 	if (ctx->num_commit_graphs_after > 1) {
 		chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
+		chunk_sizes[num_chunks] = hashsz * (ctx->num_commit_graphs_after - 1);
 		num_chunks++;
 	}
 
 	chunk_ids[num_chunks] = 0;
-
-	chunk_offsets[0] = 8 + (num_chunks + 1) * GRAPH_CHUNKLOOKUP_WIDTH;
-	chunk_offsets[1] = chunk_offsets[0] + GRAPH_FANOUT_SIZE;
-	chunk_offsets[2] = chunk_offsets[1] + hashsz * ctx->commits.nr;
-	chunk_offsets[3] = chunk_offsets[2] + (hashsz + 16) * ctx->commits.nr;
-
-	num_chunks = 3;
-	if (ctx->num_extra_edges) {
-		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
-						4 * ctx->num_extra_edges;
-		num_chunks++;
-	}
-	if (ctx->changed_paths) {
-		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
-						sizeof(uint32_t) * ctx->commits.nr;
-		num_chunks++;
-
-		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
-						sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size;
-		num_chunks++;
-	}
-	if (ctx->num_commit_graphs_after > 1) {
-		chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
-						hashsz * (ctx->num_commit_graphs_after - 1);
-		num_chunks++;
-	}
+	chunk_sizes[num_chunks] = 0;
 
 	hashwrite_be32(f, GRAPH_SIGNATURE);
 
@@ -1625,13 +1610,16 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	hashwrite_u8(f, num_chunks);
 	hashwrite_u8(f, ctx->num_commit_graphs_after - 1);
 
+	chunk_offset = 8 + (num_chunks + 1) * GRAPH_CHUNKLOOKUP_WIDTH;
 	for (i = 0; i <= num_chunks; i++) {
 		uint32_t chunk_write[3];
 
 		chunk_write[0] = htonl(chunk_ids[i]);
-		chunk_write[1] = htonl(chunk_offsets[i] >> 32);
-		chunk_write[2] = htonl(chunk_offsets[i] & 0xffffffff);
+		chunk_write[1] = htonl(chunk_offset >> 32);
+		chunk_write[2] = htonl(chunk_offset & 0xffffffff);
 		hashwrite(f, chunk_write, 12);
+
+		chunk_offset += chunk_sizes[i];
 	}
 
 	if (ctx->report_progress) {

From 7fbfe07ab4d4e58c0971dac73001b89f180a0af3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Fri, 5 Jun 2020 13:00:32 +0000
Subject: [PATCH 10/18] commit-graph: simplify write_commit_graph_file() #2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unify the 'chunk_ids' and 'chunk_sizes' arrays into an array of
'struct chunk_info'.  This will allow more cleanups in the following
patches.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 79cddabcd1..887837e882 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1522,14 +1522,18 @@ static int write_graph_chunk_base(struct hashfile *f,
 	return 0;
 }
 
+struct chunk_info {
+	uint32_t id;
+	uint64_t size;
+};
+
 static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 {
 	uint32_t i;
 	int fd;
 	struct hashfile *f;
 	struct lock_file lk = LOCK_INIT;
-	uint32_t chunk_ids[MAX_NUM_CHUNKS + 1];
-	uint64_t chunk_sizes[MAX_NUM_CHUNKS + 1];
+	struct chunk_info chunks[MAX_NUM_CHUNKS + 1];
 	const unsigned hashsz = the_hash_algo->rawsz;
 	struct strbuf progress_title = STRBUF_INIT;
 	int num_chunks = 3;
@@ -1573,35 +1577,34 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 		f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
 	}
 
-	chunk_ids[0] = GRAPH_CHUNKID_OIDFANOUT;
-	chunk_sizes[0] = GRAPH_FANOUT_SIZE;
-	chunk_ids[1] = GRAPH_CHUNKID_OIDLOOKUP;
-	chunk_sizes[1] = hashsz * ctx->commits.nr;
-	chunk_ids[2] = GRAPH_CHUNKID_DATA;
-	chunk_sizes[2] = (hashsz + 16) * ctx->commits.nr;
-
+	chunks[0].id = GRAPH_CHUNKID_OIDFANOUT;
+	chunks[0].size = GRAPH_FANOUT_SIZE;
+	chunks[1].id = GRAPH_CHUNKID_OIDLOOKUP;
+	chunks[1].size = hashsz * ctx->commits.nr;
+	chunks[2].id = GRAPH_CHUNKID_DATA;
+	chunks[2].size = (hashsz + 16) * ctx->commits.nr;
 	if (ctx->num_extra_edges) {
-		chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
-		chunk_sizes[num_chunks] = 4 * ctx->num_extra_edges;
+		chunks[num_chunks].id = GRAPH_CHUNKID_EXTRAEDGES;
+		chunks[num_chunks].size = 4 * ctx->num_extra_edges;
 		num_chunks++;
 	}
 	if (ctx->changed_paths) {
-		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
-		chunk_sizes[num_chunks] = sizeof(uint32_t) * ctx->commits.nr;
+		chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMINDEXES;
+		chunks[num_chunks].size = sizeof(uint32_t) * ctx->commits.nr;
 		num_chunks++;
-		chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
-		chunk_sizes[num_chunks] = sizeof(uint32_t) * 3
+		chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMDATA;
+		chunks[num_chunks].size = sizeof(uint32_t) * 3
 					  + ctx->total_bloom_filter_data_size;
 		num_chunks++;
 	}
 	if (ctx->num_commit_graphs_after > 1) {
-		chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
-		chunk_sizes[num_chunks] = hashsz * (ctx->num_commit_graphs_after - 1);
+		chunks[num_chunks].id = GRAPH_CHUNKID_BASE;
+		chunks[num_chunks].size = hashsz * (ctx->num_commit_graphs_after - 1);
 		num_chunks++;
 	}
 
-	chunk_ids[num_chunks] = 0;
-	chunk_sizes[num_chunks] = 0;
+	chunks[num_chunks].id = 0;
+	chunks[num_chunks].size = 0;
 
 	hashwrite_be32(f, GRAPH_SIGNATURE);
 
@@ -1614,12 +1617,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	for (i = 0; i <= num_chunks; i++) {
 		uint32_t chunk_write[3];
 
-		chunk_write[0] = htonl(chunk_ids[i]);
+		chunk_write[0] = htonl(chunks[i].id);
 		chunk_write[1] = htonl(chunk_offset >> 32);
 		chunk_write[2] = htonl(chunk_offset & 0xffffffff);
 		hashwrite(f, chunk_write, 12);
 
-		chunk_offset += chunk_sizes[i];
+		chunk_offset += chunks[i].size;
 	}
 
 	if (ctx->report_progress) {

From 0f8ee206951ad25ea652ac9b00455df22e825c45 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Mon, 15 Jun 2020 20:14:46 +0000
Subject: [PATCH 11/18] commit-graph: place bloom_settings in context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Place an instance of struct bloom_settings into the struct
write_commit_graph_context. This allows simplifying the function
prototype of write_graph_chunk_bloom_data(). This will allow us
to combine the function prototypes and use function pointers to
simplify write_commit_graph_file().

Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 887837e882..05b7035d8d 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -882,6 +882,7 @@ struct write_commit_graph_context {
 
 	const struct split_commit_graph_opts *split_opts;
 	size_t total_bloom_filter_data_size;
+	struct bloom_filter_settings bloom_settings;
 };
 
 static void write_graph_chunk_fanout(struct hashfile *f,
@@ -1103,8 +1104,7 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f,
 }
 
 static void write_graph_chunk_bloom_data(struct hashfile *f,
-					 struct write_commit_graph_context *ctx,
-					 const struct bloom_filter_settings *settings)
+					 struct write_commit_graph_context *ctx)
 {
 	struct commit **list = ctx->commits.list;
 	struct commit **last = ctx->commits.list + ctx->commits.nr;
@@ -1116,9 +1116,9 @@ static void write_graph_chunk_bloom_data(struct hashfile *f,
 			_("Writing changed paths Bloom filters data"),
 			ctx->commits.nr);
 
-	hashwrite_be32(f, settings->hash_version);
-	hashwrite_be32(f, settings->num_hashes);
-	hashwrite_be32(f, settings->bits_per_entry);
+	hashwrite_be32(f, ctx->bloom_settings.hash_version);
+	hashwrite_be32(f, ctx->bloom_settings.num_hashes);
+	hashwrite_be32(f, ctx->bloom_settings.bits_per_entry);
 
 	while (list < last) {
 		struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
@@ -1541,6 +1541,8 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 	struct object_id file_hash;
 	const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 
+	ctx->bloom_settings = bloom_settings;
+
 	if (ctx->split) {
 		struct strbuf tmp_file = STRBUF_INIT;
 
@@ -1642,7 +1644,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 		write_graph_chunk_extra_edges(f, ctx);
 	if (ctx->changed_paths) {
 		write_graph_chunk_bloom_indexes(f, ctx);
-		write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
+		write_graph_chunk_bloom_data(f, ctx);
 	}
 	if (ctx->num_commit_graphs_after > 1 &&
 	    write_graph_chunk_base(f, ctx)) {

From 7ea38d79b7987704013e55b3e9e2ff104dfb812c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Mon, 15 Jun 2020 20:14:47 +0000
Subject: [PATCH 12/18] commit-graph: unify the signatures of all
 write_graph_chunk_*() functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Update the write_graph_chunk_*() helper functions to have the same
signature:

  - Return an int error code from all these functions.
    write_graph_chunk_base() already has an int error code, now the
    others will have one, too, but since they don't indicate any
    error, they will always return 0.

  - Drop the hash size parameter of write_graph_chunk_oids() and
    write_graph_chunk_data(); its value can be read directly from
    'the_hash_algo' inside these functions as well.

This opens up the possibility for further cleanups and foolproofing in
the following two patches.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 05b7035d8d..3bae1e52ed 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -885,8 +885,8 @@ struct write_commit_graph_context {
 	struct bloom_filter_settings bloom_settings;
 };
 
-static void write_graph_chunk_fanout(struct hashfile *f,
-				     struct write_commit_graph_context *ctx)
+static int write_graph_chunk_fanout(struct hashfile *f,
+				    struct write_commit_graph_context *ctx)
 {
 	int i, count = 0;
 	struct commit **list = ctx->commits.list;
@@ -907,17 +907,21 @@ static void write_graph_chunk_fanout(struct hashfile *f,
 
 		hashwrite_be32(f, count);
 	}
+
+	return 0;
 }
 
-static void write_graph_chunk_oids(struct hashfile *f, int hash_len,
-				   struct write_commit_graph_context *ctx)
+static int write_graph_chunk_oids(struct hashfile *f,
+				  struct write_commit_graph_context *ctx)
 {
 	struct commit **list = ctx->commits.list;
 	int count;
 	for (count = 0; count < ctx->commits.nr; count++, list++) {
 		display_progress(ctx->progress, ++ctx->progress_cnt);
-		hashwrite(f, (*list)->object.oid.hash, (int)hash_len);
+		hashwrite(f, (*list)->object.oid.hash, (int)the_hash_algo->rawsz);
 	}
+
+	return 0;
 }
 
 static const unsigned char *commit_to_sha1(size_t index, void *table)
@@ -926,8 +930,8 @@ static const unsigned char *commit_to_sha1(size_t index, void *table)
 	return commits[index]->object.oid.hash;
 }
 
-static void write_graph_chunk_data(struct hashfile *f, int hash_len,
-				   struct write_commit_graph_context *ctx)
+static int write_graph_chunk_data(struct hashfile *f,
+				  struct write_commit_graph_context *ctx)
 {
 	struct commit **list = ctx->commits.list;
 	struct commit **last = ctx->commits.list + ctx->commits.nr;
@@ -944,7 +948,7 @@ static void write_graph_chunk_data(struct hashfile *f, int hash_len,
 			die(_("unable to parse commit %s"),
 				oid_to_hex(&(*list)->object.oid));
 		tree = get_commit_tree_oid(*list);
-		hashwrite(f, tree->hash, hash_len);
+		hashwrite(f, tree->hash, the_hash_algo->rawsz);
 
 		parent = (*list)->parents;
 
@@ -1024,10 +1028,12 @@ static void write_graph_chunk_data(struct hashfile *f, int hash_len,
 
 		list++;
 	}
+
+	return 0;
 }
 
-static void write_graph_chunk_extra_edges(struct hashfile *f,
-					  struct write_commit_graph_context *ctx)
+static int write_graph_chunk_extra_edges(struct hashfile *f,
+					 struct write_commit_graph_context *ctx)
 {
 	struct commit **list = ctx->commits.list;
 	struct commit **last = ctx->commits.list + ctx->commits.nr;
@@ -1076,10 +1082,12 @@ static void write_graph_chunk_extra_edges(struct hashfile *f,
 
 		list++;
 	}
+
+	return 0;
 }
 
-static void write_graph_chunk_bloom_indexes(struct hashfile *f,
-					    struct write_commit_graph_context *ctx)
+static int write_graph_chunk_bloom_indexes(struct hashfile *f,
+					   struct write_commit_graph_context *ctx)
 {
 	struct commit **list = ctx->commits.list;
 	struct commit **last = ctx->commits.list + ctx->commits.nr;
@@ -1101,10 +1109,11 @@ static void write_graph_chunk_bloom_indexes(struct hashfile *f,
 	}
 
 	stop_progress(&progress);
+	return 0;
 }
 
-static void write_graph_chunk_bloom_data(struct hashfile *f,
-					 struct write_commit_graph_context *ctx)
+static int write_graph_chunk_bloom_data(struct hashfile *f,
+					struct write_commit_graph_context *ctx)
 {
 	struct commit **list = ctx->commits.list;
 	struct commit **last = ctx->commits.list + ctx->commits.nr;
@@ -1128,6 +1137,7 @@ static void write_graph_chunk_bloom_data(struct hashfile *f,
 	}
 
 	stop_progress(&progress);
+	return 0;
 }
 
 static int oid_compare(const void *_a, const void *_b)
@@ -1638,8 +1648,8 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 			num_chunks * ctx->commits.nr);
 	}
 	write_graph_chunk_fanout(f, ctx);
-	write_graph_chunk_oids(f, hashsz, ctx);
-	write_graph_chunk_data(f, hashsz, ctx);
+	write_graph_chunk_oids(f, ctx);
+	write_graph_chunk_data(f, ctx);
 	if (ctx->num_extra_edges)
 		write_graph_chunk_extra_edges(f, ctx);
 	if (ctx->changed_paths) {

From de726f5c785f3a7c7467bc789bb7e7caa08c55de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Mon, 15 Jun 2020 20:14:48 +0000
Subject: [PATCH 13/18] commit-graph: simplify chunk writes into loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In write_commit_graph_file() we now have one block of code filling the
array of 'struct chunk_info' with the IDs and sizes of chunks to be
written, and an other block of code calling the functions responsible
for writing individual chunks.  In case of optional chunks like Extra
Edge List an Base Graphs List there is also a condition checking
whether that chunk is necessary/desired, and that same condition is
repeated in both blocks of code. Other, newer chunks have similar
optional conditions.

Eliminate these repeated conditions by storing the function pointers
responsible for writing individual chunks in the 'struct chunk_info'
array as well, and calling them in a loop to write the commit-graph
file.  This will open up the possibility for a bit of foolproofing in
the following patch.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 3bae1e52ed..78e023be66 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1532,9 +1532,13 @@ static int write_graph_chunk_base(struct hashfile *f,
 	return 0;
 }
 
+typedef int (*chunk_write_fn)(struct hashfile *f,
+			      struct write_commit_graph_context *ctx);
+
 struct chunk_info {
 	uint32_t id;
 	uint64_t size;
+	chunk_write_fn write_fn;
 };
 
 static int write_commit_graph_file(struct write_commit_graph_context *ctx)
@@ -1591,27 +1595,34 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 
 	chunks[0].id = GRAPH_CHUNKID_OIDFANOUT;
 	chunks[0].size = GRAPH_FANOUT_SIZE;
+	chunks[0].write_fn = write_graph_chunk_fanout;
 	chunks[1].id = GRAPH_CHUNKID_OIDLOOKUP;
 	chunks[1].size = hashsz * ctx->commits.nr;
+	chunks[1].write_fn = write_graph_chunk_oids;
 	chunks[2].id = GRAPH_CHUNKID_DATA;
 	chunks[2].size = (hashsz + 16) * ctx->commits.nr;
+	chunks[2].write_fn = write_graph_chunk_data;
 	if (ctx->num_extra_edges) {
 		chunks[num_chunks].id = GRAPH_CHUNKID_EXTRAEDGES;
 		chunks[num_chunks].size = 4 * ctx->num_extra_edges;
+		chunks[num_chunks].write_fn = write_graph_chunk_extra_edges;
 		num_chunks++;
 	}
 	if (ctx->changed_paths) {
 		chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMINDEXES;
 		chunks[num_chunks].size = sizeof(uint32_t) * ctx->commits.nr;
+		chunks[num_chunks].write_fn = write_graph_chunk_bloom_indexes;
 		num_chunks++;
 		chunks[num_chunks].id = GRAPH_CHUNKID_BLOOMDATA;
 		chunks[num_chunks].size = sizeof(uint32_t) * 3
 					  + ctx->total_bloom_filter_data_size;
+		chunks[num_chunks].write_fn = write_graph_chunk_bloom_data;
 		num_chunks++;
 	}
 	if (ctx->num_commit_graphs_after > 1) {
 		chunks[num_chunks].id = GRAPH_CHUNKID_BASE;
 		chunks[num_chunks].size = hashsz * (ctx->num_commit_graphs_after - 1);
+		chunks[num_chunks].write_fn = write_graph_chunk_base;
 		num_chunks++;
 	}
 
@@ -1647,19 +1658,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 			progress_title.buf,
 			num_chunks * ctx->commits.nr);
 	}
-	write_graph_chunk_fanout(f, ctx);
-	write_graph_chunk_oids(f, ctx);
-	write_graph_chunk_data(f, ctx);
-	if (ctx->num_extra_edges)
-		write_graph_chunk_extra_edges(f, ctx);
-	if (ctx->changed_paths) {
-		write_graph_chunk_bloom_indexes(f, ctx);
-		write_graph_chunk_bloom_data(f, ctx);
-	}
-	if (ctx->num_commit_graphs_after > 1 &&
-	    write_graph_chunk_base(f, ctx)) {
-		return -1;
+
+	for (i = 0; i < num_chunks; i++) {
+		if (chunks[i].write_fn(f, ctx)) {
+			error(_("failed writing chunk with id %"PRIx32""),
+			      chunks[i].id);
+			return -1;
+		}
 	}
+
 	stop_progress(&ctx->progress);
 	strbuf_release(&progress_title);
 

From bd74c1d6fd4d91ecb026384b9f2c9f55c054e011 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Mon, 15 Jun 2020 20:14:49 +0000
Subject: [PATCH 14/18] commit-graph: check chunk sizes after writing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In my experience while experimenting with new commit-graph chunks,
early versions of the corresponding new write_commit_graph_my_chunk()
functions are, sadly but not surprisingly, often buggy, and write more
or less data than they are supposed to, especially if the chunk size
is not directly proportional to the number of commits.  This then
causes all kinds of issues when reading such a bogus commit-graph
file, raising the question of whether the writing or the reading part
happens to be buggy this time.

Let's catch such issues early, already when writing the commit-graph
file, and check that each write_graph_chunk_*() function wrote the
amount of data that it was expected to, and what has been encoded in
the Chunk Lookup table.  Now that all commit-graph chunks are written
in a loop we can do this check in a single place for all chunks, and
any chunks added in the future will get checked as well.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/commit-graph.c b/commit-graph.c
index 78e023be66..5c8f210cad 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1659,12 +1659,21 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
 			num_chunks * ctx->commits.nr);
 	}
 
+	chunk_offset = f->total + f->offset;
 	for (i = 0; i < num_chunks; i++) {
+		uint64_t end_offset;
+
 		if (chunks[i].write_fn(f, ctx)) {
 			error(_("failed writing chunk with id %"PRIx32""),
 			      chunks[i].id);
 			return -1;
 		}
+
+		end_offset = f->total + f->offset;
+		if (end_offset - chunk_offset != chunks[i].size)
+			BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead",
+			    chunks[i].size, chunks[i].id, end_offset - chunk_offset);
+		chunk_offset = end_offset;
 	}
 
 	stop_progress(&ctx->progress);

From d83d1650bc3eeace6ed4d4e0d24d305e8fcb397b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?SZEDER=20G=C3=A1bor?= <szeder.dev@gmail.com>
Date: Mon, 15 Jun 2020 20:14:50 +0000
Subject: [PATCH 15/18] commit-graph: check all leading directories in changed
 path Bloom filters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The file 'dir/subdir/file' can only be modified if its leading
directories 'dir' and 'dir/subdir' are modified as well.

So when checking modified path Bloom filters looking for commits
modifying a path with multiple path components, then check not only
the full path in the Bloom filters, but all its leading directories as
well.  Take care to check these paths in "deepest first" order,
because it's the full path that is least likely to be modified, and
the Bloom filter queries can short circuit sooner.

This can significantly reduce the average false positive rate, by
about an order of magnitude or three(!), and can further speed up
pathspec-limited revision walks.  The table below compares the average
false positive rate and runtime of

  git rev-list HEAD -- "$path"

before and after this change for 5000+ randomly* selected paths from
each repository:

                    Average false           Average        Average
                    positive rate           runtime        runtime
                  before     after     before     after   difference
  ------------------------------------------------------------------
  git             3.220%   0.7853%     0.0558s   0.0387s   -30.6%
  linux           2.453%   0.0296%     0.1046s   0.0766s   -26.8%
  tensorflow      2.536%   0.6977%     0.0594s   0.0420s   -29.2%

*Path selection was done with the following pipeline:

	git ls-tree -r --name-only HEAD | sort -R | head -n 5000

The improvements in runtime are much smaller than the improvements in
average false positive rate, as we are clearly reaching diminishing
returns here.  However, all these timings depend on that accessing
tree objects is reasonably fast (warm caches).  If we had a partial
clone and the tree objects had to be fetched from a promisor remote,
e.g.:

  $ git clone --filter=tree:0 --bare file://.../webkit.git webkit.notrees.git
  $ git -C webkit.git -c core.modifiedPathBloomFilters=1 \
        commit-graph write --reachable
  $ cp webkit.git/objects/info/commit-graph webkit.notrees.git/objects/info/
  $ git -C webkit.notrees.git -c core.modifiedPathBloomFilters=1 \
        rev-list HEAD -- "$path"

then checking all leading path component can reduce the runtime from
over an hour to a few seconds (and this is with the clone and the
promisor on the same machine).

This adjusts the tracing values in t4216-log-bloom.sh, which provides a
concrete way to notice the improvement.

Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 revision.c           | 35 ++++++++++++++++++++++++++---------
 revision.h           |  6 ++++--
 t/t4216-log-bloom.sh |  2 +-
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/revision.c b/revision.c
index c644c66091..027ae3982b 100644
--- a/revision.c
+++ b/revision.c
@@ -670,9 +670,10 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
 {
 	struct pathspec_item *pi;
 	char *path_alloc = NULL;
-	const char *path;
+	const char *path, *p;
 	int last_index;
-	int len;
+	size_t len;
+	int path_component_nr = 0, j;
 
 	if (!revs->commits)
 		return;
@@ -705,8 +706,22 @@ static void prepare_to_use_bloom_filter(struct rev_info *revs)
 
 	len = strlen(path);
 
-	revs->bloom_key = xmalloc(sizeof(struct bloom_key));
-	fill_bloom_key(path, len, revs->bloom_key, revs->bloom_filter_settings);
+	p = path;
+	do {
+		p = strchrnul(p + 1, '/');
+		path_component_nr++;
+	} while (p - path < len);
+
+	revs->bloom_keys_nr = path_component_nr;
+	ALLOC_ARRAY(revs->bloom_keys, revs->bloom_keys_nr);
+
+	p = path;
+	for (j = 0; j < revs->bloom_keys_nr; j++) {
+		p = strchrnul(p + 1, '/');
+
+		fill_bloom_key(path, p - path, &revs->bloom_keys[j],
+			       revs->bloom_filter_settings);
+	}
 
 	if (trace2_is_enabled() && !bloom_filter_atexit_registered) {
 		atexit(trace2_bloom_filter_statistics_atexit);
@@ -720,7 +735,7 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
 						 struct commit *commit)
 {
 	struct bloom_filter *filter;
-	int result;
+	int result = 1, j;
 
 	if (!revs->repo->objects->commit_graph)
 		return -1;
@@ -740,9 +755,11 @@ static int check_maybe_different_in_bloom_filter(struct rev_info *revs,
 		return -1;
 	}
 
-	result = bloom_filter_contains(filter,
-				       revs->bloom_key,
-				       revs->bloom_filter_settings);
+	for (j = 0; result && j < revs->bloom_keys_nr; j++) {
+		result = bloom_filter_contains(filter,
+					       &revs->bloom_keys[j],
+					       revs->bloom_filter_settings);
+	}
 
 	if (result)
 		count_bloom_filter_maybe++;
@@ -782,7 +799,7 @@ static int rev_compare_tree(struct rev_info *revs,
 			return REV_TREE_SAME;
 	}
 
-	if (revs->bloom_key && !nth_parent) {
+	if (revs->bloom_keys_nr && !nth_parent) {
 		bloom_ret = check_maybe_different_in_bloom_filter(revs, commit);
 
 		if (bloom_ret == 0)
diff --git a/revision.h b/revision.h
index 7c026fe41f..abbfb4ab59 100644
--- a/revision.h
+++ b/revision.h
@@ -295,8 +295,10 @@ struct rev_info {
 	struct topo_walk_info *topo_walk_info;
 
 	/* Commit graph bloom filter fields */
-	/* The bloom filter key for the pathspec */
-	struct bloom_key *bloom_key;
+	/* The bloom filter key(s) for the pathspec */
+	struct bloom_key *bloom_keys;
+	int bloom_keys_nr;
+
 	/*
 	 * The bloom filter settings used to generate the key.
 	 * This is loaded from the commit-graph being used.
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index c7011f33e2..c13b97d3bd 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -142,7 +142,7 @@ test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
 
 test_bloom_filters_used_when_some_filters_are_missing () {
 	log_args=$1
-	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":8,\"definitely_not\":6"
+	bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"zero_length_filter\":0,\"maybe\":6,\"definitely_not\":8"
 	setup "$log_args" &&
 	grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
 	test_cmp log_wo_bloom log_w_bloom

From 488ae8cf265fe61b3158d8dcdd16bbaab7577cdc Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Mon, 15 Jun 2020 20:14:51 +0000
Subject: [PATCH 16/18] bloom: enforce a minimum size of 8 bytes

The original design of changed-path Bloom filters included an 8-byte
block size for filter lengths. This was changed mid-way through the
submission process, and now the length stored in the commit-graph has
one-byte granularity.

This can cause some issues for very small filters. The analysis for
false positive rates assume large filters, so rounding errors become
less important at that scale. When there are only a few paths changed,
a filter that has size only a few bytes could have very different
behavior. In fact, this is evidenced in the Git repository due to the
code organization and careful patch creation that leads to many commits
with very small filters. These small filters frequently have
false-positive rates in the 8-10% range or higher.

The previous change improved the false-positive rate using multiple
Bloom keys when the path has multiple directory components. However,
that does not help at all for files at root. It is typical to have
several commits that change only the README at root, and those commits
would be likely to have these artificially high false-positive rates.

Correct this issue by creating a minimum filters size of 8 bytes. This
requires the very small commits (with fewer than six changes, including
non-root directories) to have a larger filter. In principle, this
violates the bits_per_entry value of struct bloom_filter_settings.
However, it does not actually create a functional problem.

As for compatibility, this only affects new versions writing filters for
commits that do not yet have a filter. Old version will write the
smaller filters and this version will persist and properly read that
data. Now, the new files will be generated slightly larger.

               Bytes before   Bytes after  Difference
  --------------------------------------------------
  git             4,021,078    4,275,311   +6.32%
  linux          72,212,101   73,909,286   +2.35%
  tensorflow      7,596,359    7,691,646   +1.25%

This has a measurable improvement in the false-positive rate and the
end-to-end run time for these repos. The table below compares the average
false-positive rate and runtime of

  git rev-list HEAD -- "$path"

before and after this change for 5000+ randomly* selected paths from
each repository:

                    Average false           Average        Average
                    positive rate           runtime        runtime
                  before     after     before     after   difference
  ------------------------------------------------------------------
  git             0.786%     0.227%    0.0387s    0.0289s -25.5%
  linux           0.0296%    0.0174%   0.0766s    0.0706s  -7.8%
  tensorflow      0.6977%    0.0268%   0.0420s    0.0384s  -8.5%

*Path selection was done with the following pipeline:

        git ls-tree -r --name-only HEAD | sort -R | head -n 5000

These relatively-small increases in file size appear to be a fair price
to pay for these performance improvements.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 bloom.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/bloom.c b/bloom.c
index c38d1cff0c..875e3853c2 100644
--- a/bloom.c
+++ b/bloom.c
@@ -258,6 +258,10 @@ struct bloom_filter *get_bloom_filter(struct repository *r,
 		}
 
 		filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
+
+		if (filter->len && filter->len < 8)
+			filter->len = 8;
+
 		filter->data = xcalloc(filter->len, sizeof(unsigned char));
 
 		hashmap_for_each_entry(&pathmap, &iter, e, entry) {

From ef713d4bea7db709b08a569d394fac7c16547713 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Mon, 15 Jun 2020 20:14:52 +0000
Subject: [PATCH 17/18] commit-graph: change test to die on parse, not load

43d3561 (commit-graph write: don't die if the existing graph is corrupt,
2019-03-25) introduced the GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD environment
variable. This was created to verify that commit-graph was not loaded
when writing a new non-incremental commit-graph.

An upcoming change wants to load a commit-graph in some valuable cases,
but we want to maintain that we don't trust the commit-graph data when
writing our new file. Instead of dying on load, instead die if we ever
try to parse a commit from the commit-graph. This functionally verifies
the same intended behavior, but allows a more advanced feature in the
next change.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 commit-graph.c          | 12 ++++++++----
 commit-graph.h          |  2 +-
 t/t5318-commit-graph.sh |  2 +-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/commit-graph.c b/commit-graph.c
index 5c8f210cad..3a64e3b382 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -564,10 +564,6 @@ static int prepare_commit_graph(struct repository *r)
 		return !!r->objects->commit_graph;
 	r->objects->commit_graph_attempted = 1;
 
-	if (git_env_bool(GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD, 0))
-		die("dying as requested by the '%s' variable on commit-graph load!",
-		    GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD);
-
 	prepare_repo_settings(r);
 
 	if (!git_env_bool(GIT_TEST_COMMIT_GRAPH, 0) &&
@@ -790,6 +786,14 @@ static int parse_commit_in_graph_one(struct repository *r,
 
 int parse_commit_in_graph(struct repository *r, struct commit *item)
 {
+	static int checked_env = 0;
+
+	if (!checked_env &&
+	    git_env_bool(GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE, 0))
+		die("dying as requested by the '%s' variable on commit-graph parse!",
+		    GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE);
+	checked_env = 1;
+
 	if (!prepare_commit_graph(r))
 		return 0;
 	return parse_commit_in_graph_one(r, r->objects->commit_graph, item);
diff --git a/commit-graph.h b/commit-graph.h
index 881c9b46e5..f0fb13e3f2 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -5,7 +5,7 @@
 #include "object-store.h"
 
 #define GIT_TEST_COMMIT_GRAPH "GIT_TEST_COMMIT_GRAPH"
-#define GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD "GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD"
+#define GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE "GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE"
 #define GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS "GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS"
 
 /*
diff --git a/t/t5318-commit-graph.sh b/t/t5318-commit-graph.sh
index 1073f9e3cf..5ec01abdaa 100755
--- a/t/t5318-commit-graph.sh
+++ b/t/t5318-commit-graph.sh
@@ -436,7 +436,7 @@ corrupt_graph_verify() {
 		cp $objdir/info/commit-graph commit-graph-pre-write-test
 	fi &&
 	git status --short &&
-	GIT_TEST_COMMIT_GRAPH_DIE_ON_LOAD=true git commit-graph write &&
+	GIT_TEST_COMMIT_GRAPH_DIE_ON_PARSE=true git commit-graph write &&
 	git commit-graph verify
 }
 

From 744d4961e66819ff3ea8857b286d0adf3e3c5f24 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <dstolee@microsoft.com>
Date: Mon, 15 Jun 2020 20:14:53 +0000
Subject: [PATCH 18/18] commit-graph: persist existence of changed-paths

The changed-path Bloom filters were released in v2.27.0, but have a
significant drawback. A user can opt-in to writing the changed-path
filters using the "--changed-paths" option to "git commit-graph write"
but the next write will drop the filters unless that option is
specified.

This becomes even more important when considering the interaction with
gc.writeCommitGraph (on by default) or fetch.writeCommitGraph (part of
features.experimental). These config options trigger commit-graph writes
that the user did not signal, and hence there is no --changed-paths
option available.

Allow a user that opts-in to the changed-path filters to persist the
property of "my commit-graph has changed-path filters" automatically. A
user can drop filters using the --no-changed-paths option.

Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 Documentation/git-commit-graph.txt |  5 ++++-
 builtin/commit-graph.c             |  5 ++++-
 commit-graph.c                     | 12 +++++++++++-
 commit-graph.h                     |  1 +
 t/t4216-log-bloom.sh               |  2 +-
 5 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/Documentation/git-commit-graph.txt b/Documentation/git-commit-graph.txt
index f4b13c005b..369b222b08 100644
--- a/Documentation/git-commit-graph.txt
+++ b/Documentation/git-commit-graph.txt
@@ -60,7 +60,10 @@ existing commit-graph file.
 With the `--changed-paths` option, compute and write information about the
 paths changed between a commit and it's first parent. This operation can
 take a while on large repositories. It provides significant performance gains
-for getting history of a directory or a file with `git log -- <path>`.
+for getting history of a directory or a file with `git log -- <path>`. If
+this option is given, future commit-graph writes will automatically assume
+that this option was intended. Use `--no-changed-paths` to stop storing this
+data.
 +
 With the `--split` option, write the commit-graph as a chain of multiple
 commit-graph files stored in `<dir>/info/commit-graphs`. The new commits
diff --git a/builtin/commit-graph.c b/builtin/commit-graph.c
index 59009837dc..ff7b177c33 100644
--- a/builtin/commit-graph.c
+++ b/builtin/commit-graph.c
@@ -151,6 +151,7 @@ static int graph_write(int argc, const char **argv)
 	};
 
 	opts.progress = isatty(2);
+	opts.enable_changed_paths = -1;
 	split_opts.size_multiple = 2;
 	split_opts.max_commits = 0;
 	split_opts.expire_time = 0;
@@ -171,7 +172,9 @@ static int graph_write(int argc, const char **argv)
 		flags |= COMMIT_GRAPH_WRITE_SPLIT;
 	if (opts.progress)
 		flags |= COMMIT_GRAPH_WRITE_PROGRESS;
-	if (opts.enable_changed_paths ||
+	if (!opts.enable_changed_paths)
+		flags |= COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS;
+	if (opts.enable_changed_paths == 1 ||
 	    git_env_bool(GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS, 0))
 		flags |= COMMIT_GRAPH_WRITE_BLOOM_FILTERS;
 
diff --git a/commit-graph.c b/commit-graph.c
index 3a64e3b382..04eea72523 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -1996,9 +1996,19 @@ int write_commit_graph(struct object_directory *odb,
 	ctx->split = flags & COMMIT_GRAPH_WRITE_SPLIT ? 1 : 0;
 	ctx->check_oids = flags & COMMIT_GRAPH_WRITE_CHECK_OIDS ? 1 : 0;
 	ctx->split_opts = split_opts;
-	ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 1 : 0;
 	ctx->total_bloom_filter_data_size = 0;
 
+	if (flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS)
+		ctx->changed_paths = 1;
+	else if (!(flags & COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS)) {
+		prepare_commit_graph_one(ctx->r, ctx->odb);
+
+		/* We have changed-paths already. Keep them in the next graph */
+		if (ctx->r->objects->commit_graph &&
+		    ctx->r->objects->commit_graph->chunk_bloom_data)
+			ctx->changed_paths = 1;
+	}
+
 	if (ctx->split) {
 		struct commit_graph *g;
 		prepare_commit_graph(ctx->r);
diff --git a/commit-graph.h b/commit-graph.h
index f0fb13e3f2..45b1e5bca3 100644
--- a/commit-graph.h
+++ b/commit-graph.h
@@ -96,6 +96,7 @@ enum commit_graph_write_flags {
 	/* Make sure that each OID in the input is a valid commit OID. */
 	COMMIT_GRAPH_WRITE_CHECK_OIDS = (1 << 3),
 	COMMIT_GRAPH_WRITE_BLOOM_FILTERS = (1 << 4),
+	COMMIT_GRAPH_NO_WRITE_BLOOM_FILTERS = (1 << 5),
 };
 
 struct split_commit_graph_opts {
diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh
index c13b97d3bd..30c8d9562e 100755
--- a/t/t4216-log-bloom.sh
+++ b/t/t4216-log-bloom.sh
@@ -126,7 +126,7 @@ test_expect_success 'setup - add commit-graph to the chain without Bloom filters
 	test_commit c14 A/anotherFile2 &&
 	test_commit c15 A/B/anotherFile2 &&
 	test_commit c16 A/B/C/anotherFile2 &&
-	GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0 git commit-graph write --reachable --split &&
+	git commit-graph write --reachable --split --no-changed-paths &&
 	test_line_count = 2 .git/objects/info/commit-graphs/commit-graph-chain
 '