diff --git a/Documentation/config/commitgraph.txt b/Documentation/config/commitgraph.txt index acc74a2f27..30604e4a4c 100644 --- a/Documentation/config/commitgraph.txt +++ b/Documentation/config/commitgraph.txt @@ -9,26 +9,6 @@ commitGraph.maxNewFilters:: commit-graph write` (c.f., linkgit:git-commit-graph[1]). commitGraph.readChangedPaths:: - Deprecated. Equivalent to commitGraph.changedPathsVersion=-1 if true, and - commitGraph.changedPathsVersion=0 if false. (If commitGraph.changedPathVersion - is also set, commitGraph.changedPathsVersion takes precedence.) - -commitGraph.changedPathsVersion:: - Specifies the version of the changed-path Bloom filters that Git will read and - write. May be -1, 0, 1, or 2. -+ -Defaults to -1. -+ -If -1, Git will use the version of the changed-path Bloom filters in the -repository, defaulting to 1 if there are none. -+ -If 0, Git will not read any Bloom filters, and will write version 1 Bloom -filters when instructed to write. -+ -If 1, Git will only read version 1 Bloom filters, and will write version 1 -Bloom filters. -+ -If 2, Git will only read version 2 Bloom filters, and will write version 2 -Bloom filters. -+ -See linkgit:git-commit-graph[1] for more information. + If true, then git will use the changed-path Bloom filters in the + commit-graph file (if it exists, and they are present). Defaults to + true. See linkgit:git-commit-graph[1] for more information. diff --git a/Documentation/gitformat-commit-graph.txt b/Documentation/gitformat-commit-graph.txt index 3e906e8030..31cad585e2 100644 --- a/Documentation/gitformat-commit-graph.txt +++ b/Documentation/gitformat-commit-graph.txt @@ -142,16 +142,13 @@ All multi-byte numbers are in network byte order. ==== Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional] * It starts with header consisting of three unsigned 32-bit integers: - - Version of the hash algorithm being used. We currently support - value 2 which corresponds to the 32-bit version of the murmur3 hash + - Version of the hash algorithm being used. We currently only support + value 1 which corresponds to the 32-bit version of the murmur3 hash implemented exactly as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double hashing technique using seed values 0x293ae76f and 0x7e646e2 as described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters - in Probabilistic Verification". Version 1 Bloom filters have a bug that appears - when char is signed and the repository has path names that have characters >= - 0x80; Git supports reading and writing them, but this ability will be removed - in a future version of Git. + in Probabilistic Verification" - The number of times a path is hashed and hence the number of bit positions that cumulatively determine whether a file is present in the commit. - The minimum number of bits 'b' per entry in the Bloom filter. If the filter diff --git a/bloom.c b/bloom.c index ebef5cfd2f..aef6b5fea2 100644 --- a/bloom.c +++ b/bloom.c @@ -29,9 +29,9 @@ static inline unsigned char get_bitmask(uint32_t pos) return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1)); } -int load_bloom_filter_from_graph(struct commit_graph *g, - struct bloom_filter *filter, - uint32_t graph_pos) +static int load_bloom_filter_from_graph(struct commit_graph *g, + struct bloom_filter *filter, + uint32_t graph_pos) { uint32_t lex_pos, start_index, end_index; @@ -66,64 +66,7 @@ int load_bloom_filter_from_graph(struct commit_graph *g, * Not considered to be cryptographically secure. * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm */ -uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len) -{ - const uint32_t c1 = 0xcc9e2d51; - const uint32_t c2 = 0x1b873593; - const uint32_t r1 = 15; - const uint32_t r2 = 13; - const uint32_t m = 5; - const uint32_t n = 0xe6546b64; - int i; - uint32_t k1 = 0; - const char *tail; - - int len4 = len / sizeof(uint32_t); - - uint32_t k; - for (i = 0; i < len4; i++) { - uint32_t byte1 = (uint32_t)(unsigned char)data[4*i]; - uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8; - uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16; - uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24; - k = byte1 | byte2 | byte3 | byte4; - k *= c1; - k = rotate_left(k, r1); - k *= c2; - - seed ^= k; - seed = rotate_left(seed, r2) * m + n; - } - - tail = (data + len4 * sizeof(uint32_t)); - - switch (len & (sizeof(uint32_t) - 1)) { - case 3: - k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16; - /*-fallthrough*/ - case 2: - k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8; - /*-fallthrough*/ - case 1: - k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0; - k1 *= c1; - k1 = rotate_left(k1, r1); - k1 *= c2; - seed ^= k1; - break; - } - - seed ^= (uint32_t)len; - seed ^= (seed >> 16); - seed *= 0x85ebca6b; - seed ^= (seed >> 13); - seed *= 0xc2b2ae35; - seed ^= (seed >> 16); - - return seed; -} - -static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len) +uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len) { const uint32_t c1 = 0xcc9e2d51; const uint32_t c2 = 0x1b873593; @@ -188,14 +131,8 @@ void fill_bloom_key(const char *data, int i; const uint32_t seed0 = 0x293ae76f; const uint32_t seed1 = 0x7e646e2c; - uint32_t hash0, hash1; - if (settings->hash_version == 2) { - hash0 = murmur3_seeded_v2(seed0, data, len); - hash1 = murmur3_seeded_v2(seed1, data, len); - } else { - hash0 = murmur3_seeded_v1(seed0, data, len); - hash1 = murmur3_seeded_v1(seed1, data, len); - } + const uint32_t hash0 = murmur3_seeded(seed0, data, len); + const uint32_t hash1 = murmur3_seeded(seed1, data, len); key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t)); for (i = 0; i < settings->num_hashes; i++) diff --git a/bloom.h b/bloom.h index 138d57a86b..adde6dfe21 100644 --- a/bloom.h +++ b/bloom.h @@ -3,16 +3,13 @@ struct commit; struct repository; -struct commit_graph; struct bloom_filter_settings { /* * The version of the hashing technique being used. - * The newest version is 2, which is + * We currently only support version = 1 which is * the seeded murmur3 hashing technique implemented - * in bloom.c. Bloom filters of version 1 were created - * with prior versions of Git, which had a bug in the - * implementation of the hash function. + * in bloom.c. */ uint32_t hash_version; @@ -71,10 +68,6 @@ struct bloom_key { uint32_t *hashes; }; -int load_bloom_filter_from_graph(struct commit_graph *g, - struct bloom_filter *filter, - uint32_t graph_pos); - /* * Calculate the murmur3 32-bit hash value for the given data * using the given seed. @@ -82,7 +75,7 @@ int load_bloom_filter_from_graph(struct commit_graph *g, * Not considered to be cryptographically secure. * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm */ -uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len); +uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len); void fill_bloom_key(const char *data, size_t len, diff --git a/commit-graph.c b/commit-graph.c index be2b2683c1..0aa1640d15 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -304,26 +304,17 @@ static int graph_read_oid_lookup(const unsigned char *chunk_start, return 0; } -struct graph_read_bloom_data_context { - struct commit_graph *g; - int *commit_graph_changed_paths_version; -}; - static int graph_read_bloom_data(const unsigned char *chunk_start, size_t chunk_size, void *data) { - struct graph_read_bloom_data_context *c = data; - struct commit_graph *g = c->g; + struct commit_graph *g = data; uint32_t hash_version; + g->chunk_bloom_data = chunk_start; hash_version = get_be32(chunk_start); - if (*c->commit_graph_changed_paths_version == -1) { - *c->commit_graph_changed_paths_version = hash_version; - } else if (hash_version != *c->commit_graph_changed_paths_version) { + if (hash_version != 1) return 0; - } - g->chunk_bloom_data = chunk_start; g->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings)); g->bloom_filter_settings->hash_version = hash_version; g->bloom_filter_settings->num_hashes = get_be32(chunk_start + 4); @@ -410,15 +401,11 @@ struct commit_graph *parse_commit_graph(struct repo_settings *s, graph->read_generation_data = 1; } - if (s->commit_graph_changed_paths_version) { - struct graph_read_bloom_data_context context = { - .g = graph, - .commit_graph_changed_paths_version = &s->commit_graph_changed_paths_version - }; + if (s->commit_graph_read_changed_paths) { pair_chunk(cf, GRAPH_CHUNKID_BLOOMINDEXES, &graph->chunk_bloom_indexes); read_chunk(cf, GRAPH_CHUNKID_BLOOMDATA, - graph_read_bloom_data, &context); + graph_read_bloom_data, graph); } if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) { @@ -2401,14 +2388,6 @@ int write_commit_graph(struct object_directory *odb, ctx->write_generation_data = (get_configured_generation_version(r) == 2); ctx->num_generation_data_overflows = 0; - if (r->settings.commit_graph_changed_paths_version < -1 - || r->settings.commit_graph_changed_paths_version > 2) { - warning(_("attempting to write a commit-graph, but 'commitgraph.changedPathsVersion' (%d) is not supported"), - r->settings.commit_graph_changed_paths_version); - return 0; - } - bloom_settings.hash_version = r->settings.commit_graph_changed_paths_version == 2 - ? 2 : 1; bloom_settings.bits_per_entry = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY", bloom_settings.bits_per_entry); bloom_settings.num_hashes = git_env_ulong("GIT_TEST_BLOOM_SETTINGS_NUM_HASHES", @@ -2438,7 +2417,7 @@ int write_commit_graph(struct object_directory *odb, g = ctx->r->objects->commit_graph; /* We have changed-paths already. Keep them in the next graph */ - if (g && g->bloom_filter_settings) { + if (g && g->chunk_bloom_data) { ctx->changed_paths = 1; ctx->bloom_settings = g->bloom_filter_settings; } diff --git a/oss-fuzz/fuzz-commit-graph.c b/oss-fuzz/fuzz-commit-graph.c index 325c0b991a..2992079dd9 100644 --- a/oss-fuzz/fuzz-commit-graph.c +++ b/oss-fuzz/fuzz-commit-graph.c @@ -19,7 +19,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) * possible. */ the_repository->settings.commit_graph_generation_version = 2; - the_repository->settings.commit_graph_changed_paths_version = 1; + the_repository->settings.commit_graph_read_changed_paths = 1; g = parse_commit_graph(&the_repository->settings, (void *)data, size); repo_clear(the_repository); free_commit_graph(g); diff --git a/repo-settings.c b/repo-settings.c index db8fe817f3..525f69c0c7 100644 --- a/repo-settings.c +++ b/repo-settings.c @@ -24,7 +24,6 @@ void prepare_repo_settings(struct repository *r) int value; const char *strval; int manyfiles; - int read_changed_paths; if (!r->gitdir) BUG("Cannot add settings for uninitialized repository"); @@ -55,10 +54,7 @@ void prepare_repo_settings(struct repository *r) /* Commit graph config or default, does not cascade (simple) */ repo_cfg_bool(r, "core.commitgraph", &r->settings.core_commit_graph, 1); repo_cfg_int(r, "commitgraph.generationversion", &r->settings.commit_graph_generation_version, 2); - repo_cfg_bool(r, "commitgraph.readchangedpaths", &read_changed_paths, 1); - repo_cfg_int(r, "commitgraph.changedpathsversion", - &r->settings.commit_graph_changed_paths_version, - read_changed_paths ? -1 : 0); + repo_cfg_bool(r, "commitgraph.readchangedpaths", &r->settings.commit_graph_read_changed_paths, 1); repo_cfg_bool(r, "gc.writecommitgraph", &r->settings.gc_write_commit_graph, 1); repo_cfg_bool(r, "fetch.writecommitgraph", &r->settings.fetch_write_commit_graph, 0); diff --git a/repository.h b/repository.h index f71154e12c..5f18486f64 100644 --- a/repository.h +++ b/repository.h @@ -29,7 +29,7 @@ struct repo_settings { int core_commit_graph; int commit_graph_generation_version; - int commit_graph_changed_paths_version; + int commit_graph_read_changed_paths; int gc_write_commit_graph; int fetch_write_commit_graph; int command_requires_full_index; diff --git a/t/helper/test-bloom.c b/t/helper/test-bloom.c index 3cbc0a5b50..aabe31d724 100644 --- a/t/helper/test-bloom.c +++ b/t/helper/test-bloom.c @@ -50,7 +50,6 @@ static void get_bloom_filter_for_commit(const struct object_id *commit_oid) static const char *bloom_usage = "\n" " test-tool bloom get_murmur3 \n" -" test-tool bloom get_murmur3_seven_highbit\n" " test-tool bloom generate_filter [...]\n" " test-tool bloom get_filter_for_commit \n"; @@ -65,13 +64,7 @@ int cmd__bloom(int argc, const char **argv) uint32_t hashed; if (argc < 3) usage(bloom_usage); - hashed = murmur3_seeded_v2(0, argv[2], strlen(argv[2])); - printf("Murmur3 Hash with seed=0:0x%08x\n", hashed); - } - - if (!strcmp(argv[1], "get_murmur3_seven_highbit")) { - uint32_t hashed; - hashed = murmur3_seeded_v2(0, "\x99\xaa\xbb\xcc\xdd\xee\xff", 7); + hashed = murmur3_seeded(0, argv[2], strlen(argv[2])); printf("Murmur3 Hash with seed=0:0x%08x\n", hashed); } diff --git a/t/helper/test-read-graph.c b/t/helper/test-read-graph.c index da9ac8584d..8c7a83f578 100644 --- a/t/helper/test-read-graph.c +++ b/t/helper/test-read-graph.c @@ -5,8 +5,20 @@ #include "bloom.h" #include "setup.h" -static void dump_graph_info(struct commit_graph *graph) +int cmd__read_graph(int argc UNUSED, const char **argv UNUSED) { + struct commit_graph *graph = NULL; + struct object_directory *odb; + + setup_git_directory(); + odb = the_repository->objects->odb; + + prepare_repo_settings(the_repository); + + graph = read_commit_graph_one(the_repository, odb); + if (!graph) + return 1; + printf("header: %08x %d %d %d %d\n", ntohl(*(uint32_t*)graph->data), *(unsigned char*)(graph->data + 4), @@ -45,59 +57,8 @@ static void dump_graph_info(struct commit_graph *graph) if (graph->topo_levels) printf(" topo_levels"); printf("\n"); -} -static void dump_graph_bloom_filters(struct commit_graph *graph) -{ - uint32_t i; - - for (i = 0; i < graph->num_commits + graph->num_commits_in_base; i++) { - struct bloom_filter filter = { 0 }; - size_t j; - - if (load_bloom_filter_from_graph(graph, &filter, i) < 0) { - fprintf(stderr, "missing Bloom filter for graph " - "position %"PRIu32"\n", i); - continue; - } - - for (j = 0; j < filter.len; j++) - printf("%02x", filter.data[j]); - if (filter.len) - printf("\n"); - } -} - -int cmd__read_graph(int argc, const char **argv) -{ - struct commit_graph *graph = NULL; - struct object_directory *odb; - int ret = 0; - - setup_git_directory(); - odb = the_repository->objects->odb; - - prepare_repo_settings(the_repository); - - graph = read_commit_graph_one(the_repository, odb); - if (!graph) { - ret = 1; - goto done; - } - - if (argc <= 1) - dump_graph_info(graph); - else if (!strcmp(argv[1], "bloom-filters")) - dump_graph_bloom_filters(graph); - else { - fprintf(stderr, "unknown sub-command: '%s'\n", argv[1]); - ret = 1; - } - -done: UNLEAK(graph); - return ret; + return 0; } - - diff --git a/t/t0095-bloom.sh b/t/t0095-bloom.sh index c8d84ab606..b567383eb8 100755 --- a/t/t0095-bloom.sh +++ b/t/t0095-bloom.sh @@ -29,14 +29,6 @@ test_expect_success 'compute unseeded murmur3 hash for test string 2' ' test_cmp expect actual ' -test_expect_success 'compute unseeded murmur3 hash for test string 3' ' - cat >expect <<-\EOF && - Murmur3 Hash with seed=0:0xa183ccfd - EOF - test-tool bloom get_murmur3_seven_highbit >actual && - test_cmp expect actual -' - test_expect_success 'compute bloom key for empty string' ' cat >expect <<-\EOF && Hashes:0x5615800c|0x5b966560|0x61174ab4|0x66983008|0x6c19155c|0x7199fab0|0x771ae004| diff --git a/t/t4216-log-bloom.sh b/t/t4216-log-bloom.sh index 775e59d864..fa9d32facf 100755 --- a/t/t4216-log-bloom.sh +++ b/t/t4216-log-bloom.sh @@ -404,143 +404,4 @@ test_expect_success 'Bloom generation backfills empty commits' ' ) ' -get_first_changed_path_filter () { - test-tool read-graph bloom-filters >filters.dat && - head -n 1 filters.dat -} - -# chosen to be the same under all Unicode normalization forms -CENT=$(printf "\302\242") - -test_expect_success 'set up repo with high bit path, version 1 changed-path' ' - git init highbit1 && - test_commit -C highbit1 c1 "$CENT" && - git -C highbit1 commit-graph write --reachable --changed-paths -' - -test_expect_success 'setup check value of version 1 changed-path' ' - ( - cd highbit1 && - echo "52a9" >expect && - get_first_changed_path_filter >actual && - test_cmp expect actual - ) -' - -# expect will not match actual if char is unsigned by default. Write the test -# in this way, so that a user running this test script can still see if the two -# files match. (It will appear as an ordinary success if they match, and a skip -# if not.) -if test_cmp highbit1/expect highbit1/actual -then - test_set_prereq SIGNED_CHAR_BY_DEFAULT -fi -test_expect_success SIGNED_CHAR_BY_DEFAULT 'check value of version 1 changed-path' ' - # Only the prereq matters for this test. - true -' - -test_expect_success 'version 1 changed-path used when version 1 requested' ' - ( - cd highbit1 && - test_bloom_filters_used "-- $CENT" - ) -' - -test_expect_success 'version 1 changed-path not used when version 2 requested' ' - ( - cd highbit1 && - git config --add commitgraph.changedPathsVersion 2 && - test_bloom_filters_not_used "-- $CENT" - ) -' - -test_expect_success 'version 1 changed-path used when autodetect requested' ' - ( - cd highbit1 && - git config --add commitgraph.changedPathsVersion -1 && - test_bloom_filters_used "-- $CENT" - ) -' - -test_expect_success 'when writing another commit graph, preserve existing version 1 of changed-path' ' - test_commit -C highbit1 c1double "$CENT$CENT" && - git -C highbit1 commit-graph write --reachable --changed-paths && - ( - cd highbit1 && - git config --add commitgraph.changedPathsVersion -1 && - echo "options: bloom(1,10,7) read_generation_data" >expect && - test-tool read-graph >full && - grep options full >actual && - test_cmp expect actual - ) -' - -test_expect_success 'set up repo with high bit path, version 2 changed-path' ' - git init highbit2 && - git -C highbit2 config --add commitgraph.changedPathsVersion 2 && - test_commit -C highbit2 c2 "$CENT" && - git -C highbit2 commit-graph write --reachable --changed-paths -' - -test_expect_success 'check value of version 2 changed-path' ' - ( - cd highbit2 && - echo "c01f" >expect && - get_first_changed_path_filter >actual && - test_cmp expect actual - ) -' - -test_expect_success 'version 2 changed-path used when version 2 requested' ' - ( - cd highbit2 && - test_bloom_filters_used "-- $CENT" - ) -' - -test_expect_success 'version 2 changed-path not used when version 1 requested' ' - ( - cd highbit2 && - git config --add commitgraph.changedPathsVersion 1 && - test_bloom_filters_not_used "-- $CENT" - ) -' - -test_expect_success 'version 2 changed-path used when autodetect requested' ' - ( - cd highbit2 && - git config --add commitgraph.changedPathsVersion -1 && - test_bloom_filters_used "-- $CENT" - ) -' - -test_expect_success 'when writing another commit graph, preserve existing version 2 of changed-path' ' - test_commit -C highbit2 c2double "$CENT$CENT" && - git -C highbit2 commit-graph write --reachable --changed-paths && - ( - cd highbit2 && - git config --add commitgraph.changedPathsVersion -1 && - echo "options: bloom(2,10,7) read_generation_data" >expect && - test-tool read-graph >full && - grep options full >actual && - test_cmp expect actual - ) -' - -test_expect_success 'when writing commit graph, do not reuse changed-path of another version' ' - git init doublewrite && - test_commit -C doublewrite c "$CENT" && - git -C doublewrite config --add commitgraph.changedPathsVersion 1 && - git -C doublewrite commit-graph write --reachable --changed-paths && - git -C doublewrite config --add commitgraph.changedPathsVersion 2 && - git -C doublewrite commit-graph write --reachable --changed-paths && - ( - cd doublewrite && - echo "c01f" >expect && - get_first_changed_path_filter >actual && - test_cmp expect actual - ) -' - test_done