Merge branch 'gs/commit-graph-path-filter'

Introduce an extension to the commit-graph to make it efficient to
check for the paths that were modified at each commit using Bloom
filters.

* gs/commit-graph-path-filter:
  bloom: ignore renames when computing changed paths
  commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag
  t4216: add end to end tests for git log with Bloom filters
  revision.c: add trace2 stats around Bloom filter usage
  revision.c: use Bloom filters to speed up path based revision walks
  commit-graph: add --changed-paths option to write subcommand
  commit-graph: reuse existing Bloom filters during write
  commit-graph: write Bloom filters to commit graph file
  commit-graph: examine commits by generation number
  commit-graph: examine changed-path objects in pack order
  commit-graph: compute Bloom filters for changed paths
  diff: halt tree-diff early after max_changes
  bloom.c: core Bloom filter implementation for changed paths.
  bloom.c: introduce core Bloom filter constructs
  bloom.c: add the murmur3 hash implementation
  commit-graph: define and use MAX_NUM_CHUNKS
This commit is contained in:
Junio C Hamano
2020-05-01 13:39:53 -07:00
22 changed files with 1140 additions and 11 deletions

View File

@ -16,13 +16,18 @@
#include "hashmap.h"
#include "replace-object.h"
#include "progress.h"
#include "bloom.h"
#include "commit-slab.h"
#define GRAPH_SIGNATURE 0x43475048 /* "CGPH" */
#define GRAPH_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
#define GRAPH_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
#define GRAPH_CHUNKID_DATA 0x43444154 /* "CDAT" */
#define GRAPH_CHUNKID_EXTRAEDGES 0x45444745 /* "EDGE" */
#define GRAPH_CHUNKID_BLOOMINDEXES 0x42494458 /* "BIDX" */
#define GRAPH_CHUNKID_BLOOMDATA 0x42444154 /* "BDAT" */
#define GRAPH_CHUNKID_BASE 0x42415345 /* "BASE" */
#define MAX_NUM_CHUNKS 7
#define GRAPH_DATA_WIDTH (the_hash_algo->rawsz + 16)
@ -44,9 +49,51 @@
/* Remember to update object flag allocation in object.h */
#define REACHABLE (1u<<15)
char *get_commit_graph_filename(struct object_directory *odb)
/* Keep track of the order in which commits are added to our list. */
define_commit_slab(commit_pos, int);
static struct commit_pos commit_pos = COMMIT_SLAB_INIT(1, commit_pos);
static void set_commit_pos(struct repository *r, const struct object_id *oid)
{
return xstrfmt("%s/info/commit-graph", odb->path);
static int32_t max_pos;
struct commit *commit = lookup_commit(r, oid);
if (!commit)
return; /* should never happen, but be lenient */
*commit_pos_at(&commit_pos, commit) = max_pos++;
}
static int commit_pos_cmp(const void *va, const void *vb)
{
const struct commit *a = *(const struct commit **)va;
const struct commit *b = *(const struct commit **)vb;
return commit_pos_at(&commit_pos, a) -
commit_pos_at(&commit_pos, b);
}
static int commit_gen_cmp(const void *va, const void *vb)
{
const struct commit *a = *(const struct commit **)va;
const struct commit *b = *(const struct commit **)vb;
/* lower generation commits first */
if (a->generation < b->generation)
return -1;
else if (a->generation > b->generation)
return 1;
/* use date as a heuristic when generations are equal */
if (a->date < b->date)
return -1;
else if (a->date > b->date)
return 1;
return 0;
}
char *get_commit_graph_filename(struct object_directory *obj_dir)
{
return xstrfmt("%s/info/commit-graph", obj_dir->path);
}
static char *get_split_graph_filename(struct object_directory *odb,
@ -270,6 +317,32 @@ struct commit_graph *parse_commit_graph(void *graph_map, size_t graph_size)
chunk_repeated = 1;
else
graph->chunk_base_graphs = data + chunk_offset;
break;
case GRAPH_CHUNKID_BLOOMINDEXES:
if (graph->chunk_bloom_indexes)
chunk_repeated = 1;
else
graph->chunk_bloom_indexes = data + chunk_offset;
break;
case GRAPH_CHUNKID_BLOOMDATA:
if (graph->chunk_bloom_data)
chunk_repeated = 1;
else {
uint32_t hash_version;
graph->chunk_bloom_data = data + chunk_offset;
hash_version = get_be32(data + chunk_offset);
if (hash_version != 1)
break;
graph->bloom_filter_settings = xmalloc(sizeof(struct bloom_filter_settings));
graph->bloom_filter_settings->hash_version = hash_version;
graph->bloom_filter_settings->num_hashes = get_be32(data + chunk_offset + 4);
graph->bloom_filter_settings->bits_per_entry = get_be32(data + chunk_offset + 8);
}
break;
}
if (chunk_repeated) {
@ -288,6 +361,15 @@ struct commit_graph *parse_commit_graph(void *graph_map, size_t graph_size)
last_chunk_offset = chunk_offset;
}
if (graph->chunk_bloom_indexes && graph->chunk_bloom_data) {
init_bloom_filters();
} else {
/* We need both the bloom chunks to exist together. Else ignore the data */
graph->chunk_bloom_indexes = NULL;
graph->chunk_bloom_data = NULL;
graph->bloom_filter_settings = NULL;
}
hashcpy(graph->oid.hash, graph->data + graph->data_len - graph->hash_len);
if (verify_commit_graph_lite(graph)) {
@ -784,9 +866,12 @@ struct write_commit_graph_context {
unsigned append:1,
report_progress:1,
split:1,
check_oids:1;
check_oids:1,
changed_paths:1,
order_by_pack:1;
const struct split_commit_graph_opts *split_opts;
size_t total_bloom_filter_data_size;
};
static void write_graph_chunk_fanout(struct hashfile *f,
@ -982,6 +1067,59 @@ static void write_graph_chunk_extra_edges(struct hashfile *f,
}
}
static void write_graph_chunk_bloom_indexes(struct hashfile *f,
struct write_commit_graph_context *ctx)
{
struct commit **list = ctx->commits.list;
struct commit **last = ctx->commits.list + ctx->commits.nr;
uint32_t cur_pos = 0;
struct progress *progress = NULL;
int i = 0;
if (ctx->report_progress)
progress = start_delayed_progress(
_("Writing changed paths Bloom filters index"),
ctx->commits.nr);
while (list < last) {
struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
cur_pos += filter->len;
display_progress(progress, ++i);
hashwrite_be32(f, cur_pos);
list++;
}
stop_progress(&progress);
}
static void write_graph_chunk_bloom_data(struct hashfile *f,
struct write_commit_graph_context *ctx,
const struct bloom_filter_settings *settings)
{
struct commit **list = ctx->commits.list;
struct commit **last = ctx->commits.list + ctx->commits.nr;
struct progress *progress = NULL;
int i = 0;
if (ctx->report_progress)
progress = start_delayed_progress(
_("Writing changed paths Bloom filters data"),
ctx->commits.nr);
hashwrite_be32(f, settings->hash_version);
hashwrite_be32(f, settings->num_hashes);
hashwrite_be32(f, settings->bits_per_entry);
while (list < last) {
struct bloom_filter *filter = get_bloom_filter(ctx->r, *list, 0);
display_progress(progress, ++i);
hashwrite(f, filter->data, filter->len * sizeof(unsigned char));
list++;
}
stop_progress(&progress);
}
static int oid_compare(const void *_a, const void *_b)
{
const struct object_id *a = (const struct object_id *)_a;
@ -1013,6 +1151,8 @@ static int add_packed_commits(const struct object_id *oid,
oidcpy(&(ctx->oids.list[ctx->oids.nr]), oid);
ctx->oids.nr++;
set_commit_pos(ctx->r, oid);
return 0;
}
@ -1132,6 +1272,38 @@ static void compute_generation_numbers(struct write_commit_graph_context *ctx)
stop_progress(&ctx->progress);
}
static void compute_bloom_filters(struct write_commit_graph_context *ctx)
{
int i;
struct progress *progress = NULL;
struct commit **sorted_commits;
init_bloom_filters();
if (ctx->report_progress)
progress = start_delayed_progress(
_("Computing commit changed paths Bloom filters"),
ctx->commits.nr);
ALLOC_ARRAY(sorted_commits, ctx->commits.nr);
COPY_ARRAY(sorted_commits, ctx->commits.list, ctx->commits.nr);
if (ctx->order_by_pack)
QSORT(sorted_commits, ctx->commits.nr, commit_pos_cmp);
else
QSORT(sorted_commits, ctx->commits.nr, commit_gen_cmp);
for (i = 0; i < ctx->commits.nr; i++) {
struct commit *c = sorted_commits[i];
struct bloom_filter *filter = get_bloom_filter(ctx->r, c, 1);
ctx->total_bloom_filter_data_size += sizeof(unsigned char) * filter->len;
display_progress(progress, i + 1);
}
free(sorted_commits);
stop_progress(&progress);
}
static int add_ref_to_set(const char *refname,
const struct object_id *oid,
int flags, void *cb_data)
@ -1361,12 +1533,13 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
int fd;
struct hashfile *f;
struct lock_file lk = LOCK_INIT;
uint32_t chunk_ids[6];
uint64_t chunk_offsets[6];
uint32_t chunk_ids[MAX_NUM_CHUNKS + 1];
uint64_t chunk_offsets[MAX_NUM_CHUNKS + 1];
const unsigned hashsz = the_hash_algo->rawsz;
struct strbuf progress_title = STRBUF_INIT;
int num_chunks = 3;
struct object_id file_hash;
const struct bloom_filter_settings bloom_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
if (ctx->split) {
struct strbuf tmp_file = STRBUF_INIT;
@ -1411,6 +1584,12 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
chunk_ids[num_chunks] = GRAPH_CHUNKID_EXTRAEDGES;
num_chunks++;
}
if (ctx->changed_paths) {
chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMINDEXES;
num_chunks++;
chunk_ids[num_chunks] = GRAPH_CHUNKID_BLOOMDATA;
num_chunks++;
}
if (ctx->num_commit_graphs_after > 1) {
chunk_ids[num_chunks] = GRAPH_CHUNKID_BASE;
num_chunks++;
@ -1429,6 +1608,15 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
4 * ctx->num_extra_edges;
num_chunks++;
}
if (ctx->changed_paths) {
chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
sizeof(uint32_t) * ctx->commits.nr;
num_chunks++;
chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
sizeof(uint32_t) * 3 + ctx->total_bloom_filter_data_size;
num_chunks++;
}
if (ctx->num_commit_graphs_after > 1) {
chunk_offsets[num_chunks + 1] = chunk_offsets[num_chunks] +
hashsz * (ctx->num_commit_graphs_after - 1);
@ -1466,6 +1654,10 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
write_graph_chunk_data(f, hashsz, ctx);
if (ctx->num_extra_edges)
write_graph_chunk_extra_edges(f, ctx);
if (ctx->changed_paths) {
write_graph_chunk_bloom_indexes(f, ctx);
write_graph_chunk_bloom_data(f, ctx, &bloom_settings);
}
if (ctx->num_commit_graphs_after > 1 &&
write_graph_chunk_base(f, ctx)) {
return -1;
@ -1804,6 +1996,8 @@ int write_commit_graph(struct object_directory *odb,
ctx->split = flags & COMMIT_GRAPH_WRITE_SPLIT ? 1 : 0;
ctx->check_oids = flags & COMMIT_GRAPH_WRITE_CHECK_OIDS ? 1 : 0;
ctx->split_opts = split_opts;
ctx->changed_paths = flags & COMMIT_GRAPH_WRITE_BLOOM_FILTERS ? 1 : 0;
ctx->total_bloom_filter_data_size = 0;
if (ctx->split) {
struct commit_graph *g;
@ -1856,6 +2050,7 @@ int write_commit_graph(struct object_directory *odb,
}
if (pack_indexes) {
ctx->order_by_pack = 1;
if ((res = fill_oids_from_packs(ctx, pack_indexes)))
goto cleanup;
}
@ -1865,8 +2060,10 @@ int write_commit_graph(struct object_directory *odb,
goto cleanup;
}
if (!pack_indexes && !commits)
if (!pack_indexes && !commits) {
ctx->order_by_pack = 1;
fill_oids_from_all_packs(ctx);
}
close_reachable(ctx);
@ -1902,6 +2099,9 @@ int write_commit_graph(struct object_directory *odb,
compute_generation_numbers(ctx);
if (ctx->changed_paths)
compute_bloom_filters(ctx);
res = write_commit_graph_file(ctx);
if (ctx->split)
@ -2126,6 +2326,7 @@ void free_commit_graph(struct commit_graph *g)
g->data = NULL;
}
free(g->filename);
free(g->bloom_filter_settings);
free(g);
}