Merge branch 'tb/reverse-midx'

An on-disk reverse-index to map the in-pack location of an object
back to its object name across multiple packfiles is introduced.

* tb/reverse-midx:
  midx.c: improve cache locality in midx_pack_order_cmp()
  pack-revindex: write multi-pack reverse indexes
  pack-write.c: extract 'write_rev_file_order'
  pack-revindex: read multi-pack reverse indexes
  Documentation/technical: describe multi-pack reverse indexes
  midx: make some functions non-static
  midx: keep track of the checksum
  midx: don't free midx_name early
  midx: allow marking a pack as preferred
  t/helper/test-read-midx.c: add '--show-objects'
  builtin/multi-pack-index.c: display usage on unrecognized command
  builtin/multi-pack-index.c: don't enter bogus cmd_mode
  builtin/multi-pack-index.c: split sub-commands
  builtin/multi-pack-index.c: define common usage with a macro
  builtin/multi-pack-index.c: don't handle 'progress' separately
  builtin/multi-pack-index.c: inline 'flags' with options
This commit is contained in:
Junio C Hamano
2021-04-08 13:23:25 -07:00
14 changed files with 741 additions and 76 deletions

219
midx.c
View File

@ -12,6 +12,7 @@
#include "run-command.h"
#include "repository.h"
#include "chunk-format.h"
#include "pack.h"
#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
#define MIDX_VERSION 1
@ -47,11 +48,22 @@ static uint8_t oid_version(void)
}
}
static const unsigned char *get_midx_checksum(struct multi_pack_index *m)
{
return m->data + m->data_len - the_hash_algo->rawsz;
}
static char *get_midx_filename(const char *object_dir)
{
return xstrfmt("%s/pack/multi-pack-index", object_dir);
}
char *get_midx_rev_filename(struct multi_pack_index *m)
{
return xstrfmt("%s/pack/multi-pack-index-%s.rev",
m->object_dir, hash_to_hex(get_midx_checksum(m)));
}
static int midx_read_oid_fanout(const unsigned char *chunk_start,
size_t chunk_size, void *data)
{
@ -239,7 +251,7 @@ struct object_id *nth_midxed_object_oid(struct object_id *oid,
return oid;
}
static off_t nth_midxed_offset(struct multi_pack_index *m, uint32_t pos)
off_t nth_midxed_offset(struct multi_pack_index *m, uint32_t pos)
{
const unsigned char *offset_data;
uint32_t offset32;
@ -258,7 +270,7 @@ static off_t nth_midxed_offset(struct multi_pack_index *m, uint32_t pos)
return offset32;
}
static uint32_t nth_midxed_pack_int_id(struct multi_pack_index *m, uint32_t pos)
uint32_t nth_midxed_pack_int_id(struct multi_pack_index *m, uint32_t pos)
{
return get_be32(m->chunk_object_offsets +
(off_t)pos * MIDX_CHUNK_OFFSET_WIDTH);
@ -431,6 +443,14 @@ static int pack_info_compare(const void *_a, const void *_b)
return strcmp(a->pack_name, b->pack_name);
}
static int idx_or_pack_name_cmp(const void *_va, const void *_vb)
{
const char *pack_name = _va;
const struct pack_info *compar = _vb;
return cmp_idx_or_pack_name(pack_name, compar->pack_name);
}
struct write_midx_context {
struct pack_info *info;
uint32_t nr;
@ -443,8 +463,11 @@ struct write_midx_context {
uint32_t entries_nr;
uint32_t *pack_perm;
uint32_t *pack_order;
unsigned large_offsets_needed:1;
uint32_t num_large_offsets;
int preferred_pack_idx;
};
static void add_pack_to_midx(const char *full_path, size_t full_path_len,
@ -489,6 +512,7 @@ struct pack_midx_entry {
uint32_t pack_int_id;
time_t pack_mtime;
uint64_t offset;
unsigned preferred : 1;
};
static int midx_oid_compare(const void *_a, const void *_b)
@ -500,6 +524,12 @@ static int midx_oid_compare(const void *_a, const void *_b)
if (cmp)
return cmp;
/* Sort objects in a preferred pack first when multiple copies exist. */
if (a->preferred > b->preferred)
return -1;
if (a->preferred < b->preferred)
return 1;
if (a->pack_mtime > b->pack_mtime)
return -1;
else if (a->pack_mtime < b->pack_mtime)
@ -527,7 +557,8 @@ static int nth_midxed_pack_midx_entry(struct multi_pack_index *m,
static void fill_pack_entry(uint32_t pack_int_id,
struct packed_git *p,
uint32_t cur_object,
struct pack_midx_entry *entry)
struct pack_midx_entry *entry,
int preferred)
{
if (nth_packed_object_id(&entry->oid, p, cur_object) < 0)
die(_("failed to locate object %d in packfile"), cur_object);
@ -536,6 +567,7 @@ static void fill_pack_entry(uint32_t pack_int_id,
entry->pack_mtime = p->mtime;
entry->offset = nth_packed_object_offset(p, cur_object);
entry->preferred = !!preferred;
}
/*
@ -552,7 +584,8 @@ static void fill_pack_entry(uint32_t pack_int_id,
static struct pack_midx_entry *get_sorted_entries(struct multi_pack_index *m,
struct pack_info *info,
uint32_t nr_packs,
uint32_t *nr_objects)
uint32_t *nr_objects,
int preferred_pack)
{
uint32_t cur_fanout, cur_pack, cur_object;
uint32_t alloc_fanout, alloc_objects, total_objects = 0;
@ -589,12 +622,17 @@ static struct pack_midx_entry *get_sorted_entries(struct multi_pack_index *m,
nth_midxed_pack_midx_entry(m,
&entries_by_fanout[nr_fanout],
cur_object);
if (nth_midxed_pack_int_id(m, cur_object) == preferred_pack)
entries_by_fanout[nr_fanout].preferred = 1;
else
entries_by_fanout[nr_fanout].preferred = 0;
nr_fanout++;
}
}
for (cur_pack = start_pack; cur_pack < nr_packs; cur_pack++) {
uint32_t start = 0, end;
int preferred = cur_pack == preferred_pack;
if (cur_fanout)
start = get_pack_fanout(info[cur_pack].p, cur_fanout - 1);
@ -602,7 +640,11 @@ static struct pack_midx_entry *get_sorted_entries(struct multi_pack_index *m,
for (cur_object = start; cur_object < end; cur_object++) {
ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout);
fill_pack_entry(cur_pack, info[cur_pack].p, cur_object, &entries_by_fanout[nr_fanout]);
fill_pack_entry(cur_pack,
info[cur_pack].p,
cur_object,
&entries_by_fanout[nr_fanout],
preferred);
nr_fanout++;
}
}
@ -776,10 +818,80 @@ static int write_midx_large_offsets(struct hashfile *f,
return 0;
}
struct midx_pack_order_data {
uint32_t nr;
uint32_t pack;
off_t offset;
};
static int midx_pack_order_cmp(const void *va, const void *vb)
{
const struct midx_pack_order_data *a = va, *b = vb;
if (a->pack < b->pack)
return -1;
else if (a->pack > b->pack)
return 1;
else if (a->offset < b->offset)
return -1;
else if (a->offset > b->offset)
return 1;
else
return 0;
}
static uint32_t *midx_pack_order(struct write_midx_context *ctx)
{
struct midx_pack_order_data *data;
uint32_t *pack_order;
uint32_t i;
ALLOC_ARRAY(data, ctx->entries_nr);
for (i = 0; i < ctx->entries_nr; i++) {
struct pack_midx_entry *e = &ctx->entries[i];
data[i].nr = i;
data[i].pack = ctx->pack_perm[e->pack_int_id];
if (!e->preferred)
data[i].pack |= (1U << 31);
data[i].offset = e->offset;
}
QSORT(data, ctx->entries_nr, midx_pack_order_cmp);
ALLOC_ARRAY(pack_order, ctx->entries_nr);
for (i = 0; i < ctx->entries_nr; i++)
pack_order[i] = data[i].nr;
free(data);
return pack_order;
}
static void write_midx_reverse_index(char *midx_name, unsigned char *midx_hash,
struct write_midx_context *ctx)
{
struct strbuf buf = STRBUF_INIT;
const char *tmp_file;
strbuf_addf(&buf, "%s-%s.rev", midx_name, hash_to_hex(midx_hash));
tmp_file = write_rev_file_order(NULL, ctx->pack_order, ctx->entries_nr,
midx_hash, WRITE_REV);
if (finalize_object_file(tmp_file, buf.buf))
die(_("cannot store reverse index file"));
strbuf_release(&buf);
}
static void clear_midx_files_ext(struct repository *r, const char *ext,
unsigned char *keep_hash);
static int write_midx_internal(const char *object_dir, struct multi_pack_index *m,
struct string_list *packs_to_drop, unsigned flags)
struct string_list *packs_to_drop,
const char *preferred_pack_name,
unsigned flags)
{
char *midx_name;
unsigned char midx_hash[GIT_MAX_RAWSZ];
uint32_t i;
struct hashfile *f = NULL;
struct lock_file lk;
@ -828,7 +940,19 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
if (ctx.m && ctx.nr == ctx.m->num_packs && !packs_to_drop)
goto cleanup;
ctx.entries = get_sorted_entries(ctx.m, ctx.info, ctx.nr, &ctx.entries_nr);
ctx.preferred_pack_idx = -1;
if (preferred_pack_name) {
for (i = 0; i < ctx.nr; i++) {
if (!cmp_idx_or_pack_name(preferred_pack_name,
ctx.info[i].pack_name)) {
ctx.preferred_pack_idx = i;
break;
}
}
}
ctx.entries = get_sorted_entries(ctx.m, ctx.info, ctx.nr, &ctx.entries_nr,
ctx.preferred_pack_idx);
ctx.large_offsets_needed = 0;
for (i = 0; i < ctx.entries_nr; i++) {
@ -889,13 +1013,30 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
pack_name_concat_len += strlen(ctx.info[i].pack_name) + 1;
}
/* Check that the preferred pack wasn't expired (if given). */
if (preferred_pack_name) {
struct pack_info *preferred = bsearch(preferred_pack_name,
ctx.info, ctx.nr,
sizeof(*ctx.info),
idx_or_pack_name_cmp);
if (!preferred)
warning(_("unknown preferred pack: '%s'"),
preferred_pack_name);
else {
uint32_t perm = ctx.pack_perm[preferred->orig_pack_int_id];
if (perm == PACK_EXPIRED)
warning(_("preferred pack '%s' is expired"),
preferred_pack_name);
}
}
if (pack_name_concat_len % MIDX_CHUNK_ALIGNMENT)
pack_name_concat_len += MIDX_CHUNK_ALIGNMENT -
(pack_name_concat_len % MIDX_CHUNK_ALIGNMENT);
hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR);
f = hashfd(get_lock_file_fd(&lk), get_lock_file_path(&lk));
FREE_AND_NULL(midx_name);
if (ctx.m)
close_midx(ctx.m);
@ -927,8 +1068,16 @@ static int write_midx_internal(const char *object_dir, struct multi_pack_index *
write_midx_header(f, get_num_chunks(cf), ctx.nr - dropped_packs);
write_chunkfile(cf, &ctx);
finalize_hashfile(f, NULL, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
finalize_hashfile(f, midx_hash, CSUM_FSYNC | CSUM_HASH_IN_STREAM);
free_chunkfile(cf);
if (flags & MIDX_WRITE_REV_INDEX)
ctx.pack_order = midx_pack_order(&ctx);
if (flags & MIDX_WRITE_REV_INDEX)
write_midx_reverse_index(midx_name, midx_hash, &ctx);
clear_midx_files_ext(the_repository, ".rev", midx_hash);
commit_lock_file(&lk);
cleanup:
@ -943,13 +1092,55 @@ cleanup:
free(ctx.info);
free(ctx.entries);
free(ctx.pack_perm);
free(ctx.pack_order);
free(midx_name);
return result;
}
int write_midx_file(const char *object_dir, unsigned flags)
int write_midx_file(const char *object_dir,
const char *preferred_pack_name,
unsigned flags)
{
return write_midx_internal(object_dir, NULL, NULL, flags);
return write_midx_internal(object_dir, NULL, NULL, preferred_pack_name,
flags);
}
struct clear_midx_data {
char *keep;
const char *ext;
};
static void clear_midx_file_ext(const char *full_path, size_t full_path_len,
const char *file_name, void *_data)
{
struct clear_midx_data *data = _data;
if (!(starts_with(file_name, "multi-pack-index-") &&
ends_with(file_name, data->ext)))
return;
if (data->keep && !strcmp(data->keep, file_name))
return;
if (unlink(full_path))
die_errno(_("failed to remove %s"), full_path);
}
static void clear_midx_files_ext(struct repository *r, const char *ext,
unsigned char *keep_hash)
{
struct clear_midx_data data;
memset(&data, 0, sizeof(struct clear_midx_data));
if (keep_hash)
data.keep = xstrfmt("multi-pack-index-%s%s",
hash_to_hex(keep_hash), ext);
data.ext = ext;
for_each_file_in_pack_dir(r->objects->odb->path,
clear_midx_file_ext,
&data);
free(data.keep);
}
void clear_midx_file(struct repository *r)
@ -964,6 +1155,8 @@ void clear_midx_file(struct repository *r)
if (remove_path(midx))
die(_("failed to clear multi-pack-index at %s"), midx);
clear_midx_files_ext(r, ".rev", NULL);
free(midx);
}
@ -1184,7 +1377,7 @@ int expire_midx_packs(struct repository *r, const char *object_dir, unsigned fla
free(count);
if (packs_to_drop.nr)
result = write_midx_internal(object_dir, m, &packs_to_drop, flags);
result = write_midx_internal(object_dir, m, &packs_to_drop, NULL, flags);
string_list_clear(&packs_to_drop, 0);
return result;
@ -1373,7 +1566,7 @@ int midx_repack(struct repository *r, const char *object_dir, size_t batch_size,
goto cleanup;
}
result = write_midx_internal(object_dir, m, NULL, flags);
result = write_midx_internal(object_dir, m, NULL, NULL, flags);
m = NULL;
cleanup: