It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse due to parallelism,
notice the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
94 lines
2.9 KiB
C
94 lines
2.9 KiB
C
#ifndef PACK_BITMAP_H
|
|
#define PACK_BITMAP_H
|
|
|
|
#include "ewah/ewok.h"
|
|
#include "khash.h"
|
|
#include "pack.h"
|
|
#include "pack-objects.h"
|
|
|
|
struct commit;
|
|
struct repository;
|
|
struct rev_info;
|
|
struct list_objects_filter_options;
|
|
|
|
/*
 * Magic bytes that size the `magic` field of struct bitmap_disk_header.
 * Brace-initialized as individual chars (no trailing NUL), so
 * ARRAY_SIZE(BITMAP_IDX_SIGNATURE) is exactly 4.
 */
static const char BITMAP_IDX_SIGNATURE[] = {'B', 'I', 'T', 'M'};
|
|
|
|
struct bitmap_disk_header {
|
|
char magic[ARRAY_SIZE(BITMAP_IDX_SIGNATURE)];
|
|
uint16_t version;
|
|
uint16_t options;
|
|
uint32_t entry_count;
|
|
unsigned char checksum[GIT_MAX_RAWSZ];
|
|
};
|
|
|
|
/*
 * Flag value with only bit 22 set.
 * NOTE(review): presumably an object-flag bit reserved for the bitmap code;
 * its users are not visible in this header -- confirm before reusing bit 22.
 */
#define NEEDS_BITMAP (1u<<22)
|
|
|
|
/*
 * Option bits for a bitmap index. The two values are disjoint single bits
 * (0x1 and 0x4), so they can be OR-ed together; 0x2 is not defined here.
 *
 * NOTE(review): presumably these are the bits stored in
 * bitmap_disk_header.options -- confirm against the reader/writer.
 */
enum pack_bitmap_opts {
	BITMAP_OPT_FULL_DAG = 1,
	BITMAP_OPT_HASH_CACHE = 4,
};
|
|
|
|
/*
 * Flag bits used by the bitmap code; currently only one bit (0x1) is defined.
 * NOTE(review): where BITMAP_FLAG_REUSE is set and tested is not visible in
 * this header -- confirm its meaning against the implementation.
 */
enum pack_bitmap_flags {
	BITMAP_FLAG_REUSE = 0x1
};
|
|
|
|
/*
 * Callback type accepted by traverse_bitmap_commit_list().
 *
 * The callback receives an object's id and type, an integer `flags` value,
 * a 32-bit `hash`, and a (pack, offset) pair locating the object.
 *
 * NOTE(review): the meaning of `flags`, `hash` and the int return value is
 * defined by the caller of the callback, which is not visible in this
 * header -- confirm before depending on them.
 */
typedef int (*show_reachable_fn)(
	const struct object_id *oid,
	enum object_type type,
	int flags,
	uint32_t hash,
	struct packed_git *found_pack,
	off_t found_offset);
|
|
|
|
struct bitmap_index;
|
|
|
|
struct bitmap_index *prepare_bitmap_git(struct repository *r);
|
|
void count_bitmap_commit_list(struct bitmap_index *, uint32_t *commits,
|
|
uint32_t *trees, uint32_t *blobs, uint32_t *tags);
|
|
void traverse_bitmap_commit_list(struct bitmap_index *,
|
|
struct rev_info *revs,
|
|
show_reachable_fn show_reachable);
|
|
void test_bitmap_walk(struct rev_info *revs);
|
|
struct bitmap_index *prepare_bitmap_walk(struct rev_info *revs,
|
|
struct list_objects_filter_options *filter);
|
|
int reuse_partial_packfile_from_bitmap(struct bitmap_index *,
|
|
struct packed_git **packfile,
|
|
uint32_t *entries,
|
|
struct bitmap **reuse_out);
|
|
int rebuild_existing_bitmaps(struct bitmap_index *, struct packing_data *mapping,
|
|
kh_oid_map_t *reused_bitmaps, int show_progress);
|
|
void free_bitmap_index(struct bitmap_index *);
|
|
int bitmap_walk_contains(struct bitmap_index *,
|
|
struct bitmap *bitmap, const struct object_id *oid);
|
|
|
|
/*
 * After a traversal has been performed by prepare_bitmap_walk(), this can be
 * queried to see if a particular object was reachable from any of the
 * objects flagged as UNINTERESTING.
 *
 * NOTE(review): the int result is presumably non-zero when `oid` was
 * reachable -- confirm against the implementation.
 */
int bitmap_has_oid_in_uninteresting(struct bitmap_index *bitmap_git,
				    const struct object_id *oid);
|
|
|
|
/*
 * Compute an on-disk size total for a bitmap traversal; added in support of
 * `git rev-list --disk-usage`.
 *
 * NOTE(review): the implementation is not visible in this header. Per the
 * introducing commit message, objects found in the bitmapped packfile are
 * sized by asking the revindex rather than via an oid_object_info() lookup
 * -- confirm against the implementation.
 */
off_t get_disk_usage_from_bitmap(struct bitmap_index *, struct rev_info *);
|
|
|
|
/*
 * Bitmap writing and rebuilding API.
 *
 * NOTE(review): the bitmap_writer_* functions take no writer handle, so they
 * presumably operate on state kept inside the implementation, which would
 * make call order significant -- confirm against the implementation.
 */

/* Enable (non-zero) or disable (zero) progress display for the writer. */
void bitmap_writer_show_progress(int show);

/* Provide the checksum bytes the writer should use. */
void bitmap_writer_set_checksum(unsigned char *sha1);

/* Build the writer's per-type index from `index_nr` entries of `index`. */
void bitmap_writer_build_type_index(struct packing_data *to_pack,
				    struct pack_idx_entry **index,
				    uint32_t index_nr);

/*
 * Create a position mapping between `bitmap_git` and `mapping`.
 * NOTE(review): the returned array is presumably heap-allocated and owned by
 * the caller -- confirm.
 */
uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git,
				struct packing_data *mapping);

/* Translate `source` through the `reposition` table into `dest`. */
int rebuild_bitmap(const uint32_t *reposition,
		   struct ewah_bitmap *source,
		   struct bitmap *dest);

/* Look up the stored bitmap associated with `commit`, if any. */
struct ewah_bitmap *bitmap_for_commit(struct bitmap_index *bitmap_git,
				      struct commit *commit);

/* Choose which of the `indexed_commits_nr` commits receive bitmaps. */
void bitmap_writer_select_commits(struct commit **indexed_commits,
				  unsigned int indexed_commits_nr, int max_bitmaps);

/* Build bitmaps for the objects in `to_pack`. */
void bitmap_writer_build(struct packing_data *to_pack);

/*
 * Finish writing the bitmap index to `filename`.
 * `options` is presumably a mask of enum pack_bitmap_opts values -- confirm.
 */
void bitmap_writer_finish(struct pack_idx_entry **index,
			  uint32_t index_nr,
			  const char *filename,
			  uint16_t options);
|
|
|
|
#endif
|