rev-list: add --disk-usage option for calculating disk usage
It can sometimes be useful to see which refs are contributing to the
overall repository size (e.g., does some branch have a bunch of objects
not found elsewhere in history, which indicates that deleting it would
shrink the size of a clone).
You can find that out by generating a list of objects, getting their
sizes from cat-file, and then summing them, like:
git rev-list --objects --no-object-names main..branch |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
Though note that the caveats from git-cat-file(1) apply here. We "blame"
base objects more than their deltas, even though the relationship could
easily be flipped. Still, it can be a useful rough measure.
But one problem is that it's slow to run. Teaching rev-list to sum up
the sizes can be much faster for two reasons:
1. It skips all of the piping of object names and sizes.
2. If bitmaps are in use, for objects that are in the
bitmapped packfile we can skip the oid_object_info()
lookup entirely, and just ask the revindex for the
on-disk size.
This patch implements a --disk-usage option which produces the same
answer in a fraction of the time. Here are some timings using a clone of
torvalds/linux:
[rev-list piped to cat-file, no bitmaps]
$ time git rev-list --objects --no-object-names --all |
git cat-file --buffer --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m29.635s
user 0m38.003s
sys 0m1.093s
[internal, no bitmaps]
$ time git rev-list --disk-usage --objects --all
1459938510
real 0m31.262s
user 0m30.885s
sys 0m0.376s
Even though the wall-clock time is slightly worse (the piped version
spreads its work across several processes that run in parallel), notice
the CPU savings between the two. We saved 21% of the CPU just by
avoiding the pipes.
But the real win is with bitmaps. If we use them without the new option:
[rev-list piped to cat-file, bitmaps]
$ time git rev-list --objects --no-object-names --all --use-bitmap-index |
git cat-file --batch-check='%(objectsize:disk)' |
perl -lne '$total += $_; END { print $total }'
1459938510
real 0m6.244s
user 0m8.452s
sys 0m0.311s
then we're faster to generate the list of objects, but we still spend a
lot of time piping and looking things up. But if we do both together:
[internal, bitmaps]
$ time git rev-list --disk-usage --objects --all --use-bitmap-index
1459938510
real 0m0.219s
user 0m0.169s
sys 0m0.049s
then we get the same answer much faster.
For "--all", that answer will correspond closely to "du objects/pack",
of course. But we're actually checking reachability here, so we're still
fast when we ask for more interesting things:
$ time git rev-list --disk-usage --use-bitmap-index v5.0..v5.10
374798628
real 0m0.429s
user 0m0.356s
sys 0m0.072s
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
committed by
Junio C Hamano
parent
3803a3a099
commit
16950f8384
@ -80,6 +80,19 @@ static int arg_show_object_names = 1;
|
||||
|
||||
#define DEFAULT_OIDSET_SIZE (16*1024)
|
||||
|
||||
/* Set to 1 by cmd_rev_list() when "--disk-usage" is seen on the command line. */
static int show_disk_usage;
|
||||
/*
 * Sum of get_object_disk_usage() results, added to in show_commit() and
 * show_object(); printed at the end of cmd_rev_list() when
 * show_disk_usage is set.
 */
static off_t total_disk_usage;
|
||||
|
||||
static off_t get_object_disk_usage(struct object *obj)
|
||||
{
|
||||
off_t size;
|
||||
struct object_info oi = OBJECT_INFO_INIT;
|
||||
oi.disk_sizep = &size;
|
||||
if (oid_object_info_extended(the_repository, &obj->oid, &oi, 0) < 0)
|
||||
die(_("unable to get disk usage of %s"), oid_to_hex(&obj->oid));
|
||||
return size;
|
||||
}
|
||||
|
||||
static void finish_commit(struct commit *commit);
|
||||
static void show_commit(struct commit *commit, void *data)
|
||||
{
|
||||
@ -88,6 +101,9 @@ static void show_commit(struct commit *commit, void *data)
|
||||
|
||||
display_progress(progress, ++progress_counter);
|
||||
|
||||
if (show_disk_usage)
|
||||
total_disk_usage += get_object_disk_usage(&commit->object);
|
||||
|
||||
if (info->flags & REV_LIST_QUIET) {
|
||||
finish_commit(commit);
|
||||
return;
|
||||
@ -258,6 +274,8 @@ static void show_object(struct object *obj, const char *name, void *cb_data)
|
||||
if (finish_object(obj, name, cb_data))
|
||||
return;
|
||||
display_progress(progress, ++progress_counter);
|
||||
if (show_disk_usage)
|
||||
total_disk_usage += get_object_disk_usage(obj);
|
||||
if (info->flags & REV_LIST_QUIET)
|
||||
return;
|
||||
|
||||
@ -452,6 +470,23 @@ static int try_bitmap_traversal(struct rev_info *revs,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int try_bitmap_disk_usage(struct rev_info *revs,
|
||||
struct list_objects_filter_options *filter)
|
||||
{
|
||||
struct bitmap_index *bitmap_git;
|
||||
|
||||
if (!show_disk_usage)
|
||||
return -1;
|
||||
|
||||
bitmap_git = prepare_bitmap_walk(revs, filter);
|
||||
if (!bitmap_git)
|
||||
return -1;
|
||||
|
||||
printf("%"PRIuMAX"\n",
|
||||
(uintmax_t)get_disk_usage_from_bitmap(bitmap_git, revs));
|
||||
return 0;
|
||||
}
|
||||
|
||||
int cmd_rev_list(int argc, const char **argv, const char *prefix)
|
||||
{
|
||||
struct rev_info revs;
|
||||
@ -584,6 +619,12 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!strcmp(arg, "--disk-usage")) {
|
||||
show_disk_usage = 1;
|
||||
info.flags |= REV_LIST_QUIET;
|
||||
continue;
|
||||
}
|
||||
|
||||
usage(rev_list_usage);
|
||||
|
||||
}
|
||||
@ -626,6 +667,8 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
|
||||
if (use_bitmap_index) {
|
||||
if (!try_bitmap_count(&revs, &filter_options))
|
||||
return 0;
|
||||
if (!try_bitmap_disk_usage(&revs, &filter_options))
|
||||
return 0;
|
||||
if (!try_bitmap_traversal(&revs, &filter_options))
|
||||
return 0;
|
||||
}
|
||||
@ -690,5 +733,8 @@ int cmd_rev_list(int argc, const char **argv, const char *prefix)
|
||||
printf("%d\n", revs.count_left + revs.count_right);
|
||||
}
|
||||
|
||||
if (show_disk_usage)
|
||||
printf("%"PRIuMAX"\n", (uintmax_t)total_disk_usage);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user