
In an earlier commit, a bug was described where it's possible for Git to produce non-murmur3 hashes when the platform's "char" type is signed, and there are paths with characters whose highest bit is set (i.e. all characters >= 0x80). That patch allows the caller to control which version of Bloom filters are read and written. However, even on platforms with a signed "char" type, it is possible to reuse existing Bloom filters if and only if there are no changed paths in any commit's first parent tree-diff whose characters have their highest bit set. When this is the case, we can reuse the existing filter without having to compute a new one. This is done by marking trees which are known to have (or not have) any such paths. When a commit's root tree is verified to not have any such paths, we mark it as such and declare that the commit's Bloom filter is reusable. Note that this heuristic only goes in one direction. If neither a commit nor its first parent have any paths in their trees with non-ASCII characters, then we know for certain that a path with non-ASCII characters will not appear in a tree-diff against that commit's first parent. The reverse isn't necessarily true: just because the tree-diff doesn't contain any such paths does not imply that no such paths exist in either tree. So we end up recomputing some Bloom filters that we don't strictly have to (i.e. their bits are the same no matter which version of murmur3 we use). But culling these out is impossible, since we'd have to perform the full tree-diff, which is the same effort as computing the Bloom filter from scratch. But because we can cache our results in each tree's flag bits, we can often avoid recomputing many filters, thereby reducing the time it takes to run $ git commit-graph write --changed-paths --reachable when upgrading from v1 to v2 Bloom filters. 
To benchmark this, let's generate a commit-graph in linux.git with v1 changed-paths in generation order[^1]: $ git clone git@github.com:torvalds/linux.git $ cd linux $ git commit-graph write --reachable --changed-paths $ graph=".git/objects/info/commit-graph" $ mv $graph{,.bak} Then let's time how long it takes to go from v1 to v2 filters (with and without the upgrade path enabled), resetting the state of the commit-graph each time: $ git config commitGraph.changedPathsVersion 2 $ hyperfine -p 'cp -f $graph.bak $graph' -L v 0,1 \ 'GIT_TEST_UPGRADE_BLOOM_FILTERS={v} git.compile commit-graph write --reachable --changed-paths' On linux.git (where there aren't any non-ASCII paths), the timings indicate that this patch represents a speed-up over recomputing all Bloom filters from scratch: Benchmark 1: GIT_TEST_UPGRADE_BLOOM_FILTERS=0 git.compile commit-graph write --reachable --changed-paths Time (mean ± σ): 124.873 s ± 0.316 s [User: 124.081 s, System: 0.643 s] Range (min … max): 124.621 s … 125.227 s 3 runs Benchmark 2: GIT_TEST_UPGRADE_BLOOM_FILTERS=1 git.compile commit-graph write --reachable --changed-paths Time (mean ± σ): 79.271 s ± 0.163 s [User: 74.611 s, System: 4.521 s] Range (min … max): 79.112 s … 79.437 s 3 runs Summary 'GIT_TEST_UPGRADE_BLOOM_FILTERS=1 git.compile commit-graph write --reachable --changed-paths' ran 1.58 ± 0.01 times faster than 'GIT_TEST_UPGRADE_BLOOM_FILTERS=0 git.compile commit-graph write --reachable --changed-paths' On git.git, we do have some non-ASCII paths, giving us a more modest improvement from 4.163 seconds to 3.348 seconds, for a 1.24x speed-up. On my machine, the stats for git.git are: - 8,285 Bloom filters computed from scratch - 10 Bloom filters generated as empty - 4 Bloom filters generated as truncated due to too many changed paths - 65,114 Bloom filters were reused when transitioning from v1 to v2. 
[^1]: Note that this is important, since `--stdin-packs` or `--stdin-commits` orders commits in the commit-graph by their pack position (with `--stdin-packs`) or in the raw input (with `--stdin-commits`). Since we compute Bloom filters in the same order that commits appear in the graph, we must see a commit's (first) parent before we process the commit itself. This is only guaranteed to happen when sorting commits by their generation number. Signed-off-by: Taylor Blau <me@ttaylorr.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
291 lines
8.9 KiB
C
291 lines
8.9 KiB
C
#ifndef OBJECT_H
#define OBJECT_H

#include "hash-ll.h"

/* Only used through pointers in this header, so forward declarations suffice. */
struct buffer_slab;
struct repository;
|
|
|
|
/*
 * Bookkeeping for in-core objects: the object hash table, the allocator
 * states, graft/shallow information and the buffer slab.
 */
struct parsed_object_pool {
	struct object **obj_hash;
	int nr_objs, obj_hash_size;

	/* TODO: migrate alloc_states to mem-pool? */
	struct alloc_state *blob_state;
	struct alloc_state *tree_state;
	struct alloc_state *commit_state;
	struct alloc_state *tag_state;
	struct alloc_state *object_state;

	/* parent substitutions from .git/info/grafts and .git/shallow */
	struct commit_graft **grafts;
	int grafts_alloc, grafts_nr;

	int is_shallow;
	struct stat_validity *shallow_stat;
	char *alternate_shallow_file;

	int commit_graft_prepared;
	int substituted_parent;

	struct buffer_slab *buffer_slab;
};

struct parsed_object_pool *parsed_object_pool_new(void);
void parsed_object_pool_clear(struct parsed_object_pool *o);
|
|
|
|
/* Singly linked list node holding a pointer to an object. */
struct object_list {
	struct object *item;
	struct object_list *next;
};
|
|
|
|
/*
 * Array of object entries; "nr" and "alloc" are the counters that go
 * with the "objects" storage.
 */
struct object_array {
	unsigned int nr;
	unsigned int alloc;
	struct object_array_entry {
		struct object *item;
		/*
		 * name or NULL.  If non-NULL, the memory pointed to
		 * is owned by this object *except* if it points at
		 * object_array_slopbuf, which is a static copy of the
		 * empty string.
		 */
		char *name;
		char *path;
		unsigned mode;
	} *objects;
};

/* Static initializer: every member starts out zeroed. */
#define OBJECT_ARRAY_INIT { 0 }

void object_array_init(struct object_array *array);
|
|
|
|
/*
 * object flag allocation:
 * revision.h: 0---------10 15 23------27
 * fetch-pack.c: 01 67
 * negotiator/default.c: 2--5
 * walker.c: 0-2
 * upload-pack.c: 4 11-----14 16-----19
 * builtin/blame.c: 12-13
 * bisect.c: 16
 * bundle.c: 16
 * http-push.c: 11-----14
 * commit-graph.c: 15
 * commit-reach.c: 16-----19
 * sha1-name.c: 20
 * list-objects-filter.c: 21
 * bloom.c: 2122
 * builtin/fsck.c: 0--3
 * builtin/gc.c: 0
 * builtin/index-pack.c: 2021
 * reflog.c: 10--12
 * builtin/show-branch.c: 0-------------------------------------------26
 * builtin/unpack-objects.c: 2021
 */

/* Width of the "flags" bitfield in struct object. */
#define FLAG_BITS 28

/* Width of the "type" bitfield in struct object. */
#define TYPE_BITS 3
|
|
|
|
/*
 * Values in this enum (except those outside the 3 bit range) are part
 * of pack file format. See gitformat-pack(5) for more information.
 *
 * OBJ_BAD (-1), OBJ_ANY (8) and OBJ_MAX (9) are the values that do not
 * fit in 3 bits.
 */
enum object_type {
	OBJ_BAD = -1,
	OBJ_NONE = 0,
	OBJ_COMMIT = 1,
	OBJ_TREE = 2,
	OBJ_BLOB = 3,
	OBJ_TAG = 4,
	/* 5 for future expansion */
	OBJ_OFS_DELTA = 6,
	OBJ_REF_DELTA = 7,
	OBJ_ANY,
	OBJ_MAX
};
|
|
|
|
/* unknown mode (impossible combination S_IFIFO|S_IFCHR) */
#define S_IFINVALID 0030000

/*
 * A "directory link" is a link to another git directory.
 *
 * The value 0160000 is not normally a valid mode, and
 * also just happens to be S_IFDIR + S_IFLNK
 */
#define S_IFGITLINK 0160000
#define S_ISGITLINK(m) (((m) & S_IFMT) == S_IFGITLINK)

/* True only when the mode is exactly S_IFDIR, i.e. no other bits are set. */
#define S_ISSPARSEDIR(m) ((m) == S_IFDIR)
|
|
|
|
static inline enum object_type object_type(unsigned int mode)
|
|
{
|
|
return S_ISDIR(mode) ? OBJ_TREE :
|
|
S_ISGITLINK(mode) ? OBJ_COMMIT :
|
|
OBJ_BLOB;
|
|
}
|
|
|
|
#define ce_permissions(mode) (((mode) & 0100) ? 0755 : 0644)
|
|
static inline unsigned int create_ce_mode(unsigned int mode)
|
|
{
|
|
if (S_ISLNK(mode))
|
|
return S_IFLNK;
|
|
if (S_ISSPARSEDIR(mode))
|
|
return S_IFDIR;
|
|
if (S_ISDIR(mode) || S_ISGITLINK(mode))
|
|
return S_IFGITLINK;
|
|
return S_IFREG | ce_permissions(mode);
|
|
}
|
|
|
|
static inline unsigned int canon_mode(unsigned int mode)
|
|
{
|
|
if (S_ISREG(mode))
|
|
return S_IFREG | ce_permissions(mode);
|
|
if (S_ISLNK(mode))
|
|
return S_IFLNK;
|
|
if (S_ISDIR(mode))
|
|
return S_IFDIR;
|
|
return S_IFGITLINK;
|
|
}
|
|
|
|
/*
|
|
* The object type is stored in 3 bits.
|
|
*/
|
|
struct object {
|
|
unsigned parsed : 1;
|
|
unsigned type : TYPE_BITS;
|
|
unsigned flags : FLAG_BITS;
|
|
struct object_id oid;
|
|
};
|
|
|
|
const char *type_name(unsigned int type);
int type_from_string_gently(const char *str, ssize_t len, int gentle);

/* Shorthand that passes -1 as the second argument and 0 for "gentle". */
#define type_from_string(str) type_from_string_gently(str, -1, 0)
|
|
|
|
/*
 * Return the current number of buckets in the object hashmap.
 */
unsigned int get_max_object_index(void);

/*
 * Return the object from the specified bucket in the object hashmap.
 */
struct object *get_indexed_object(unsigned int idx);
|
|
|
|
/*
 * This can be used to see if we have heard of the object before, but
 * it can return "yes we have, and here is a half-initialised object"
 * for an object that we haven't loaded/parsed yet.
 *
 * When parsing a commit to create an in-core commit object, its
 * parents list holds commit objects that represent its parents, but
 * they are expected to be lazily initialized and do not know what
 * their trees or parents are yet.  When this function returns such
 * half-initialised objects, the caller is expected to initialize them
 * by calling parse_object() on them.
 */
struct object *lookup_object(struct repository *r, const struct object_id *oid);

void *create_object(struct repository *r, const struct object_id *oid, void *obj);

void *object_as_type(struct object *obj, enum object_type type, int quiet);
|
|
|
|
/* Flags accepted by parse_object_with_flags(). */
enum parse_object_flags {
	PARSE_OBJECT_SKIP_HASH_CHECK = 1 << 0,
};

/*
 * Returns the object, having parsed it to find out what it is.
 *
 * Returns NULL if the object is missing or corrupt.
 */
struct object *parse_object(struct repository *r, const struct object_id *oid);
struct object *parse_object_with_flags(struct repository *r,
				       const struct object_id *oid,
				       enum parse_object_flags flags);
|
|
|
|
/*
 * Like parse_object, but will die() instead of returning NULL. If the
 * "name" parameter is not NULL, it is included in the error message
 * (otherwise, the hex object ID is given).
 */
struct object *parse_object_or_die(const struct object_id *oid, const char *name);

/*
 * Given the result of read_sha1_file(), returns the object after
 * parsing it.  eaten_p indicates if the object has a borrowed copy
 * of buffer and the caller should not free() it.
 */
struct object *parse_object_buffer(struct repository *r,
				   const struct object_id *oid,
				   enum object_type type,
				   unsigned long size,
				   void *buffer,
				   int *eaten_p);
|
|
|
|
/*
 * Allocate and return an object struct, even if you do not know the type of
 * the object. The returned object may have its "type" field set to a real type
 * (if somebody previously called lookup_blob(), etc), or it may be set to
 * OBJ_NONE. In the latter case, subsequent calls to lookup_blob(), etc, will
 * set the type field as appropriate.
 *
 * Use this when you do not know the expected type of an object and want to
 * avoid parsing it for efficiency reasons. Try to avoid it otherwise; it
 * may allocate excess memory, since the returned object must be as large as
 * the maximum struct of any type.
 */
struct object *lookup_unknown_object(struct repository *r, const struct object_id *oid);

/*
 * Dispatch to the appropriate lookup_blob(), lookup_commit(), etc, based on
 * "type".
 */
struct object *lookup_object_by_type(struct repository *r,
				     const struct object_id *oid,
				     enum object_type type);
|
|
|
|
/* Object list handling. */
struct object_list *object_list_insert(struct object *item,
				       struct object_list **list_p);

int object_list_contains(struct object_list *list, struct object *obj);

void object_list_free(struct object_list **list);
|
|
|
|
/* Object array handling .. */
void add_object_array(struct object *obj, const char *name,
		      struct object_array *array);
void add_object_array_with_path(struct object *obj, const char *name,
				struct object_array *array,
				unsigned mode, const char *path);

/*
 * Returns NULL if the array is empty. Otherwise, returns the last object
 * after removing its entry from the array. Other resources associated
 * with that object are left in an unspecified state and should not be
 * examined.
 */
struct object *object_array_pop(struct object_array *array);
|
|
|
|
/* Callback type taken by object_array_filter(). */
typedef int (*object_array_each_func_t)(struct object_array_entry *, void *);

/*
 * Apply want to each entry in array, retaining only the entries for
 * which the function returns true.  Preserve the order of the entries
 * that are retained.
 */
void object_array_filter(struct object_array *array,
			 object_array_each_func_t want, void *cb_data);

/*
 * Remove from array all but the first entry with a given name.
 * Warning: this function uses an O(N^2) algorithm.
 */
void object_array_remove_duplicates(struct object_array *array);

/*
 * Remove any objects from the array, freeing all used memory; afterwards
 * the array is ready to store more objects with add_object_array().
 */
void object_array_clear(struct object_array *array);
|
|
|
|
void clear_object_flags(unsigned flags);

/*
 * Clear the specified object flags from all in-core commit objects from
 * the specified repository.
 */
void repo_clear_commit_marks(struct repository *r, unsigned int flags);
|
|
|
|
#endif /* OBJECT_H */
|