From 5305474ec4650755f2c0437c004cabfb1c60cf6a Mon Sep 17 00:00:00 2001 From: Victoria Dye Date: Mon, 9 Oct 2023 21:58:53 +0000 Subject: [PATCH 1/4] ref-cache.c: fix prefix matching in ref iteration Update 'cache_ref_iterator_advance' to skip over refs that are not matched by the given prefix. Currently, a ref entry is considered "matched" if the entry name is fully contained within the prefix: * prefix: "refs/heads/v1" * entry: "refs/heads/v1.0" OR if the prefix is fully contained in the entry name: * prefix: "refs/heads/v1.0" * entry: "refs/heads/v1" The first case is always correct, but the second is only correct if the ref cache entry is a directory, for example: * prefix: "refs/heads/example" * entry: "refs/heads/" Modify the logic in 'cache_ref_iterator_advance' to reflect these expectations: 1. If 'overlaps_prefix' returns 'PREFIX_EXCLUDES_DIR', then the prefix and ref cache entry do not overlap at all. Skip this entry. 2. If 'overlaps_prefix' returns 'PREFIX_WITHIN_DIR', then the prefix matches inside this entry if it is a directory. Skip if the entry is not a directory, otherwise iterate over it. 3. Otherwise, 'overlaps_prefix' returned 'PREFIX_CONTAINS_DIR', indicating that the cache entry (directory or not) is fully contained by or equal to the prefix. Iterate over this entry. Note that condition 2 relies on the names of directory entries having the appropriate trailing slash. The existing function documentation of 'create_dir_entry' explicitly calls out the trailing slash requirement, so this is a safe assumption to make. This bug generally doesn't have any user-facing impact, since it requires: 1. using a non-empty prefix without a trailing slash in an iteration like 'for_each_fullref_in', 2. the callback to said iteration not reapplying the original filter (as for-each-ref does) to ensure unmatched refs are skipped, and 3. the repository having one or more refs that match part of, but not all of, the prefix. However, there are some niche scenarios that meet those criteria (specifically, 'rev-parse --bisect' and '(log|show|shortlog) --bisect'). Add tests covering those cases to demonstrate the fix in this patch. Signed-off-by: Victoria Dye Signed-off-by: Junio C Hamano --- refs/ref-cache.c | 3 ++- t/t1500-rev-parse.sh | 23 +++++++++++++++++++++++ t/t4205-log-pretty-formats.sh | 30 ++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/refs/ref-cache.c b/refs/ref-cache.c index 2294c4564f..6e3b725245 100644 --- a/refs/ref-cache.c +++ b/refs/ref-cache.c @@ -412,7 +412,8 @@ static int cache_ref_iterator_advance(struct ref_iterator *ref_iterator) if (level->prefix_state == PREFIX_WITHIN_DIR) { entry_prefix_state = overlaps_prefix(entry->name, iter->prefix); - if (entry_prefix_state == PREFIX_EXCLUDES_DIR) + if (entry_prefix_state == PREFIX_EXCLUDES_DIR || + (entry_prefix_state == PREFIX_WITHIN_DIR && !(entry->flag & REF_DIR))) continue; } else { entry_prefix_state = level->prefix_state; diff --git a/t/t1500-rev-parse.sh b/t/t1500-rev-parse.sh index 37ee5091b5..3f9e7f62e4 100755 --- a/t/t1500-rev-parse.sh +++ b/t/t1500-rev-parse.sh @@ -264,4 +264,27 @@ test_expect_success 'rev-parse --since= unsqueezed ordering' ' test_cmp expect actual ' +test_expect_success 'rev-parse --bisect includes bad, excludes good' ' + test_commit_bulk 6 && + + git update-ref refs/bisect/bad-1 HEAD~1 && + git update-ref refs/bisect/b HEAD~2 && + git update-ref refs/bisect/bad-3 HEAD~3 && + git update-ref refs/bisect/good-3 HEAD~3 && + git update-ref refs/bisect/bad-4 HEAD~4 && + git update-ref refs/bisect/go HEAD~4 && + + # Note: refs/bisect/b and refs/bisect/go should be ignored because they + # do not match the refs/bisect/bad or refs/bisect/good prefixes. + cat >expect <<-EOF && + refs/bisect/bad-1 + refs/bisect/bad-3 + refs/bisect/bad-4 + ^refs/bisect/good-3 + EOF + + git rev-parse --symbolic-full-name --bisect >actual && + test_cmp expect actual +' + test_done diff --git a/t/t4205-log-pretty-formats.sh b/t/t4205-log-pretty-formats.sh index dd9035aa38..b431a22bbd 100755 --- a/t/t4205-log-pretty-formats.sh +++ b/t/t4205-log-pretty-formats.sh @@ -924,6 +924,36 @@ test_expect_success '%S in git log --format works with other placeholders (part test_cmp expect actual ' +test_expect_success 'setup more commits for %S with --bisect' ' + test_commit four && + test_commit five && + + head1=$(git rev-parse --verify HEAD~0) && + head2=$(git rev-parse --verify HEAD~1) && + head3=$(git rev-parse --verify HEAD~2) && + head4=$(git rev-parse --verify HEAD~3) +' + +test_expect_success '%S with --bisect labels commits with refs/bisect/bad ref' ' + git update-ref refs/bisect/bad-$head1 $head1 && + git update-ref refs/bisect/go $head1 && + git update-ref refs/bisect/bad-$head2 $head2 && + git update-ref refs/bisect/b $head3 && + git update-ref refs/bisect/bad-$head4 $head4 && + git update-ref refs/bisect/good-$head4 $head4 && + + # We expect to see the range of commits betwee refs/bisect/good-$head4 + # and refs/bisect/bad-$head1. The "source" ref is the nearest bisect ref + # from which the commit is reachable. + cat >expect <<-EOF && + $head1 refs/bisect/bad-$head1 + $head2 refs/bisect/bad-$head2 + $head3 refs/bisect/bad-$head2 + EOF + git log --bisect --format="%H %S" >actual && + test_cmp expect actual +' + test_expect_success 'log --pretty=reference' ' git log --pretty="tformat:%h (%s, %as)" >expect && git log --pretty=reference >actual && From 6dc10043338bbb29ffd7f8fc431f37b0fed08ae6 Mon Sep 17 00:00:00 2001 From: Victoria Dye Date: Mon, 9 Oct 2023 21:58:54 +0000 Subject: [PATCH 2/4] dir.[ch]: expose 'get_dtype' Move 'get_dtype()' from 'diagnose.c' to 'dir.c' and add its declaration to 'dir.h' so that it is accessible to callers in other files. The function and its documentation are moved verbatim except for a small addition to the description clarifying what the 'path' arg represents. Signed-off-by: Victoria Dye Signed-off-by: Junio C Hamano --- diagnose.c | 36 ------------------------------------ dir.c | 28 ++++++++++++++++++++++++++++ dir.h | 11 +++++++++++ 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/diagnose.c b/diagnose.c index 8430064000..fc4d344bd6 100644 --- a/diagnose.c +++ b/diagnose.c @@ -71,42 +71,6 @@ static int dir_file_stats(struct object_directory *object_dir, void *data) return 0; } -/* - * Get the d_type of a dirent. If the d_type is unknown, derive it from - * stat.st_mode. - * - * Note that 'path' is assumed to have a trailing slash. It is also modified - * in-place during the execution of the function, but is then reverted to its - * original value before returning. - */ -static unsigned char get_dtype(struct dirent *e, struct strbuf *path) -{ - struct stat st; - unsigned char dtype = DTYPE(e); - size_t base_path_len; - - if (dtype != DT_UNKNOWN) - return dtype; - - /* d_type unknown in dirent, try to fall back on lstat results */ - base_path_len = path->len; - strbuf_addstr(path, e->d_name); - if (lstat(path->buf, &st)) - goto cleanup; - - /* determine d_type from st_mode */ - if (S_ISREG(st.st_mode)) - dtype = DT_REG; - else if (S_ISDIR(st.st_mode)) - dtype = DT_DIR; - else if (S_ISLNK(st.st_mode)) - dtype = DT_LNK; - -cleanup: - strbuf_setlen(path, base_path_len); - return dtype; -} - static int count_files(struct strbuf *path) { DIR *dir = opendir(path->buf); diff --git a/dir.c b/dir.c index 8486e4d56f..5e01af3a25 100644 --- a/dir.c +++ b/dir.c @@ -2235,6 +2235,34 @@ static int get_index_dtype(struct index_state *istate, return DT_UNKNOWN; } +unsigned char get_dtype(struct dirent *e, struct strbuf *path) +{ + struct stat st; + unsigned char dtype = DTYPE(e); + size_t base_path_len; + + if (dtype != DT_UNKNOWN) + return dtype; + + /* d_type unknown in dirent, try to fall back on lstat results */ + base_path_len = path->len; + strbuf_addstr(path, e->d_name); + if (lstat(path->buf, &st)) + goto cleanup; + + /* determine d_type from st_mode */ + if (S_ISREG(st.st_mode)) + dtype = DT_REG; + else if (S_ISDIR(st.st_mode)) + dtype = DT_DIR; + else if (S_ISLNK(st.st_mode)) + dtype = DT_LNK; + +cleanup: + strbuf_setlen(path, base_path_len); + return dtype; +} + static int resolve_dtype(int dtype, struct index_state *istate, const char *path, int len) { diff --git a/dir.h b/dir.h index ad06682fd5..28c630ce80 100644 --- a/dir.h +++ b/dir.h @@ -363,6 +363,17 @@ struct dir_struct { struct dirent *readdir_skip_dot_and_dotdot(DIR *dirp); +/* + * Get the d_type of a dirent. If the d_type is unknown, derive it from + * stat.st_mode using the path to the dirent's containing directory (path) and + * the name of the dirent itself. + * + * Note that 'path' is assumed to have a trailing slash. It is also modified + * in-place during the execution of the function, but is then reverted to its + * original value before returning. + */ +unsigned char get_dtype(struct dirent *e, struct strbuf *path); + /*Count the number of slashes for string s*/ int count_slashes(const char *s); From aa79636fe70247a19648d73e52a7774536100567 Mon Sep 17 00:00:00 2001 From: Victoria Dye Date: Mon, 9 Oct 2023 21:58:55 +0000 Subject: [PATCH 3/4] dir.[ch]: add 'follow_symlink' arg to 'get_dtype' Add a 'follow_symlink' boolean option to 'get_type()'. If 'follow_symlink' is enabled, DT_LNK (in addition to DT_UNKNOWN) d_types triggers the stat-based d_type resolution, using 'stat' instead of 'lstat' to get the type of the followed symlink. Note that symlinks are not followed recursively, so a symlink pointing to another symlink will still resolve to DT_LNK. Update callers in 'diagnose.c' to specify 'follow_symlink = 0' to preserve current behavior. Signed-off-by: Victoria Dye Signed-off-by: Junio C Hamano --- diagnose.c | 6 +++--- dir.c | 13 +++++++++---- dir.h | 7 ++++++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/diagnose.c b/diagnose.c index fc4d344bd6..4d096c857f 100644 --- a/diagnose.c +++ b/diagnose.c @@ -81,7 +81,7 @@ static int count_files(struct strbuf *path) return 0; while ((e = readdir_skip_dot_and_dotdot(dir)) != NULL) - if (get_dtype(e, path) == DT_REG) + if (get_dtype(e, path, 0) == DT_REG) count++; closedir(dir); @@ -110,7 +110,7 @@ static void loose_objs_stats(struct strbuf *buf, const char *path) base_path_len = count_path.len; while ((e = readdir_skip_dot_and_dotdot(dir)) != NULL) - if (get_dtype(e, &count_path) == DT_DIR && + if (get_dtype(e, &count_path, 0) == DT_DIR && strlen(e->d_name) == 2 && !hex_to_bytes(&c, e->d_name, 1)) { strbuf_setlen(&count_path, base_path_len); @@ -155,7 +155,7 @@ static int add_directory_to_archiver(struct strvec *archiver_args, strbuf_add_absolute_path(&abspath, at_root ? "." : path); strbuf_addch(&abspath, '/'); - dtype = get_dtype(e, &abspath); + dtype = get_dtype(e, &abspath, 0); strbuf_setlen(&buf, len); strbuf_addstr(&buf, e->d_name); diff --git a/dir.c b/dir.c index 5e01af3a25..16fdb03f2a 100644 --- a/dir.c +++ b/dir.c @@ -2235,19 +2235,24 @@ static int get_index_dtype(struct index_state *istate, return DT_UNKNOWN; } -unsigned char get_dtype(struct dirent *e, struct strbuf *path) +unsigned char get_dtype(struct dirent *e, struct strbuf *path, + int follow_symlink) { struct stat st; unsigned char dtype = DTYPE(e); size_t base_path_len; - if (dtype != DT_UNKNOWN) + if (dtype != DT_UNKNOWN && !(follow_symlink && dtype == DT_LNK)) return dtype; - /* d_type unknown in dirent, try to fall back on lstat results */ + /* + * d_type unknown or unfollowed symlink, try to fall back on [l]stat + * results. If [l]stat fails, explicitly set DT_UNKNOWN. + */ base_path_len = path->len; strbuf_addstr(path, e->d_name); - if (lstat(path->buf, &st)) + if ((follow_symlink && stat(path->buf, &st)) || + (!follow_symlink && lstat(path->buf, &st))) goto cleanup; /* determine d_type from st_mode */ diff --git a/dir.h b/dir.h index 28c630ce80..98aa85fcc0 100644 --- a/dir.h +++ b/dir.h @@ -368,11 +368,16 @@ struct dirent *readdir_skip_dot_and_dotdot(DIR *dirp); * stat.st_mode using the path to the dirent's containing directory (path) and * the name of the dirent itself. * + * If 'follow_symlink' is 1, this function will attempt to follow DT_LNK types + * using 'stat'. Links are *not* followed recursively, so a symlink pointing + * to another symlink will still resolve to 'DT_LNK'. + * * Note that 'path' is assumed to have a trailing slash. It is also modified * in-place during the execution of the function, but is then reverted to its * original value before returning. */ -unsigned char get_dtype(struct dirent *e, struct strbuf *path); +unsigned char get_dtype(struct dirent *e, struct strbuf *path, + int follow_symlink); /*Count the number of slashes for string s*/ int count_slashes(const char *s); From 2cdb796101be2feb47a57760b3b0057ad71402bf Mon Sep 17 00:00:00 2001 From: Victoria Dye Date: Mon, 9 Oct 2023 21:58:56 +0000 Subject: [PATCH 4/4] files-backend.c: avoid stat in 'loose_fill_ref_dir' Modify the 'readdir' loop in 'loose_fill_ref_dir' to, rather than 'stat' a file to determine whether it is a directory or not, use 'get_dtype'. Currently, the loop uses 'stat' to determine whether each dirent is a directory itself or not in order to construct the appropriate ref cache entry. If 'stat' fails (returning a negative value), the dirent is silently skipped; otherwise, 'S_ISDIR(st.st_mode)' is used to check whether the entry is a directory. On platforms that include an entry's d_type in in the 'dirent' struct, this extra 'stat' check is redundant. We can use the 'get_dtype' method to extract this information on platforms that support it (i.e. where NO_D_TYPE_IN_DIRENT is unset), and derive it with 'stat' on platforms that don't. Because 'stat' is an expensive call, this confers a modest-but-noticeable performance improvement when iterating over large numbers of refs (approximately 20% speedup in 'git for-each-ref' in a 30k ref repo). Unlike other existing usage of 'get_dtype', the 'follow_symlinks' arg is set to 1 to replicate the existing handling of symlink dirents. This unfortunately requires calling 'stat' on the associated entry regardless of platform, but symlinks in the loose ref store are highly unlikely since they'd need to be created manually by a user. Note that this patch also changes the condition for skipping creation of a ref entry from "when 'stat' fails" to "when the d_type is anything other than DT_REG or DT_DIR". If a dirent's d_type is DT_UNKNOWN (either because the platform doesn't support d_type in dirents or some other reason) or DT_LNK, 'get_dtype' will try to derive the underlying type with 'stat'. If the 'stat' fails, the d_type will remain 'DT_UNKNOWN' and dirent will be skipped. However, it will also be skipped if it is any other valid d_type (e.g. DT_FIFO for named pipes, DT_LNK for a nested symlink). Git does not handle these properly anyway, so we can safely constrain accepted types to directories and regular files. Signed-off-by: Victoria Dye Signed-off-by: Junio C Hamano --- refs/files-backend.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/refs/files-backend.c b/refs/files-backend.c index 341354182b..db5c0c7a72 100644 --- a/refs/files-backend.c +++ b/refs/files-backend.c @@ -246,10 +246,8 @@ static void loose_fill_ref_dir(struct ref_store *ref_store, int dirnamelen = strlen(dirname); struct strbuf refname; struct strbuf path = STRBUF_INIT; - size_t path_baselen; files_ref_path(refs, &path, dirname); - path_baselen = path.len; d = opendir(path.buf); if (!d) { @@ -262,23 +260,22 @@ static void loose_fill_ref_dir(struct ref_store *ref_store, while ((de = readdir(d)) != NULL) { struct object_id oid; - struct stat st; int flag; + unsigned char dtype; if (de->d_name[0] == '.') continue; if (ends_with(de->d_name, ".lock")) continue; strbuf_addstr(&refname, de->d_name); - strbuf_addstr(&path, de->d_name); - if (stat(path.buf, &st) < 0) { - ; /* silently ignore */ - } else if (S_ISDIR(st.st_mode)) { + + dtype = get_dtype(de, &path, 1); + if (dtype == DT_DIR) { strbuf_addch(&refname, '/'); add_entry_to_dir(dir, create_dir_entry(dir->cache, refname.buf, refname.len)); - } else { + } else if (dtype == DT_REG) { if (!refs_resolve_ref_unsafe(&refs->base, refname.buf, RESOLVE_REF_READING, @@ -308,7 +305,6 @@ static void loose_fill_ref_dir(struct ref_store *ref_store, create_ref_entry(refname.buf, &oid, flag)); } strbuf_setlen(&refname, dirnamelen); - strbuf_setlen(&path, path_baselen); } strbuf_release(&refname); strbuf_release(&path);