Merge branch 'ds/path-walk-1'

Introduce a new API to visit objects in batches based on a common path, or by type. * ds/path-walk-1: path-walk: drop redundant parse_tree() call path-walk: reorder object visits path-walk: mark trees and blobs as UNINTERESTING path-walk: visit tags and cached objects path-walk: allow consumer to specify object types t6601: add helper for testing path-walk API test-lib-functions: add test_cmp_sorted path-walk: introduce an object walk by path
2025-01-29 14:05:08 -08:00
parent da898a5c64 b224e8e36c
commit f046ab2dd4
12 changed files with 1220 additions and 0 deletions
--- a/Documentation/technical/api-path-walk.txt
+++ b/Documentation/technical/api-path-walk.txt
@ -0,0 +1,63 @@
+Path-Walk API
+=============
+
+The path-walk API is used to walk reachable objects, but to visit objects
+in batches based on a common path they appear in, or by type.
+
+For example, all reachable commits are visited in a group. All tags are
+visited in a group. Then, all root trees are visited. At some point, all
+blobs reachable via a path `my/dir/to/A` are visited. When there are
+multiple paths possible to reach the same object, then only one of those
+paths is used to visit the object.
+
+Basics
+------
+
+To use the path-walk API, include `path-walk.h` and call
+`walk_objects_by_path()` with a customized `path_walk_info` struct. The
+struct is used to set all of the options for how the walk should proceed.
+Let's dig into the different options and their use.
+
+`path_fn` and `path_fn_data`::
+	The most important option is the `path_fn` option, which is a
+	function pointer to the callback that can execute logic on the
+	object IDs for objects grouped by type and path. This function
+	also receives a `data` value that corresponds to the
+	`path_fn_data` member, for providing custom data structures to
+	this callback function.
+
+`revs`::
+	To configure the exact details of the reachable set of objects,
+	use the `revs` member and initialize it using the revision
+	machinery in `revision.h`. Initialize `revs` using calls such as
+	`setup_revisions()` or `parse_revision_opt()`. Do not call
+	`prepare_revision_walk()`, as that will be called within
+	`walk_objects_by_path()`.
+
+It is also important that you do not specify the `--objects` flag for the
+`revs` struct. The revision walk should only be used to walk commits, and
+the objects will be walked in a separate way based on those starting
+commits.
+
+`commits`, `blobs`, `trees`, `tags`::
+	By default, these members are enabled and signal that the path-walk
+	API should call the `path_fn` on objects of these types. Specialized
+	applications could disable some options to make it simpler to walk
+	the objects or to have fewer calls to `path_fn`.
+
+While it is possible to walk only commits in this way, consumers would be
+better off using the revision walk API instead.
+
+`prune_all_uninteresting`::
+	By default, all reachable paths are emitted by the path-walk API.
+	This option allows consumers to declare that they are not
+	interested in paths where all included objects are marked with the
+	`UNINTERESTING` flag. This requires using the `boundary` option in
+	the revision walk so that the walk emits commits marked with the
+	`UNINTERESTING` flag.
+
+Examples
+--------
+
+See example usages in:
+	`t/helper/test-path-walk.c`
--- a/2
+++ b/2
@ -818,6 +818,7 @@ TEST_BUILTINS_OBJS += test-parse-options.o
 TEST_BUILTINS_OBJS += test-parse-pathspec-file.o
 TEST_BUILTINS_OBJS += test-partial-clone.o
 TEST_BUILTINS_OBJS += test-path-utils.o
+TEST_BUILTINS_OBJS += test-path-walk.o
 TEST_BUILTINS_OBJS += test-pcre2-config.o
 TEST_BUILTINS_OBJS += test-pkt-line.o
 TEST_BUILTINS_OBJS += test-proc-receive.o
@ -1094,6 +1095,7 @@ LIB_OBJS += parse-options.o
 LIB_OBJS += patch-delta.o
 LIB_OBJS += patch-ids.o
 LIB_OBJS += path.o
+LIB_OBJS += path-walk.o
 LIB_OBJS += pathspec.o
 LIB_OBJS += pkt-line.o
 LIB_OBJS += preload-index.o
--- a/meson.build
+++ b/meson.build
@ -358,6 +358,7 @@ libgit_sources = [
  'patch-delta.c',
  'patch-ids.c',
  'path.c',
+  'path-walk.c',
  'pathspec.c',
  'pkt-line.c',
  'preload-index.c',
--- a/path-walk.c
+++ b/path-walk.c
@ -0,0 +1,591 @@
+/*
+ * path-walk.c: implementation for path-based walks of the object graph.
+ */
+#include "git-compat-util.h"
+#include "path-walk.h"
+#include "blob.h"
+#include "commit.h"
+#include "dir.h"
+#include "hashmap.h"
+#include "hex.h"
+#include "list-objects.h"
+#include "object.h"
+#include "oid-array.h"
+#include "prio-queue.h"
+#include "revision.h"
+#include "string-list.h"
+#include "strmap.h"
+#include "tag.h"
+#include "trace2.h"
+#include "tree.h"
+#include "tree-walk.h"
+
+static const char *root_path = "";
+
+struct type_and_oid_list {
+	enum object_type type;
+	struct oid_array oids;
+	int maybe_interesting;
+};
+
+#define TYPE_AND_OID_LIST_INIT { \
+	.type = OBJ_NONE, 	 \
+	.oids = OID_ARRAY_INIT	 \
+}
+
+struct path_walk_context {
+	/**
+	 * Repeats of data in 'struct path_walk_info' for
+	 * access with fewer characters.
+	 */
+	struct repository *repo;
+	struct rev_info *revs;
+	struct path_walk_info *info;
+
+	/**
+	 * Map a path to a 'struct type_and_oid_list'
+	 * containing the objects discovered at that
+	 * path.
+	 */
+	struct strmap paths_to_lists;
+
+	/**
+	 * Store the current list of paths in a priority queue,
+	 * using object type as a sorting mechanism, mostly to
+	 * make sure blobs are popped off the stack first. No
+	 * other sort is made, so within each object type it acts
+	 * like a stack and performs a DFS within the trees.
+	 *
+	 * Use path_stack_pushed to indicate whether a path
+	 * was previously added to path_stack.
+	 */
+	struct prio_queue path_stack;
+	struct strset path_stack_pushed;
+};
+
+static int compare_by_type(const void *one, const void *two, void *cb_data)
+{
+	struct type_and_oid_list *list1, *list2;
+	const char *str1 = one;
+	const char *str2 = two;
+	struct path_walk_context *ctx = cb_data;
+
+	list1 = strmap_get(&ctx->paths_to_lists, str1);
+	list2 = strmap_get(&ctx->paths_to_lists, str2);
+
+	/*
+	 * If object types are equal, then use path comparison.
+	 */
+	if (!list1 || !list2 || list1->type == list2->type)
+		return strcmp(str1, str2);
+
+	/* Prefer tags to be popped off first. */
+	if (list1->type == OBJ_TAG)
+		return -1;
+	if (list2->type == OBJ_TAG)
+		return 1;
+
+	/* Prefer blobs to be popped off second. */
+	if (list1->type == OBJ_BLOB)
+		return -1;
+	if (list2->type == OBJ_BLOB)
+		return 1;
+
+	return 0;
+}
+
+static void push_to_stack(struct path_walk_context *ctx,
+			  const char *path)
+{
+	if (strset_contains(&ctx->path_stack_pushed, path))
+		return;
+
+	strset_add(&ctx->path_stack_pushed, path);
+	prio_queue_put(&ctx->path_stack, xstrdup(path));
+}
+
+static int add_tree_entries(struct path_walk_context *ctx,
+			    const char *base_path,
+			    struct object_id *oid)
+{
+	struct tree_desc desc;
+	struct name_entry entry;
+	struct strbuf path = STRBUF_INIT;
+	size_t base_len;
+	struct tree *tree = lookup_tree(ctx->repo, oid);
+
+	if (!tree) {
+		error(_("failed to walk children of tree %s: not found"),
+		      oid_to_hex(oid));
+		return -1;
+	} else if (parse_tree_gently(tree, 1)) {
+		error("bad tree object %s", oid_to_hex(oid));
+		return -1;
+	}
+
+	strbuf_addstr(&path, base_path);
+	base_len = path.len;
+
+	init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size);
+	while (tree_entry(&desc, &entry)) {
+		struct type_and_oid_list *list;
+		struct object *o;
+		/* Not actually true, but we will ignore submodules later. */
+		enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB;
+
+		/* Skip submodules. */
+		if (S_ISGITLINK(entry.mode))
+			continue;
+
+		/* If the caller doesn't want blobs, then don't bother. */
+		if (!ctx->info->blobs && type == OBJ_BLOB)
+			continue;
+
+		if (type == OBJ_TREE) {
+			struct tree *child = lookup_tree(ctx->repo, &entry.oid);
+			o = child ? &child->object : NULL;
+		} else if (type == OBJ_BLOB) {
+			struct blob *child = lookup_blob(ctx->repo, &entry.oid);
+			o = child ? &child->object : NULL;
+		} else {
+			BUG("invalid type for tree entry: %d", type);
+		}
+
+		if (!o) {
+			error(_("failed to find object %s"),
+			      oid_to_hex(&o->oid));
+			return -1;
+		}
+
+		/* Skip this object if already seen. */
+		if (o->flags & SEEN)
+			continue;
+		o->flags |= SEEN;
+
+		strbuf_setlen(&path, base_len);
+		strbuf_add(&path, entry.path, entry.pathlen);
+
+		/*
+		 * Trees will end with "/" for concatenation and distinction
+		 * from blobs at the same path.
+		 */
+		if (type == OBJ_TREE)
+			strbuf_addch(&path, '/');
+
+		if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
+			CALLOC_ARRAY(list, 1);
+			list->type = type;
+			strmap_put(&ctx->paths_to_lists, path.buf, list);
+		}
+		push_to_stack(ctx, path.buf);
+
+		if (!(o->flags & UNINTERESTING))
+			list->maybe_interesting = 1;
+
+		oid_array_append(&list->oids, &entry.oid);
+	}
+
+	free_tree_buffer(tree);
+	strbuf_release(&path);
+	return 0;
+}
+
+/*
+ * For each path in paths_to_explore, walk the trees another level
+ * and add any found blobs to the batch (but only if they exist and
+ * haven't been added yet).
+ */
+static int walk_path(struct path_walk_context *ctx,
+		     const char *path)
+{
+	struct type_and_oid_list *list;
+	int ret = 0;
+
+	list = strmap_get(&ctx->paths_to_lists, path);
+
+	if (!list)
+		BUG("provided path '%s' that had no associated list", path);
+
+	if (!list->oids.nr)
+		return 0;
+
+	if (ctx->info->prune_all_uninteresting) {
+		/*
+		 * This is true if all objects were UNINTERESTING
+		 * when added to the list.
+		 */
+		if (!list->maybe_interesting)
+			return 0;
+
+		/*
+		 * But it's still possible that the objects were set
+		 * as UNINTERESTING after being added. Do a quick check.
+		 */
+		list->maybe_interesting = 0;
+		for (size_t i = 0;
+		     !list->maybe_interesting && i < list->oids.nr;
+		     i++) {
+			if (list->type == OBJ_TREE) {
+				struct tree *t = lookup_tree(ctx->repo,
+							     &list->oids.oid[i]);
+				if (t && !(t->object.flags & UNINTERESTING))
+					list->maybe_interesting = 1;
+			} else if (list->type == OBJ_BLOB) {
+				struct blob *b = lookup_blob(ctx->repo,
+							     &list->oids.oid[i]);
+				if (b && !(b->object.flags & UNINTERESTING))
+					list->maybe_interesting = 1;
+			} else {
+				/* Tags are always interesting if visited. */
+				list->maybe_interesting = 1;
+			}
+		}
+
+		/* We have confirmed that all objects are UNINTERESTING. */
+		if (!list->maybe_interesting)
+			return 0;
+	}
+
+	/* Evaluate function pointer on this data, if requested. */
+	if ((list->type == OBJ_TREE && ctx->info->trees) ||
+	    (list->type == OBJ_BLOB && ctx->info->blobs) ||
+	    (list->type == OBJ_TAG && ctx->info->tags))
+		ret = ctx->info->path_fn(path, &list->oids, list->type,
+					ctx->info->path_fn_data);
+
+	/* Expand data for children. */
+	if (list->type == OBJ_TREE) {
+		for (size_t i = 0; i < list->oids.nr; i++) {
+			ret |= add_tree_entries(ctx,
+					    path,
+					    &list->oids.oid[i]);
+		}
+	}
+
+	oid_array_clear(&list->oids);
+	strmap_remove(&ctx->paths_to_lists, path, 1);
+	return ret;
+}
+
+static void clear_paths_to_lists(struct strmap *map)
+{
+	struct hashmap_iter iter;
+	struct strmap_entry *e;
+
+	hashmap_for_each_entry(&map->map, &iter, e, ent) {
+		struct type_and_oid_list *list = e->value;
+		oid_array_clear(&list->oids);
+	}
+	strmap_clear(map, 1);
+	strmap_init(map);
+}
+
+static struct repository *edge_repo;
+static struct type_and_oid_list *edge_tree_list;
+
+static void show_edge(struct commit *commit)
+{
+	struct tree *t = repo_get_commit_tree(edge_repo, commit);
+
+	if (!t)
+		return;
+
+	if (commit->object.flags & UNINTERESTING)
+		t->object.flags |= UNINTERESTING;
+
+	if (t->object.flags & SEEN)
+		return;
+	t->object.flags |= SEEN;
+
+	oid_array_append(&edge_tree_list->oids, &t->object.oid);
+}
+
+static int setup_pending_objects(struct path_walk_info *info,
+				 struct path_walk_context *ctx)
+{
+	struct type_and_oid_list *tags = NULL;
+	struct type_and_oid_list *tagged_blobs = NULL;
+	struct type_and_oid_list *root_tree_list = NULL;
+
+	if (info->tags)
+		CALLOC_ARRAY(tags, 1);
+	if (info->blobs)
+		CALLOC_ARRAY(tagged_blobs, 1);
+	if (info->trees)
+		root_tree_list = strmap_get(&ctx->paths_to_lists, root_path);
+
+	/*
+	 * Pending objects include:
+	 * * Commits at branch tips.
+	 * * Annotated tags at tag tips.
+	 * * Any kind of object at lightweight tag tips.
+	 * * Trees and blobs in the index (with an associated path).
+	 */
+	for (size_t i = 0; i < info->revs->pending.nr; i++) {
+		struct object_array_entry *pending = info->revs->pending.objects + i;
+		struct object *obj = pending->item;
+
+		/* Commits will be picked up by revision walk. */
+		if (obj->type == OBJ_COMMIT)
+			continue;
+
+		/* Navigate annotated tag object chains. */
+		while (obj->type == OBJ_TAG) {
+			struct tag *tag = lookup_tag(info->revs->repo, &obj->oid);
+			if (!tag) {
+				error(_("failed to find tag %s"),
+				      oid_to_hex(&obj->oid));
+				return -1;
+			}
+			if (tag->object.flags & SEEN)
+				break;
+			tag->object.flags |= SEEN;
+
+			if (tags)
+				oid_array_append(&tags->oids, &obj->oid);
+			obj = tag->tagged;
+		}
+
+		if (obj->type == OBJ_TAG)
+			continue;
+
+		/* We are now at a non-tag object. */
+		if (obj->flags & SEEN)
+			continue;
+		obj->flags |= SEEN;
+
+		switch (obj->type) {
+		case OBJ_TREE:
+			if (!info->trees)
+				continue;
+			if (pending->path) {
+				struct type_and_oid_list *list;
+				char *path = *pending->path ? xstrfmt("%s/", pending->path)
+							    : xstrdup("");
+				if (!(list = strmap_get(&ctx->paths_to_lists, path))) {
+					CALLOC_ARRAY(list, 1);
+					list->type = OBJ_TREE;
+					strmap_put(&ctx->paths_to_lists, path, list);
+				}
+				oid_array_append(&list->oids, &obj->oid);
+				free(path);
+			} else {
+				/* assume a root tree, such as a lightweight tag. */
+				oid_array_append(&root_tree_list->oids, &obj->oid);
+			}
+			break;
+
+		case OBJ_BLOB:
+			if (!info->blobs)
+				continue;
+			if (pending->path) {
+				struct type_and_oid_list *list;
+				char *path = pending->path;
+				if (!(list = strmap_get(&ctx->paths_to_lists, path))) {
+					CALLOC_ARRAY(list, 1);
+					list->type = OBJ_BLOB;
+					strmap_put(&ctx->paths_to_lists, path, list);
+				}
+				oid_array_append(&list->oids, &obj->oid);
+			} else {
+				/* assume a root tree, such as a lightweight tag. */
+				oid_array_append(&tagged_blobs->oids, &obj->oid);
+			}
+			break;
+
+		case OBJ_COMMIT:
+			/* Make sure it is in the object walk */
+			if (obj != pending->item)
+				add_pending_object(info->revs, obj, "");
+			break;
+
+		default:
+			BUG("should not see any other type here");
+		}
+	}
+
+	/*
+	 * Add tag objects and tagged blobs if they exist.
+	 */
+	if (tagged_blobs) {
+		if (tagged_blobs->oids.nr) {
+			const char *tagged_blob_path = "/tagged-blobs";
+			tagged_blobs->type = OBJ_BLOB;
+			tagged_blobs->maybe_interesting = 1;
+			strmap_put(&ctx->paths_to_lists, tagged_blob_path, tagged_blobs);
+			push_to_stack(ctx, tagged_blob_path);
+		} else {
+			oid_array_clear(&tagged_blobs->oids);
+			free(tagged_blobs);
+		}
+	}
+	if (tags) {
+		if (tags->oids.nr) {
+			const char *tag_path = "/tags";
+			tags->type = OBJ_TAG;
+			tags->maybe_interesting = 1;
+			strmap_put(&ctx->paths_to_lists, tag_path, tags);
+			push_to_stack(ctx, tag_path);
+		} else {
+			oid_array_clear(&tags->oids);
+			free(tags);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Given the configuration of 'info', walk the commits based on 'info->revs' and
+ * call 'info->path_fn' on each discovered path.
+ *
+ * Returns nonzero on an error.
+ */
+int walk_objects_by_path(struct path_walk_info *info)
+{
+	int ret;
+	size_t commits_nr = 0, paths_nr = 0;
+	struct commit *c;
+	struct type_and_oid_list *root_tree_list;
+	struct type_and_oid_list *commit_list;
+	struct path_walk_context ctx = {
+		.repo = info->revs->repo,
+		.revs = info->revs,
+		.info = info,
+		.path_stack = {
+			.compare = compare_by_type,
+			.cb_data = &ctx
+		},
+		.path_stack_pushed = STRSET_INIT,
+		.paths_to_lists = STRMAP_INIT
+	};
+
+	trace2_region_enter("path-walk", "commit-walk", info->revs->repo);
+
+	CALLOC_ARRAY(commit_list, 1);
+	commit_list->type = OBJ_COMMIT;
+
+	if (info->tags)
+		info->revs->tag_objects = 1;
+
+	/* Insert a single list for the root tree into the paths. */
+	CALLOC_ARRAY(root_tree_list, 1);
+	root_tree_list->type = OBJ_TREE;
+	root_tree_list->maybe_interesting = 1;
+	strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);
+	push_to_stack(&ctx, root_path);
+
+	/*
+	 * Set these values before preparing the walk to catch
+	 * lightweight tags pointing to non-commits and indexed objects.
+	 */
+	info->revs->blob_objects = info->blobs;
+	info->revs->tree_objects = info->trees;
+
+	if (prepare_revision_walk(info->revs))
+		die(_("failed to setup revision walk"));
+
+	/* Walk trees to mark them as UNINTERESTING. */
+	edge_repo = info->revs->repo;
+	edge_tree_list = root_tree_list;
+	mark_edges_uninteresting(info->revs, show_edge,
+				 info->prune_all_uninteresting);
+	edge_repo = NULL;
+	edge_tree_list = NULL;
+
+	info->revs->blob_objects = info->revs->tree_objects = 0;
+
+	trace2_region_enter("path-walk", "pending-walk", info->revs->repo);
+	ret = setup_pending_objects(info, &ctx);
+	trace2_region_leave("path-walk", "pending-walk", info->revs->repo);
+
+	if (ret)
+		return ret;
+
+	while ((c = get_revision(info->revs))) {
+		struct object_id *oid;
+		struct tree *t;
+		commits_nr++;
+
+		if (info->commits)
+			oid_array_append(&commit_list->oids,
+					 &c->object.oid);
+
+		/* If we only care about commits, then skip trees. */
+		if (!info->trees && !info->blobs)
+			continue;
+
+		oid = get_commit_tree_oid(c);
+		t = lookup_tree(info->revs->repo, oid);
+
+		if (!t) {
+			error("could not find tree %s", oid_to_hex(oid));
+			return -1;
+		}
+
+		if (t->object.flags & SEEN)
+			continue;
+		t->object.flags |= SEEN;
+		oid_array_append(&root_tree_list->oids, oid);
+	}
+
+	trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr);
+	trace2_region_leave("path-walk", "commit-walk", info->revs->repo);
+
+	/* Track all commits. */
+	if (info->commits && commit_list->oids.nr)
+		ret = info->path_fn("", &commit_list->oids, OBJ_COMMIT,
+				    info->path_fn_data);
+	oid_array_clear(&commit_list->oids);
+	free(commit_list);
+
+	trace2_region_enter("path-walk", "path-walk", info->revs->repo);
+	while (!ret && ctx.path_stack.nr) {
+		char *path = prio_queue_get(&ctx.path_stack);
+		paths_nr++;
+
+		ret = walk_path(&ctx, path);
+
+		free(path);
+	}
+
+	/* Are there paths remaining? Likely they are from indexed objects. */
+	if (!strmap_empty(&ctx.paths_to_lists)) {
+		struct hashmap_iter iter;
+		struct strmap_entry *entry;
+
+		strmap_for_each_entry(&ctx.paths_to_lists, &iter, entry)
+			push_to_stack(&ctx, entry->key);
+
+		while (!ret && ctx.path_stack.nr) {
+			char *path = prio_queue_get(&ctx.path_stack);
+			paths_nr++;
+
+			ret = walk_path(&ctx, path);
+
+			free(path);
+		}
+	}
+
+	trace2_data_intmax("path-walk", ctx.repo, "paths", paths_nr);
+	trace2_region_leave("path-walk", "path-walk", info->revs->repo);
+
+	clear_paths_to_lists(&ctx.paths_to_lists);
+	strset_clear(&ctx.path_stack_pushed);
+	clear_prio_queue(&ctx.path_stack);
+	return ret;
+}
+
+void path_walk_info_init(struct path_walk_info *info)
+{
+	struct path_walk_info empty = PATH_WALK_INFO_INIT;
+	memcpy(info, &empty, sizeof(empty));
+}
+
+void path_walk_info_clear(struct path_walk_info *info UNUSED)
+{
+	/*
+	 * This destructor is empty for now, as info->revs
+	 * is not owned by 'struct path_walk_info'.
+	 */
+}
--- a/path-walk.h
+++ b/path-walk.h
@ -0,0 +1,69 @@
+/*
+ * path-walk.h : Methods and structures for walking the object graph in batches
+ * by the paths that can reach those objects.
+ */
+#include "object.h" /* Required for 'enum object_type'. */
+
+struct rev_info;
+struct oid_array;
+
+/**
+ * The type of a function pointer for the method that is called on a list of
+ * objects reachable at a given path.
+ */
+typedef int (*path_fn)(const char *path,
+		       struct oid_array *oids,
+		       enum object_type type,
+		       void *data);
+
+struct path_walk_info {
+	/**
+	 * revs provides the definitions for the commit walk, including
+	 * which commits are UNINTERESTING or not. This structure is
+	 * expected to be owned by the caller.
+	 */
+	struct rev_info *revs;
+
+	/**
+	 * The caller wishes to execute custom logic on objects reachable at a
+	 * given path. Every reachable object will be visited exactly once, and
+	 * the first path to see an object wins. This may not be a stable choice.
+	 */
+	path_fn path_fn;
+	void *path_fn_data;
+
+	/**
+	 * Initialize which object types the path_fn should be called on. This
+	 * could also limit the walk to skip blobs if not set.
+	 */
+	int commits;
+	int trees;
+	int blobs;
+	int tags;
+
+	/**
+	 * When 'prune_all_uninteresting' is set and a path has all objects
+	 * marked as UNINTERESTING, then the path-walk will not visit those
+	 * objects. It will not call path_fn on those objects and will not
+	 * walk the children of such trees.
+	 */
+	int prune_all_uninteresting;
+};
+
+#define PATH_WALK_INFO_INIT {   \
+	.blobs = 1,		\
+	.trees = 1,		\
+	.commits = 1,		\
+	.tags = 1,		\
+}
+
+void path_walk_info_init(struct path_walk_info *info);
+void path_walk_info_clear(struct path_walk_info *info);
+
+/**
+ * Given the configuration of 'info', walk the commits based on 'info->revs' and
+ * call 'info->path_fn' on each discovered path.
+ *
+ * Returns nonzero on an error.
+ */
+int walk_objects_by_path(struct path_walk_info *info);
--- a/t/helper/meson.build
+++ b/t/helper/meson.build
@ -40,6 +40,7 @@ test_tool_sources = [
  'test-parse-pathspec-file.c',
  'test-partial-clone.c',
  'test-path-utils.c',
+  'test-path-walk.c',
  'test-pcre2-config.c',
  'test-pkt-line.c',
  'test-proc-receive.c',
--- a/t/helper/test-path-walk.c
+++ b/t/helper/test-path-walk.c
@ -0,0 +1,112 @@
+#define USE_THE_REPOSITORY_VARIABLE
+
+#include "test-tool.h"
+#include "environment.h"
+#include "hex.h"
+#include "object-name.h"
+#include "object.h"
+#include "pretty.h"
+#include "revision.h"
+#include "setup.h"
+#include "parse-options.h"
+#include "path-walk.h"
+#include "oid-array.h"
+
+static const char * const path_walk_usage[] = {
+	N_("test-tool path-walk <options> -- <revision-options>"),
+	NULL
+};
+
+struct path_walk_test_data {
+	uintmax_t batch_nr;
+
+	uintmax_t commit_nr;
+	uintmax_t tree_nr;
+	uintmax_t blob_nr;
+	uintmax_t tag_nr;
+};
+
+static int emit_block(const char *path, struct oid_array *oids,
+		      enum object_type type, void *data)
+{
+	struct path_walk_test_data *tdata = data;
+	const char *typestr;
+
+	if (type == OBJ_TREE)
+		tdata->tree_nr += oids->nr;
+	else if (type == OBJ_BLOB)
+		tdata->blob_nr += oids->nr;
+	else if (type == OBJ_COMMIT)
+		tdata->commit_nr += oids->nr;
+	else if (type == OBJ_TAG)
+		tdata->tag_nr += oids->nr;
+	else
+		BUG("we do not understand this type");
+
+	typestr = type_name(type);
+
+	/* This should never be output during tests. */
+	if (!oids->nr)
+		printf("%"PRIuMAX":%s:%s:EMPTY\n",
+		       tdata->batch_nr, typestr, path);
+
+	for (size_t i = 0; i < oids->nr; i++) {
+		struct object *o = lookup_unknown_object(the_repository,
+							 &oids->oid[i]);
+		printf("%"PRIuMAX":%s:%s:%s%s\n",
+		       tdata->batch_nr, typestr, path,
+		       oid_to_hex(&oids->oid[i]),
+		       o->flags & UNINTERESTING ? ":UNINTERESTING" : "");
+	}
+
+	tdata->batch_nr++;
+	return 0;
+}
+
+int cmd__path_walk(int argc, const char **argv)
+{
+	int res;
+	struct rev_info revs = REV_INFO_INIT;
+	struct path_walk_info info = PATH_WALK_INFO_INIT;
+	struct path_walk_test_data data = { 0 };
+	struct option options[] = {
+		OPT_BOOL(0, "blobs", &info.blobs,
+			 N_("toggle inclusion of blob objects")),
+		OPT_BOOL(0, "commits", &info.commits,
+			 N_("toggle inclusion of commit objects")),
+		OPT_BOOL(0, "tags", &info.tags,
+			 N_("toggle inclusion of tag objects")),
+		OPT_BOOL(0, "trees", &info.trees,
+			 N_("toggle inclusion of tree objects")),
+		OPT_BOOL(0, "prune", &info.prune_all_uninteresting,
+			 N_("toggle pruning of uninteresting paths")),
+		OPT_END(),
+	};
+
+	setup_git_directory();
+	revs.repo = the_repository;
+
+	argc = parse_options(argc, argv, NULL,
+			     options, path_walk_usage,
+			     PARSE_OPT_KEEP_UNKNOWN_OPT | PARSE_OPT_KEEP_ARGV0);
+
+	if (argc > 1)
+		setup_revisions(argc, argv, &revs, NULL);
+	else
+		usage(path_walk_usage[0]);
+
+	info.revs = &revs;
+	info.path_fn = emit_block;
+	info.path_fn_data = &data;
+
+	res = walk_objects_by_path(&info);
+
+	printf("commits:%" PRIuMAX "\n"
+	       "trees:%" PRIuMAX "\n"
+	       "blobs:%" PRIuMAX "\n"
+	       "tags:%" PRIuMAX "\n",
+	       data.commit_nr, data.tree_nr, data.blob_nr, data.tag_nr);
+
+	release_revisions(&revs);
+	return res;
+}
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@ -52,6 +52,7 @@ static struct test_cmd cmds[] = {
 	{ "parse-subcommand", cmd__parse_subcommand },
 	{ "partial-clone", cmd__partial_clone },
 	{ "path-utils", cmd__path_utils },
+	{ "path-walk", cmd__path_walk },
 	{ "pcre2-config", cmd__pcre2_config },
 	{ "pkt-line", cmd__pkt_line },
 	{ "proc-receive", cmd__proc_receive },
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@ -45,6 +45,7 @@ int cmd__parse_pathspec_file(int argc, const char** argv);
 int cmd__parse_subcommand(int argc, const char **argv);
 int cmd__partial_clone(int argc, const char **argv);
 int cmd__path_utils(int argc, const char **argv);
+int cmd__path_walk(int argc, const char **argv);
 int cmd__pcre2_config(int argc, const char **argv);
 int cmd__pkt_line(int argc, const char **argv);
 int cmd__proc_receive(int argc, const char **argv);
--- a/t/meson.build
+++ b/t/meson.build
@ -829,6 +829,7 @@ integration_tests = [
  't6500-gc.sh',
  't6501-freshen-objects.sh',
  't6600-test-reach.sh',
+  't6601-path-walk.sh',
  't6700-tree-depth.sh',
  't7001-mv.sh',
  't7002-mv-sparse-checkout.sh',
--- a/t/t6601-path-walk.sh
+++ b/t/t6601-path-walk.sh
@ -0,0 +1,368 @@
+#!/bin/sh
+
+TEST_PASSES_SANITIZE_LEAK=true
+
+test_description='direct path-walk API tests'
+
+. ./test-lib.sh
+
+test_expect_success 'setup test repository' '
+	git checkout -b base &&
+
+	# Make some objects that will only be reachable
+	# via non-commit tags.
+	mkdir child &&
+	echo file >child/file &&
+	git add child &&
+	git commit -m "will abandon" &&
+	git tag -a -m "tree" tree-tag HEAD^{tree} &&
+	echo file2 >file2 &&
+	git add file2 &&
+	git commit --amend -m "will abandon" &&
+	git tag tree-tag2 HEAD^{tree} &&
+
+	echo blob >file &&
+	blob_oid=$(git hash-object -t blob -w --stdin <file) &&
+	git tag -a -m "blob" blob-tag "$blob_oid" &&
+	echo blob2 >file2 &&
+	blob2_oid=$(git hash-object -t blob -w --stdin <file2) &&
+	git tag blob-tag2 "$blob2_oid" &&
+
+	rm -fr child file file2 &&
+
+	mkdir left &&
+	mkdir right &&
+	echo a >a &&
+	echo b >left/b &&
+	echo c >right/c &&
+	git add . &&
+	git commit --amend -m "first" &&
+	git tag -m "first" first HEAD &&
+
+	echo d >right/d &&
+	git add right &&
+	git commit -m "second" &&
+	git tag -a -m "second (under)" second.1 HEAD &&
+	git tag -a -m "second (top)" second.2 second.1 &&
+
+	# Set up file/dir collision in history.
+	rm a &&
+	mkdir a &&
+	echo a >a/a &&
+	echo bb >left/b &&
+	git add a left &&
+	git commit -m "third" &&
+	git tag -a -m "third" third &&
+
+	git checkout -b topic HEAD~1 &&
+	echo cc >right/c &&
+	git commit -a -m "topic" &&
+	git tag -a -m "fourth" fourth
+'
+
+test_expect_success 'all' '
+	test-tool path-walk -- --all >out &&
+
+	cat >expect <<-EOF &&
+	0:commit::$(git rev-parse topic)
+	0:commit::$(git rev-parse base)
+	0:commit::$(git rev-parse base~1)
+	0:commit::$(git rev-parse base~2)
+	1:tag:/tags:$(git rev-parse refs/tags/first)
+	1:tag:/tags:$(git rev-parse refs/tags/second.1)
+	1:tag:/tags:$(git rev-parse refs/tags/second.2)
+	1:tag:/tags:$(git rev-parse refs/tags/third)
+	1:tag:/tags:$(git rev-parse refs/tags/fourth)
+	1:tag:/tags:$(git rev-parse refs/tags/tree-tag)
+	1:tag:/tags:$(git rev-parse refs/tags/blob-tag)
+	2:blob:/tagged-blobs:$(git rev-parse refs/tags/blob-tag^{})
+	2:blob:/tagged-blobs:$(git rev-parse refs/tags/blob-tag2^{})
+	3:tree::$(git rev-parse topic^{tree})
+	3:tree::$(git rev-parse base^{tree})
+	3:tree::$(git rev-parse base~1^{tree})
+	3:tree::$(git rev-parse base~2^{tree})
+	3:tree::$(git rev-parse refs/tags/tree-tag^{})
+	3:tree::$(git rev-parse refs/tags/tree-tag2^{})
+	4:blob:a:$(git rev-parse base~2:a)
+	5:blob:file2:$(git rev-parse refs/tags/tree-tag2^{}:file2)
+	6:tree:a/:$(git rev-parse base:a)
+	7:tree:child/:$(git rev-parse refs/tags/tree-tag:child)
+	8:blob:child/file:$(git rev-parse refs/tags/tree-tag:child/file)
+	9:tree:left/:$(git rev-parse base:left)
+	9:tree:left/:$(git rev-parse base~2:left)
+	10:blob:left/b:$(git rev-parse base~2:left/b)
+	10:blob:left/b:$(git rev-parse base:left/b)
+	11:tree:right/:$(git rev-parse topic:right)
+	11:tree:right/:$(git rev-parse base~1:right)
+	11:tree:right/:$(git rev-parse base~2:right)
+	12:blob:right/c:$(git rev-parse base~2:right/c)
+	12:blob:right/c:$(git rev-parse topic:right/c)
+	13:blob:right/d:$(git rev-parse base~1:right/d)
+	blobs:10
+	commits:4
+	tags:7
+	trees:13
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'indexed objects' '
+	test_when_finished git reset --hard &&
+
+	# stage change into index, adding a blob but
+	# also invalidating the cache-tree for the root
+	# and the "left" directory.
+	echo bogus >left/c &&
+	git add left &&
+
+	test-tool path-walk -- --indexed-objects >out &&
+
+	cat >expect <<-EOF &&
+	0:blob:a:$(git rev-parse HEAD:a)
+	1:blob:left/b:$(git rev-parse HEAD:left/b)
+	2:blob:left/c:$(git rev-parse :left/c)
+	3:blob:right/c:$(git rev-parse HEAD:right/c)
+	4:blob:right/d:$(git rev-parse HEAD:right/d)
+	5:tree:right/:$(git rev-parse topic:right)
+	blobs:5
+	commits:0
+	tags:0
+	trees:1
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'branches and indexed objects mix well' '
+	test_when_finished git reset --hard &&
+
+	# stage change into index, adding a blob but
+	# also invalidating the cache-tree for the root
+	# and the "right" directory.
+	echo fake >right/d &&
+	git add right &&
+
+	test-tool path-walk -- --indexed-objects --branches >out &&
+
+	cat >expect <<-EOF &&
+	0:commit::$(git rev-parse topic)
+	0:commit::$(git rev-parse base)
+	0:commit::$(git rev-parse base~1)
+	0:commit::$(git rev-parse base~2)
+	1:tree::$(git rev-parse topic^{tree})
+	1:tree::$(git rev-parse base^{tree})
+	1:tree::$(git rev-parse base~1^{tree})
+	1:tree::$(git rev-parse base~2^{tree})
+	2:tree:a/:$(git rev-parse refs/tags/third:a)
+	3:tree:left/:$(git rev-parse base:left)
+	3:tree:left/:$(git rev-parse base~2:left)
+	4:blob:left/b:$(git rev-parse base:left/b)
+	4:blob:left/b:$(git rev-parse base~2:left/b)
+	5:tree:right/:$(git rev-parse topic:right)
+	5:tree:right/:$(git rev-parse base~1:right)
+	5:tree:right/:$(git rev-parse base~2:right)
+	6:blob:right/c:$(git rev-parse base~2:right/c)
+	6:blob:right/c:$(git rev-parse topic:right/c)
+	7:blob:right/d:$(git rev-parse base~1:right/d)
+	7:blob:right/d:$(git rev-parse :right/d)
+	8:blob:a:$(git rev-parse base~2:a)
+	blobs:7
+	commits:4
+	tags:0
+	trees:10
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'topic only' '
+	test-tool path-walk -- topic >out &&
+
+	cat >expect <<-EOF &&
+	0:commit::$(git rev-parse topic)
+	0:commit::$(git rev-parse base~1)
+	0:commit::$(git rev-parse base~2)
+	1:tree::$(git rev-parse topic^{tree})
+	1:tree::$(git rev-parse base~1^{tree})
+	1:tree::$(git rev-parse base~2^{tree})
+	2:blob:a:$(git rev-parse base~2:a)
+	3:tree:left/:$(git rev-parse base~2:left)
+	4:blob:left/b:$(git rev-parse base~2:left/b)
+	5:tree:right/:$(git rev-parse topic:right)
+	5:tree:right/:$(git rev-parse base~1:right)
+	5:tree:right/:$(git rev-parse base~2:right)
+	6:blob:right/c:$(git rev-parse base~2:right/c)
+	6:blob:right/c:$(git rev-parse topic:right/c)
+	7:blob:right/d:$(git rev-parse base~1:right/d)
+	blobs:5
+	commits:3
+	tags:0
+	trees:7
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'topic, not base' '
+	test-tool path-walk -- topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	0:commit::$(git rev-parse topic)
+	1:tree::$(git rev-parse topic^{tree})
+	2:blob:a:$(git rev-parse topic:a):UNINTERESTING
+	3:tree:left/:$(git rev-parse topic:left):UNINTERESTING
+	4:blob:left/b:$(git rev-parse topic:left/b):UNINTERESTING
+	5:tree:right/:$(git rev-parse topic:right)
+	6:blob:right/c:$(git rev-parse topic:right/c)
+	7:blob:right/d:$(git rev-parse topic:right/d):UNINTERESTING
+	blobs:4
+	commits:1
+	tags:0
+	trees:3
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'fourth, blob-tag2, not base' '
+	test-tool path-walk -- fourth blob-tag2 --not base >out &&
+
+	cat >expect <<-EOF &&
+	0:commit::$(git rev-parse topic)
+	1:tag:/tags:$(git rev-parse fourth)
+	2:blob:/tagged-blobs:$(git rev-parse refs/tags/blob-tag2^{})
+	3:tree::$(git rev-parse topic^{tree})
+	4:blob:a:$(git rev-parse base~1:a):UNINTERESTING
+	5:tree:left/:$(git rev-parse base~1:left):UNINTERESTING
+	6:blob:left/b:$(git rev-parse base~1:left/b):UNINTERESTING
+	7:tree:right/:$(git rev-parse topic:right)
+	8:blob:right/c:$(git rev-parse topic:right/c)
+	9:blob:right/d:$(git rev-parse base~1:right/d):UNINTERESTING
+	blobs:5
+	commits:1
+	tags:1
+	trees:3
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'topic, not base, only blobs' '
+	test-tool path-walk --no-trees --no-commits \
+		-- topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	0:blob:a:$(git rev-parse topic:a):UNINTERESTING
+	1:blob:left/b:$(git rev-parse topic:left/b):UNINTERESTING
+	2:blob:right/c:$(git rev-parse topic:right/c)
+	3:blob:right/d:$(git rev-parse topic:right/d):UNINTERESTING
+	blobs:4
+	commits:0
+	tags:0
+	trees:0
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+# No, this doesn't make a lot of sense for the path-walk API,
+# but it is possible to do.
+test_expect_success 'topic, not base, only commits' '
+	test-tool path-walk --no-blobs --no-trees \
+		-- topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	0:commit::$(git rev-parse topic)
+	commits:1
+	blobs:0
+	tags:0
+	trees:0
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'topic, not base, only trees' '
+	test-tool path-walk --no-blobs --no-commits \
+		-- topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	0:tree::$(git rev-parse topic^{tree})
+	1:tree:left/:$(git rev-parse topic:left):UNINTERESTING
+	2:tree:right/:$(git rev-parse topic:right)
+	commits:0
+	blobs:0
+	tags:0
+	trees:3
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'topic, not base, boundary' '
+	test-tool path-walk -- --boundary topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	0:commit::$(git rev-parse topic)
+	0:commit::$(git rev-parse base~1):UNINTERESTING
+	1:tree::$(git rev-parse topic^{tree})
+	1:tree::$(git rev-parse base~1^{tree}):UNINTERESTING
+	2:blob:a:$(git rev-parse base~1:a):UNINTERESTING
+	3:tree:left/:$(git rev-parse base~1:left):UNINTERESTING
+	4:blob:left/b:$(git rev-parse base~1:left/b):UNINTERESTING
+	5:tree:right/:$(git rev-parse topic:right)
+	5:tree:right/:$(git rev-parse base~1:right):UNINTERESTING
+	6:blob:right/c:$(git rev-parse base~1:right/c):UNINTERESTING
+	6:blob:right/c:$(git rev-parse topic:right/c)
+	7:blob:right/d:$(git rev-parse base~1:right/d):UNINTERESTING
+	blobs:5
+	commits:2
+	tags:0
+	trees:5
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'topic, not base, boundary with pruning' '
+	test-tool path-walk --prune -- --boundary topic --not base >out &&
+
+	cat >expect <<-EOF &&
+	0:commit::$(git rev-parse topic)
+	0:commit::$(git rev-parse base~1):UNINTERESTING
+	1:tree::$(git rev-parse topic^{tree})
+	1:tree::$(git rev-parse base~1^{tree}):UNINTERESTING
+	2:tree:right/:$(git rev-parse topic:right)
+	2:tree:right/:$(git rev-parse base~1:right):UNINTERESTING
+	3:blob:right/c:$(git rev-parse base~1:right/c):UNINTERESTING
+	3:blob:right/c:$(git rev-parse topic:right/c)
+	blobs:2
+	commits:2
+	tags:0
+	trees:4
+	EOF
+
+	test_cmp_sorted expect out
+'
+
+test_expect_success 'trees are reported exactly once' '
+	test_when_finished "rm -rf unique-trees" &&
+	test_create_repo unique-trees &&
+	(
+		cd unique-trees &&
+		mkdir initial &&
+		test_commit initial/file &&
+		git switch -c move-to-top &&
+		git mv initial/file.t ./ &&
+		test_tick &&
+		git commit -m moved &&
+		git update-ref refs/heads/other HEAD
+	) &&
+	test-tool -C unique-trees path-walk -- --all >out &&
+	tree=$(git -C unique-trees rev-parse HEAD:) &&
+	grep "$tree" out >out-filtered &&
+	test_line_count = 1 out-filtered
+'
+
+test_done
--- a/t/test-lib-functions.sh
+++ b/t/test-lib-functions.sh
@ -1268,6 +1268,16 @@ test_cmp () {
 	eval "$GIT_TEST_CMP" '"$@"'
 }

+# test_cmp_sorted runs test_cmp on sorted versions of the two
+# input files. Uses "$1.sorted" and "$2.sorted" as temp files.
+
+test_cmp_sorted () {
+	sort <"$1" >"$1.sorted" &&
+	sort <"$2" >"$2.sorted" &&
+	test_cmp "$1.sorted" "$2.sorted" &&
+	rm "$1.sorted" "$2.sorted"
+}
+
 # Check that the given config key has the expected value.
 #
 #    test_cmp_config [-C <dir>] <expected-value>