Merge branch 'en/diffcore-rename'

Performance optimization work on the rename detection continues. * en/diffcore-rename: merge-ort: call diffcore_rename() directly gitdiffcore doc: mention new preliminary step for rename detection diffcore-rename: guide inexact rename detection based on basenames diffcore-rename: complete find_basename_matches() diffcore-rename: compute basenames of source and dest candidates t4001: add a test comparing basename similarity and content similarity diffcore-rename: filter rename_src list when possible diffcore-rename: no point trying to find a match better than exact
2021-03-01 14:02:56 -08:00
parent 700696bcfc f78cf97617
commit 12bd17521c
4 changed files with 347 additions and 18 deletions
--- a/Documentation/gitdiffcore.txt
+++ b/Documentation/gitdiffcore.txt
@ -169,6 +169,26 @@ a similarity score different from the default of 50% by giving a
 number after the "-M" or "-C" option (e.g. "-M8" to tell it to use
 8/10 = 80%).
 Note that when rename detection is on but both copy and break
 detection are off, rename detection adds a preliminary step that first
 checks if files are moved across directories while keeping their
 filename the same.  If there is a file added to a directory whose
 contents is sufficiently similar to a file with the same name that got
 deleted from a different directory, it will mark them as renames and
 exclude them from the later quadratic step (the one that pairwise
 compares all unmatched files to find the "best" matches, determined by
 the highest content similarity).  So, for example, if a deleted
 docs/ext.txt and an added docs/config/ext.txt are similar enough, they
 will be marked as a rename and prevent an added docs/ext.md that may
 be even more similar to the deleted docs/ext.txt from being considered
 as the rename destination in the later step.  For this reason, the
 preliminary "match same filename" step uses a bit higher threshold to
 mark a file pair as a rename and stop considering other candidates for
 better matches.  At most, one comparison is done per file in this
 preliminary pass; so if there are several remaining ext.txt files
 throughout the directory hierarchy after exact rename detection, this
 preliminary step will be skipped for those files.
 Note.  When the "-C" option is used with `--find-copies-harder`
 option, 'git diff-{asterisk}' commands feed unmodified filepairs to
 diffcore mechanism as well as modified ones.  This lets the copy
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@ -367,6 +367,144 @@ static int find_exact_renames(struct diff_options *options)
 	return renames;
 }
 static const char *get_basename(const char *filename)
 {
 	/*
 	 * gitbasename() has to worry about special drives, multiple
 	 * directory separator characters, trailing slashes, NULL or
 	 * empty strings, etc.  We only work on filenames as stored in
 	 * git, and thus get to ignore all those complications.
 	 */
 	const char *base = strrchr(filename, '/');
 	return base ? base + 1 : filename;
 }
 static int find_basename_matches(struct diff_options *options,
 				 int minimum_score)
 {
 	/*
 	 * When I checked in early 2020, over 76% of file renames in linux
 	 * just moved files to a different directory but kept the same
 	 * basename.  gcc did that with over 64% of renames, gecko did it
 	 * with over 79%, and WebKit did it with over 89%.
 	 *
 	 * Therefore we can bypass the normal exhaustive NxM matrix
 	 * comparison of similarities between all potential rename sources
 	 * and destinations by instead using file basename as a hint (i.e.
 	 * the portion of the filename after the last '/'), checking for
 	 * similarity between files with the same basename, and if we find
 	 * a pair that are sufficiently similar, record the rename pair and
 	 * exclude those two from the NxM matrix.
 	 *
 	 * This *might* cause us to find a less than optimal pairing (if
 	 * there is another file that we are even more similar to but has a
 	 * different basename).  Given the huge performance advantage
 	 * basename matching provides, and given the frequency with which
 	 * people use the same basename in real world projects, that's a
 	 * trade-off we are willing to accept when doing just rename
 	 * detection.
 	 *
 	 * If someone wants copy detection that implies they are willing to
 	 * spend more cycles to find similarities between files, so it may
 	 * be less likely that this heuristic is wanted.  If someone is
 	 * doing break detection, that means they do not want filename
 	 * similarity to imply any form of content similiarity, and thus
 	 * this heuristic would definitely be incompatible.
 	 */
 	int i, renames = 0;
 	struct strintmap sources;
 	struct strintmap dests;
 	struct hashmap_iter iter;
 	struct strmap_entry *entry;
 	/*
 	 * The prefeteching stuff wants to know if it can skip prefetching
 	 * blobs that are unmodified...and will then do a little extra work
 	 * to verify that the oids are indeed different before prefetching.
 	 * Unmodified blobs are only relevant when doing copy detection;
 	 * when limiting to rename detection, diffcore_rename[_extended]()
 	 * will never be called with unmodified source paths fed to us, so
 	 * the extra work necessary to check if rename_src entries are
 	 * unmodified would be a small waste.
 	 */
 	int skip_unmodified = 0;
 	/*
 	 * Create maps of basename -> fullname(s) for remaining sources and
 	 * dests.
 	 */
 	strintmap_init_with_options(&sources, -1, NULL, 0);
 	strintmap_init_with_options(&dests, -1, NULL, 0);
 	for (i = 0; i < rename_src_nr; ++i) {
 		char *filename = rename_src[i].p->one->path;
 		const char *base;
 		/* exact renames removed in remove_unneeded_paths_from_src() */
 		assert(!rename_src[i].p->one->rename_used);
 		/* Record index within rename_src (i) if basename is unique */
 		base = get_basename(filename);
 		if (strintmap_contains(&sources, base))
 			strintmap_set(&sources, base, -1);
 		else
 			strintmap_set(&sources, base, i);
 	}
 	for (i = 0; i < rename_dst_nr; ++i) {
 		char *filename = rename_dst[i].p->two->path;
 		const char *base;
 		if (rename_dst[i].is_rename)
 			continue; /* involved in exact match already. */
 		/* Record index within rename_dst (i) if basename is unique */
 		base = get_basename(filename);
 		if (strintmap_contains(&dests, base))
 			strintmap_set(&dests, base, -1);
 		else
 			strintmap_set(&dests, base, i);
 	}
 	/* Now look for basename matchups and do similarity estimation */
 	strintmap_for_each_entry(&sources, &iter, entry) {
 		const char *base = entry->key;
 		intptr_t src_index = (intptr_t)entry->value;
 		intptr_t dst_index;
 		if (src_index == -1)
 			continue;
 		if (0 <= (dst_index = strintmap_get(&dests, base))) {
 			struct diff_filespec *one, *two;
 			int score;
 			/* Estimate the similarity */
 			one = rename_src[src_index].p->one;
 			two = rename_dst[dst_index].p->two;
 			score = estimate_similarity(options->repo, one, two,
 						    minimum_score, skip_unmodified);
 			/* If sufficiently similar, record as rename pair */
 			if (score < minimum_score)
 				continue;
 			record_rename_pair(dst_index, src_index, score);
 			renames++;
 			/*
 			 * Found a rename so don't need text anymore; if we
 			 * didn't find a rename, the filespec_blob would get
 			 * re-used when doing the matrix of comparisons.
 			 */
 			diff_free_filespec_blob(one);
 			diff_free_filespec_blob(two);
 		}
 	}
 	strintmap_clear(&sources);
 	strintmap_clear(&dests);
 	return renames;
 }
 #define NUM_CANDIDATE_PER_DST 4
 static void record_if_better(struct diff_score m[], struct diff_score *o)
 {
@ -454,6 +592,54 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
 	return count;
 }
 static void remove_unneeded_paths_from_src(int detecting_copies)
 {
 	int i, new_num_src;
 	if (detecting_copies)
 		return; /* nothing to remove */
 	if (break_idx)
 		return; /* culling incompatible with break detection */
 	/*
 	 * Note on reasons why we cull unneeded sources but not destinations:
 	 *   1) Pairings are stored in rename_dst (not rename_src), which we
 	 *      need to keep around.  So, we just can't cull rename_dst even
 	 *      if we wanted to.  But doing so wouldn't help because...
 	 *
 	 *   2) There is a matrix pairwise comparison that follows the
 	 *      "Performing inexact rename detection" progress message.
 	 *      Iterating over the destinations is done in the outer loop,
 	 *      hence we only iterate over each of those once and we can
 	 *      easily skip the outer loop early if the destination isn't
 	 *      relevant.  That's only one check per destination path to
 	 *      skip.
 	 *
 	 *      By contrast, the sources are iterated in the inner loop; if
 	 *      we check whether a source can be skipped, then we'll be
 	 *      checking it N separate times, once for each destination.
 	 *      We don't want to have to iterate over known-not-needed
 	 *      sources N times each, so avoid that by removing the sources
 	 *      from rename_src here.
 	 */
 	for (i = 0, new_num_src = 0; i < rename_src_nr; i++) {
 		/*
 		 * renames are stored in rename_dst, so if a rename has
 		 * already been detected using this source, we can just
 		 * remove the source knowing rename_dst has its info.
 		 */
 		if (rename_src[i].p->one->rename_used)
 			continue;
 		if (new_num_src < i)
 			memcpy(&rename_src[new_num_src], &rename_src[i],
 			       sizeof(struct diff_rename_src));
 		new_num_src++;
 	}
 	rename_src_nr = new_num_src;
 }
 void diffcore_rename(struct diff_options *options)
 {
 	int detect_rename = options->detect_rename;
@ -463,9 +649,11 @@ void diffcore_rename(struct diff_options *options)
 	struct diff_score *mx;
 	int i, j, rename_count, skip_unmodified = 0;
 	int num_destinations, dst_cnt;
 	int num_sources, want_copies;
 	struct progress *progress = NULL;
 	trace2_region_enter("diff", "setup", options->repo);
 	want_copies = (detect_rename == DIFF_DETECT_COPY);
 	if (!minimum_score)
 		minimum_score = DEFAULT_RENAME_SCORE;
@ -502,7 +690,7 @@ void diffcore_rename(struct diff_options *options)
 				p->one->rename_used++;
 			register_rename_src(p);
 		}
-		else if (detect_rename == DIFF_DETECT_COPY) {
+		else if (want_copies) {
 			/*
 			 * Increment the "rename_used" score by
 			 * one, to indicate ourselves as a user.
@ -527,17 +715,60 @@ void diffcore_rename(struct diff_options *options)
 	if (minimum_score == MAX_SCORE)
 		goto cleanup;
 	num_sources = rename_src_nr;
 	if (want_copies || break_idx) {
 		/*
-	 * Calculate how many renames are left (but all the source
+		 * Cull sources:
-	 * files still remain as options for rename/copies!)
+		 *   - remove ones corresponding to exact renames
 		 */
 		trace2_region_enter("diff", "cull after exact", options->repo);
 		remove_unneeded_paths_from_src(want_copies);
 		trace2_region_leave("diff", "cull after exact", options->repo);
 	} else {
 		/* Determine minimum score to match basenames */
 		double factor = 0.5;
 		char *basename_factor = getenv("GIT_BASENAME_FACTOR");
 		int min_basename_score;
 		if (basename_factor)
 			factor = strtol(basename_factor, NULL, 10)/100.0;
 		assert(factor >= 0.0 && factor <= 1.0);
 		min_basename_score = minimum_score +
 			(int)(factor * (MAX_SCORE - minimum_score));
 		/*
 		 * Cull sources:
 		 *   - remove ones involved in renames (found via exact match)
 		 */
 		trace2_region_enter("diff", "cull after exact", options->repo);
 		remove_unneeded_paths_from_src(want_copies);
 		trace2_region_leave("diff", "cull after exact", options->repo);
 		/* Utilize file basenames to quickly find renames. */
 		trace2_region_enter("diff", "basename matches", options->repo);
 		rename_count += find_basename_matches(options,
 						      min_basename_score);
 		trace2_region_leave("diff", "basename matches", options->repo);
 		/*
 		 * Cull sources, again:
 		 *   - remove ones involved in renames (found via basenames)
 		 */
 		trace2_region_enter("diff", "cull basename", options->repo);
 		remove_unneeded_paths_from_src(want_copies);
 		trace2_region_leave("diff", "cull basename", options->repo);
 	}
 	/* Calculate how many rename destinations are left */
 	num_destinations = (rename_dst_nr - rename_count);
 	num_sources = rename_src_nr; /* rename_src_nr reflects lower number */
 	/* All done? */
-	if (!num_destinations)
+	if (!num_destinations || !num_sources)
 		goto cleanup;
-	switch (too_many_rename_candidates(num_destinations, rename_src_nr,
+	switch (too_many_rename_candidates(num_destinations, num_sources,
 					   options)) {
 	case 1:
 		goto cleanup;
@ -553,7 +784,7 @@ void diffcore_rename(struct diff_options *options)
 	if (options->show_rename_progress) {
 		progress = start_delayed_progress(
 				_("Performing inexact rename detection"),
-				(uint64_t)num_destinations * (uint64_t)rename_src_nr);
+				(uint64_t)num_destinations * (uint64_t)num_sources);
 	}
 	mx = xcalloc(st_mult(NUM_CANDIDATE_PER_DST, num_destinations),
@ -563,7 +794,7 @@ void diffcore_rename(struct diff_options *options)
 		struct diff_score *m;
 		if (rename_dst[i].is_rename)
-			continue; /* dealt with exact match already. */
+			continue; /* exact or basename match already handled */
 		m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST];
 		for (j = 0; j < NUM_CANDIDATE_PER_DST; j++)
@ -573,6 +804,8 @@ void diffcore_rename(struct diff_options *options)
 			struct diff_filespec *one = rename_src[j].p->one;
 			struct diff_score this_src;
 			assert(!one->rename_used || want_copies || break_idx);
 			if (skip_unmodified &&
 			    diff_unmodified_pair(rename_src[j].p))
 				continue;
@ -594,7 +827,7 @@ void diffcore_rename(struct diff_options *options)
 		}
 		dst_cnt++;
 		display_progress(progress,
-				 (uint64_t)dst_cnt * (uint64_t)rename_src_nr);
+				 (uint64_t)dst_cnt * (uint64_t)num_sources);
 	}
 	stop_progress(&progress);
@ -602,7 +835,7 @@ void diffcore_rename(struct diff_options *options)
 	STABLE_QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare);
 	rename_count += find_renames(mx, dst_cnt, minimum_score, 0);
-	if (detect_rename == DIFF_DETECT_COPY)
+	if (want_copies)
 		rename_count += find_renames(mx, dst_cnt, minimum_score, 1);
 	free(mx);
 	trace2_region_leave("diff", "inexact renames", options->repo);
--- a/merge-ort.c
+++ b/merge-ort.c
@ -535,6 +535,23 @@ static void setup_path_info(struct merge_options *opt,
 	result->util = mi;
 }
 static void add_pair(struct merge_options *opt,
 		     struct name_entry *names,
 		     const char *pathname,
 		     unsigned side,
 		     unsigned is_add /* if false, is_delete */)
 {
 	struct diff_filespec *one, *two;
 	struct rename_info *renames = &opt->priv->renames;
 	int names_idx = is_add ? side : 0;
 	one = alloc_filespec(pathname);
 	two = alloc_filespec(pathname);
 	fill_filespec(is_add ? two : one,
 		      &names[names_idx].oid, 1, names[names_idx].mode);
 	diff_queue(&renames->pairs[side], one, two);
 }
 static void collect_rename_info(struct merge_options *opt,
 				struct name_entry *names,
 				const char *dirname,
@ -544,6 +561,7 @@ static void collect_rename_info(struct merge_options *opt,
 				unsigned match_mask)
 {
 	struct rename_info *renames = &opt->priv->renames;
 	unsigned side;
 	/* Update dirs_removed, as needed */
 	if (dirmask == 1 || dirmask == 3 || dirmask == 5) {
@ -554,6 +572,21 @@ static void collect_rename_info(struct merge_options *opt,
 		if (sides & 2)
 			strset_add(&renames->dirs_removed[2], fullname);
 	}
 	if (filemask == 0 || filemask == 7)
 		return;
 	for (side = MERGE_SIDE1; side <= MERGE_SIDE2; ++side) {
 		unsigned side_mask = (1 << side);
 		/* Check for deletion on side */
 		if ((filemask & 1) && !(filemask & side_mask))
 			add_pair(opt, names, fullname, side, 0 /* delete */);
 		/* Check for addition on side */
 		if (!(filemask & 1) && (filemask & side_mask))
 			add_pair(opt, names, fullname, side, 1 /* add */);
 	}
 }
 static int collect_merge_info_callback(int n,
@ -2079,6 +2112,27 @@ static int process_renames(struct merge_options *opt,
 	return clean_merge;
 }
 static void resolve_diffpair_statuses(struct diff_queue_struct *q)
 {
 	/*
 	 * A simplified version of diff_resolve_rename_copy(); would probably
 	 * just use that function but it's static...
 	 */
 	int i;
 	struct diff_filepair *p;
 	for (i = 0; i < q->nr; ++i) {
 		p = q->queue[i];
 		p->status = 0; /* undecided */
 		if (!DIFF_FILE_VALID(p->one))
 			p->status = DIFF_STATUS_ADDED;
 		else if (!DIFF_FILE_VALID(p->two))
 			p->status = DIFF_STATUS_DELETED;
 		else if (DIFF_PAIR_RENAME(p))
 			p->status = DIFF_STATUS_RENAMED;
 	}
 }
 static int compare_pairs(const void *a_, const void *b_)
 {
 	const struct diff_filepair *a = *((const struct diff_filepair **)a_);
@ -2089,8 +2143,6 @@ static int compare_pairs(const void *a_, const void *b_)
 /* Call diffcore_rename() to compute which files have changed on given side */
 static void detect_regular_renames(struct merge_options *opt,
 				   struct tree *merge_base,
 				   struct tree *side,
 				   unsigned side_index)
 {
 	struct diff_options diff_opts;
@ -2108,11 +2160,11 @@ static void detect_regular_renames(struct merge_options *opt,
 	diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;
 	diff_setup_done(&diff_opts);
 	diff_queued_diff = renames->pairs[side_index];
 	trace2_region_enter("diff", "diffcore_rename", opt->repo);
-	diff_tree_oid(&merge_base->object.oid, &side->object.oid, "",
+	diffcore_rename(&diff_opts);
 		      &diff_opts);
 	diffcore_std(&diff_opts);
 	trace2_region_leave("diff", "diffcore_rename", opt->repo);
 	resolve_diffpair_statuses(&diff_queued_diff);
 	if (diff_opts.needed_rename_limit > renames->needed_limit)
 		renames->needed_limit = diff_opts.needed_rename_limit;
@ -2212,8 +2264,8 @@ static int detect_and_process_renames(struct merge_options *opt,
 	memset(&combined, 0, sizeof(combined));
 	trace2_region_enter("merge", "regular renames", opt->repo);
-	detect_regular_renames(opt, merge_base, side1, MERGE_SIDE1);
+	detect_regular_renames(opt, MERGE_SIDE1);
-	detect_regular_renames(opt, merge_base, side2, MERGE_SIDE2);
+	detect_regular_renames(opt, MERGE_SIDE2);
 	trace2_region_leave("merge", "regular renames", opt->repo);
 	trace2_region_enter("merge", "directory renames", opt->repo);
--- a/t/t4001-diff-rename.sh
+++ b/t/t4001-diff-rename.sh
@ -262,4 +262,28 @@ test_expect_success 'diff-tree -l0 defaults to a big rename limit, not zero' '
 	grep "myotherfile.*myfile" actual
 '
 test_expect_success 'basename similarity vs best similarity' '
 	mkdir subdir &&
 	test_write_lines line1 line2 line3 line4 line5 \
 			 line6 line7 line8 line9 line10 >subdir/file.txt &&
 	git add subdir/file.txt &&
 	git commit -m "base txt" &&
 	git rm subdir/file.txt &&
 	test_write_lines line1 line2 line3 line4 line5 \
 			  line6 line7 line8 >file.txt &&
 	test_write_lines line1 line2 line3 line4 line5 \
 			  line6 line7 line8 line9 >file.md &&
 	git add file.txt file.md &&
 	git commit -a -m "rename" &&
 	git diff-tree -r -M --name-status HEAD^ HEAD >actual &&
 	# subdir/file.txt is 88% similar to file.md, 78% similar to file.txt,
 	# but since same basenames are checked first...
 	cat >expected <<-\EOF &&
 	A	file.md
 	R078	subdir/file.txt	file.txt
 	EOF
 	test_cmp expected actual
 '
 test_done