From 11f944dd6bdabd003325c85dc60b16389d012361 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 18 Feb 2011 19:55:19 -0800 Subject: [PATCH 1/3] for_each_hash: allow passing a 'void *data' pointer to callback For the find_exact_renames() function, this allows us to pass the diff_options structure pointer to the low-level routines. We will use that to distinguish between the "rename" and "copy" cases. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- builtin/describe.c | 4 ++-- diffcore-rename.c | 14 ++++++++------ hash.c | 4 ++-- hash.h | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/builtin/describe.c b/builtin/describe.c index 342129fdbd..3ba26dc819 100644 --- a/builtin/describe.c +++ b/builtin/describe.c @@ -63,7 +63,7 @@ static inline struct commit_name *find_commit_name(const unsigned char *peeled) return n; } -static int set_util(void *chain) +static int set_util(void *chain, void *data) { struct commit_name *n; for (n = chain; n; n = n->next) { @@ -289,7 +289,7 @@ static void describe(const char *arg, int last_one) fprintf(stderr, "searching to describe %s\n", arg); if (!have_util) { - for_each_hash(&names, set_util); + for_each_hash(&names, set_util, NULL); have_util = 1; } diff --git a/diffcore-rename.c b/diffcore-rename.c index df41be56de..e5e88feb54 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -247,7 +247,8 @@ struct file_similarity { }; static int find_identical_files(struct file_similarity *src, - struct file_similarity *dst) + struct file_similarity *dst, + struct diff_options *options) { int renames = 0; @@ -306,11 +307,12 @@ static void free_similarity_list(struct file_similarity *p) } } -static int find_same_files(void *ptr) +static int find_same_files(void *ptr, void *data) { int ret; struct file_similarity *p = ptr; struct file_similarity *src = NULL, *dst = NULL; + struct diff_options *options = data; /* Split the hash list up into sources and destinations */ do { @@ -329,7 +331,7 @@ static int find_same_files(void *ptr) * If we have both sources *and* destinations, see if * we can match them up */ - ret = (src && dst) ? find_identical_files(src, dst) : 0; + ret = (src && dst) ? find_identical_files(src, dst, options) : 0; /* Free the hashes and return the number of renames found */ free_similarity_list(src); @@ -377,7 +379,7 @@ static void insert_file_table(struct hash_table *table, int src_dst, int index, * and then during the second round we try to match * cache-dirty entries as well. */ -static int find_exact_renames(void) +static int find_exact_renames(struct diff_options *options) { int i; struct hash_table file_table; @@ -390,7 +392,7 @@ static int find_exact_renames(void) insert_file_table(&file_table, 1, i, rename_dst[i].two); /* Find the renames */ - i = for_each_hash(&file_table, find_same_files); + i = for_each_hash(&file_table, find_same_files, options); /* .. and free the hash data structure */ free_hash(&file_table); @@ -467,7 +469,7 @@ void diffcore_rename(struct diff_options *options) * We really want to cull the candidates list early * with cheap tests in order to avoid doing deltas. */ - rename_count = find_exact_renames(); + rename_count = find_exact_renames(options); /* Did we only want exact renames? */ if (minimum_score == MAX_SCORE) diff --git a/hash.c b/hash.c index 1cd4c9d5c0..749ecfe484 100644 --- a/hash.c +++ b/hash.c @@ -81,7 +81,7 @@ void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table) return insert_hash_entry(hash, ptr, table); } -int for_each_hash(const struct hash_table *table, int (*fn)(void *)) +int for_each_hash(const struct hash_table *table, int (*fn)(void *, void *), void *data) { int sum = 0; unsigned int i; @@ -92,7 +92,7 @@ int for_each_hash(const struct hash_table *table, int (*fn)(void *)) void *ptr = array->ptr; array++; if (ptr) { - int val = fn(ptr); + int val = fn(ptr, data); if (val < 0) return val; sum += val; diff --git a/hash.h b/hash.h index 69e33a47b9..b875ce67c4 100644 --- a/hash.h +++ b/hash.h @@ -30,7 +30,7 @@ struct hash_table { extern void *lookup_hash(unsigned int hash, const struct hash_table *table); extern void **insert_hash(unsigned int hash, void *ptr, struct hash_table *table); -extern int for_each_hash(const struct hash_table *table, int (*fn)(void *)); +extern int for_each_hash(const struct hash_table *table, int (*fn)(void *, void *), void *data); extern void free_hash(struct hash_table *table); static inline void init_hash(struct hash_table *table) From 0940e5f211452ec2520d1b04233acddf0a872c06 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 18 Feb 2011 20:10:32 -0800 Subject: [PATCH 2/3] diffcore-rename: properly honor the difference between -M and -C We would allow rename detection to do copy detection even when asked purely for renames. That confuses users, but more importantly it can terminally confuse the recursive merge rename logic. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diffcore-rename.c | 53 +++++++++++++++++----------------- t/t4003-diff-rename-1.sh | 2 +- t/t4004-diff-rename-symlink.sh | 2 +- t/t4005-diff-rename-2.sh | 2 +- t/t4008-diff-break-rewrite.sh | 4 +-- t/t4009-diff-rename-4.sh | 2 +- 6 files changed, 32 insertions(+), 33 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index e5e88feb54..b9b039d4a3 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -278,6 +278,8 @@ static int find_identical_files(struct file_similarity *src, } /* Give higher scores to sources that haven't been used already */ score = !source->rename_used; + if (source->rename_used && options->detect_rename != DIFF_DETECT_COPY) + continue; score += basename_same(source, target); if (score > best_score) { best = p; @@ -416,6 +418,27 @@ static void record_if_better(struct diff_score m[], struct diff_score *o) m[worst] = *o; } +static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, int copies) +{ + int count = 0, i; + + for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) { + struct diff_rename_dst *dst; + + if ((mx[i].dst < 0) || + (mx[i].score < minimum_score)) + break; /* there is no more usable pair. */ + dst = &rename_dst[mx[i].dst]; + if (dst->pair) + continue; /* already done, either exact or fuzzy. */ + if (!copies && rename_src[mx[i].src].one->rename_used) + continue; + record_rename_pair(mx[i].dst, mx[i].src, mx[i].score); + count++; + } + return count; +} + void diffcore_rename(struct diff_options *options) { int detect_rename = options->detect_rename; @@ -538,33 +561,9 @@ void diffcore_rename(struct diff_options *options) /* cost matrix sorted by most to least similar pair */ qsort(mx, dst_cnt * NUM_CANDIDATE_PER_DST, sizeof(*mx), score_compare); - for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) { - struct diff_rename_dst *dst; - - if ((mx[i].dst < 0) || - (mx[i].score < minimum_score)) - break; /* there is no more usable pair. */ - dst = &rename_dst[mx[i].dst]; - if (dst->pair) - continue; /* already done, either exact or fuzzy. */ - if (rename_src[mx[i].src].one->rename_used) - continue; - record_rename_pair(mx[i].dst, mx[i].src, mx[i].score); - rename_count++; - } - - for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) { - struct diff_rename_dst *dst; - - if ((mx[i].dst < 0) || - (mx[i].score < minimum_score)) - break; /* there is no more usable pair. */ - dst = &rename_dst[mx[i].dst]; - if (dst->pair) - continue; /* already done, either exact or fuzzy. */ - record_rename_pair(mx[i].dst, mx[i].src, mx[i].score); - rename_count++; - } + rename_count += find_renames(mx, dst_cnt, minimum_score, 0); + if (detect_rename == DIFF_DETECT_COPY) + rename_count += find_renames(mx, dst_cnt, minimum_score, 1); free(mx); cleanup: diff --git a/t/t4003-diff-rename-1.sh b/t/t4003-diff-rename-1.sh index c6130c4019..bfa8835638 100755 --- a/t/t4003-diff-rename-1.sh +++ b/t/t4003-diff-rename-1.sh @@ -29,7 +29,7 @@ test_expect_success \ # copy-and-edit one, and rename-and-edit the other. We do not say # anything about rezrov. -GIT_DIFF_OPTS=--unified=0 git diff-index -M -p $tree >current +GIT_DIFF_OPTS=--unified=0 git diff-index -C -p $tree >current cat >expected <<\EOF diff --git a/COPYING b/COPYING.1 copy from COPYING diff --git a/t/t4004-diff-rename-symlink.sh b/t/t4004-diff-rename-symlink.sh index 92a65f4852..6e562c80d1 100755 --- a/t/t4004-diff-rename-symlink.sh +++ b/t/t4004-diff-rename-symlink.sh @@ -35,7 +35,7 @@ test_expect_success SYMLINKS \ # a new creation. test_expect_success SYMLINKS 'setup diff output' " - GIT_DIFF_OPTS=--unified=0 git diff-index -M -p $tree >current && + GIT_DIFF_OPTS=--unified=0 git diff-index -C -p $tree >current && cat >expected <<\EOF diff --git a/bozbar b/bozbar new file mode 120000 diff --git a/t/t4005-diff-rename-2.sh b/t/t4005-diff-rename-2.sh index 1ba359d478..77d7f4946f 100755 --- a/t/t4005-diff-rename-2.sh +++ b/t/t4005-diff-rename-2.sh @@ -29,7 +29,7 @@ test_expect_success \ # and COPYING.2 are based on COPYING, and do not say anything about # rezrov. -git diff-index -M $tree >current +git diff-index -C $tree >current cat >expected <<\EOF :100644 100644 6ff87c4664981e4397625791c8ea3bbb5f2279a3 0603b3238a076dc6c8022aedc6648fa523a17178 C1234 COPYING COPYING.1 diff --git a/t/t4008-diff-break-rewrite.sh b/t/t4008-diff-break-rewrite.sh index d79d9e1e71..73b4a24f5e 100755 --- a/t/t4008-diff-break-rewrite.sh +++ b/t/t4008-diff-break-rewrite.sh @@ -173,8 +173,8 @@ test_expect_success \ 'compare_diff_raw expected current' test_expect_success \ - 'run diff with -B -M' \ - 'git diff-index -B -M "$tree" >current' + 'run diff with -B -C' \ + 'git diff-index -B -C "$tree" >current' cat >expected <<\EOF :100644 100644 f5deac7be59e7eeab8657fd9ae706fd6a57daed2 08bb2fb671deff4c03a4d4a0a1315dff98d5732c C095 file0 file1 diff --git a/t/t4009-diff-rename-4.sh b/t/t4009-diff-rename-4.sh index de3f17478e..f22c8e3dba 100755 --- a/t/t4009-diff-rename-4.sh +++ b/t/t4009-diff-rename-4.sh @@ -29,7 +29,7 @@ test_expect_success \ # and COPYING.2 are based on COPYING, and do not say anything about # rezrov. -git diff-index -z -M $tree >current +git diff-index -z -C $tree >current cat >expected <<\EOF :100644 100644 6ff87c4664981e4397625791c8ea3bbb5f2279a3 0603b3238a076dc6c8022aedc6648fa523a17178 C1234 From 3a4d67692b5a80213ca47a603fa5505a5990cc87 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 18 Feb 2011 20:12:06 -0800 Subject: [PATCH 3/3] diffcore-rename: improve estimate_similarity() heuristics The logic to quickly dismiss potential rename pairs was broken. It would too eagerly dismiss possible renames when all of the difference was due to pure new data (or deleted data). Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- diffcore-rename.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index b9b039d4a3..0cd4c1305b 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -170,7 +170,7 @@ static int estimate_similarity(struct diff_filespec *src, * and the final score computation below would not have a * divide-by-zero issue. */ - if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) + if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) return 0; if (!src->cnt_data && diff_populate_filespec(src, 0))