From 055e57876698f44e917152187e7850d8f7e7e0b5 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 12 Dec 2012 06:36:35 -0500 Subject: [PATCH 1/5] .mailmap: match up some obvious names/emails This patch updates git's .mailmap in cases where multiple names are matched to a single email. The "master" name for each email was chosen by: 1. If the only difference is in the presence or absence of accented characters, the accented form is chosen (under the assumption that it is the natural spelling, and accents are sometimes stripped in email). 2. Otherwise, the most commonly used name is chosen. 3. If all names are equally common, the most recently used name is chosen. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- .mailmap | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.mailmap b/.mailmap index bcf4f8770f..69301bd6e6 100644 --- a/.mailmap +++ b/.mailmap @@ -9,7 +9,9 @@ Alex Bennée Alexander Gavrilov Aneesh Kumar K.V Brian M. Carlson +Cheng Renquan Chris Shoemaker +Dan Johnson Dana L. How Dana L. How Daniel Barkalow @@ -18,13 +20,16 @@ David Kågedal David S. Miller Deskin Miller Dirk Süsserott +Eric S. Raymond Erik Faye-Lund Fredrik Kuivinen +Frédéric Heitzmann H. Peter Anvin H. Peter Anvin H. Peter Anvin Horst H. von Brand İsmail Dönmez +Jakub Narębski Jay Soffian Joachim Berdal Haga Johannes Sixt @@ -41,11 +46,14 @@ Junio C Hamano Junio C Hamano Junio C Hamano Karl Hasselström +Kevin Leung Kent Engstrom Lars Doelle Lars Doelle Li Hong Lukas Sandström +Marc-André Lureau +Mark Rada Martin Langhoff Martin von Zweigbergk Michael Coleman @@ -63,11 +71,13 @@ Ralf Thielow Ramsay Allan Jones René Scharfe Robert Fitzsimons +Robert Zeh Sam Vilain Santi Béjar Sean Estabrooks Shawn O. 
Pearce Steven Grimm +Tay Ray Chuan Theodore Ts'o Thomas Rast Tony Luck From 32d979eaf5a9bbe770aa27e1a8cfed1eeb0969e1 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 12 Dec 2012 06:38:06 -0500 Subject: [PATCH 2/5] .mailmap: fix broken entry for Martin Langhoff Commit adc3192 (Martin Langhoff has a new e-mail address, 2010-10-05) added a mailmap entry, but forgot that both the old and new email addresses need to appear for one to be mapped to the other (i.e., we do not key mailmap emails by name). Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- .mailmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 69301bd6e6..e370e867ad 100644 --- a/.mailmap +++ b/.mailmap @@ -54,7 +54,7 @@ Li Hong Lukas Sandström Marc-André Lureau Mark Rada -Martin Langhoff +Martin Langhoff Martin von Zweigbergk Michael Coleman Michael J Gruber From c4878fd924c922d12fdfe4e97e4559679342c583 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 12 Dec 2012 06:38:26 -0500 Subject: [PATCH 3/5] .mailmap: normalize emails for Jeff King I never meant anything special by using my @github.com address; it is merely a mistake that it has sometimes bled through to patches. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index e370e867ad..4a27b7fe0f 100644 --- a/.mailmap +++ b/.mailmap @@ -31,6 +31,7 @@ Horst H. von Brand İsmail Dönmez Jakub Narębski Jay Soffian +Jeff King Joachim Berdal Haga Johannes Sixt Johannes Sixt From 0e230644273ba903d6bc57bdf6db82acf4fad4d3 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 12 Dec 2012 06:41:04 -0500 Subject: [PATCH 4/5] .mailmap: normalize emails for Linus Torvalds Linus used a lot of different per-machine email addresses in the early days. This means that "git shortlog -nse" does not aggregate his counts, and he is listed well below where he should be (8th instead of 3rd). 
Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- .mailmap | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.mailmap b/.mailmap index 4a27b7fe0f..c7e8618300 100644 --- a/.mailmap +++ b/.mailmap @@ -52,6 +52,12 @@ Kent Engstrom Lars Doelle Lars Doelle Li Hong +Linus Torvalds +Linus Torvalds +Linus Torvalds +Linus Torvalds +Linus Torvalds +Linus Torvalds Lukas Sandström Marc-André Lureau Mark Rada From 53474eb92ff0571e0b1eacd88d638692b96a2018 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Wed, 12 Dec 2012 06:41:41 -0500 Subject: [PATCH 5/5] contrib: update stats/mailmap script This version changes quite a few things: 1. The original parsed the mailmap file itself, and it did it wrong (it did not understand entries with an extra email key). Instead, this version uses git's "%aE" and "%aN" formats to have git perform the mapping, meaning we do not have to read .mailmap at all, but still operate on the current state that git sees (and it also works properly from subdirs). 2. The original would find multiple names for an email, but not the other way around. This version can do either or both. If we find multiple emails for a name, the resolution is less obvious than the other way around. However, it can still be a starting point for a human to investigate. 3. The original would order only by count, not by recency. This version can do either. Combined with showing the counts, it can be easier to decide how to resolve. 4. This version shows similar entries in a blank-delimited stanza, which makes it more clear which options you are picking from. 
Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- contrib/stats/mailmap.pl | 96 ++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 32 deletions(-) diff --git a/contrib/stats/mailmap.pl b/contrib/stats/mailmap.pl index 4b852e2455..9513f5e35b 100755 --- a/contrib/stats/mailmap.pl +++ b/contrib/stats/mailmap.pl @@ -1,38 +1,70 @@ -#!/usr/bin/perl -w -my %mailmap = (); -open I, "<", ".mailmap"; -while (<I>) { - chomp; - next if /^#/; - if (my ($author, $mail) = /^(.*?)\s+<(.+)>$/) { - $mailmap{$mail} = $author; - } -} -close I; +#!/usr/bin/perl -my %mail2author = (); -open I, "git log --pretty='format:%ae %an' |"; -while (<I>) { - chomp; - my ($mail, $author) = split(/\t/, $_); - next if exists $mailmap{$mail}; - $mail2author{$mail} ||= {}; - $mail2author{$mail}{$author} ||= 0; - $mail2author{$mail}{$author}++; -} -close I; +use warnings 'all'; +use strict; +use Getopt::Long; -while (my ($mail, $authorcount) = each %mail2author) { - # %$authorcount is ($author => $count); - # sort and show the names from the most frequent ones. - my @names = (map { $_->[0] } - sort { $b->[1] <=> $a->[1] } - map { [$_, $authorcount->{$_}] } - keys %$authorcount); - if (1 < @names) { - for (@names) { - print "$_ <$mail>\n"; +my $match_emails; +my $match_names; +my $order_by = 'count'; +Getopt::Long::Configure(qw(bundling)); +GetOptions( + 'emails|e!' => \$match_emails, + 'names|n!' 
=> \$match_names, + 'count|c' => sub { $order_by = 'count' }, + 'time|t' => sub { $order_by = 'stamp' }, +) or exit 1; +$match_emails = 1 unless $match_names; + +my $email = {}; +my $name = {}; + +open(my $fh, '-|', "git log --format='%at <%aE> %aN'"); +while(<$fh>) { + my ($t, $e, $n) = /(\S+) <(\S+)> (.*)/; + mark($email, $e, $n, $t); + mark($name, $n, $e, $t); +} +close($fh); + +if ($match_emails) { + foreach my $e (dups($email)) { + foreach my $n (vals($email->{$e})) { + show($n, $e, $email->{$e}->{$n}); } + print "\n"; } } +if ($match_names) { + foreach my $n (dups($name)) { + foreach my $e (vals($name->{$n})) { + show($n, $e, $name->{$n}->{$e}); + } + print "\n"; + } +} +exit 0; +sub mark { + my ($h, $k, $v, $t) = @_; + my $e = $h->{$k}->{$v} ||= { count => 0, stamp => 0 }; + $e->{count}++; + $e->{stamp} = $t unless $t < $e->{stamp}; +} + +sub dups { + my $h = shift; + return grep { keys($h->{$_}) > 1 } keys($h); +} + +sub vals { + my $h = shift; + return sort { + $h->{$b}->{$order_by} <=> $h->{$a}->{$order_by} + } keys($h); +} + +sub show { + my ($n, $e, $h) = @_; + print "$n <$e> ($h->{$order_by})\n"; +}