From ac4bbb41b25949adff3afa96b2821f648abad8b3 Mon Sep 17 00:00:00 2001 From: Matthieu Moy Date: Mon, 16 Jul 2012 14:00:46 +0200 Subject: [PATCH 01/10] git-remote-mediawiki: don't split namespaces with spaces Signed-off-by: Matthieu Moy Signed-off-by: Junio C Hamano --- contrib/mw-to-git/git-remote-mediawiki | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki index accd70a94c..a6ad8cf33c 100755 --- a/contrib/mw-to-git/git-remote-mediawiki +++ b/contrib/mw-to-git/git-remote-mediawiki @@ -1275,7 +1275,7 @@ sub get_mw_namespace_id { # Look at configuration file, if the record for that namespace is # already cached. Namespaces are stored in form: # "Name_of_namespace:Id_namespace", ex.: "File:6". - my @temp = split(/[ \n]/, run_git("config --get-all remote." + my @temp = split(/[\n]/, run_git("config --get-all remote." . $remotename .".namespaceCache")); chomp(@temp); foreach my $ns (@temp) { From dcb1ea620d1e8bd1d7e7622b7ca69050d0c119d7 Mon Sep 17 00:00:00 2001 From: Matthieu Moy Date: Mon, 16 Jul 2012 14:00:47 +0200 Subject: [PATCH 02/10] git-remote-mediawiki: actually send empty comment when they're empty Signed-off-by: Matthieu Moy Signed-off-by: Junio C Hamano --- contrib/mw-to-git/git-remote-mediawiki | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki index a6ad8cf33c..a2da52f4d6 100755 --- a/contrib/mw-to-git/git-remote-mediawiki +++ b/contrib/mw-to-git/git-remote-mediawiki @@ -51,6 +51,9 @@ use constant EMPTY_CONTENT => "\n"; # used to reflect file creation or deletion in diff. 
use constant NULL_SHA1 => "0000000000000000000000000000000000000000"; +# Used on Git's side to reflect empty edit messages on the wiki +use constant EMPTY_MESSAGE => '*Empty MediaWiki Message*'; + my $remotename = $ARGV[0]; my $url = $ARGV[1]; @@ -935,7 +938,7 @@ sub mw_import_revids { my %commit; $commit{author} = $rev->{user} || 'Anonymous'; - $commit{comment} = $rev->{comment} || '*Empty MediaWiki Message*'; + $commit{comment} = $rev->{comment} || EMPTY_MESSAGE; $commit{title} = mediawiki_smudge_filename($page_title); $commit{mw_revision} = $rev->{revid}; $commit{content} = mediawiki_smudge($rev->{'*'}); @@ -1050,6 +1053,10 @@ sub mw_push_file { my $oldrevid = shift; my $newrevid; + if ($summary eq EMPTY_MESSAGE) { + $summary = ''; + } + my $new_sha1 = $diff_info_split[3]; my $old_sha1 = $diff_info_split[2]; my $page_created = ($old_sha1 eq NULL_SHA1); From 2045e293eb69204ac06910c7570718efb09b7059 Mon Sep 17 00:00:00 2001 From: Matthieu Moy Date: Mon, 16 Jul 2012 21:46:37 +0200 Subject: [PATCH 03/10] git-remote-mediawiki: make mediafiles export optional It is possible to use git-remote-mediawiki on a tree with both .mw files and other files. Before git-remote-mediawiki learnt how to export mediafiles, such mixed trees allowed the user to maintain both the wiki and other files for the same project in the same repository. With the newly added support for exporting mediafiles, pushing such mixed trees would upload unrelated files as mediafiles, which may not be desired. 
Signed-off-by: Matthieu Moy Signed-off-by: Junio C Hamano --- contrib/mw-to-git/git-remote-mediawiki | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki index a2da52f4d6..8e46e4e7c7 100755 --- a/contrib/mw-to-git/git-remote-mediawiki +++ b/contrib/mw-to-git/git-remote-mediawiki @@ -66,11 +66,16 @@ chomp(@tracked_pages); my @tracked_categories = split(/[ \n]/, run_git("config --get-all remote.". $remotename .".categories")); chomp(@tracked_categories); -# Import media files too. +# Import media files on pull my $import_media = run_git("config --get --bool remote.". $remotename .".mediaimport"); chomp($import_media); $import_media = ($import_media eq "true"); +# Export media files on push +my $export_media = run_git("config --get --bool remote.". $remotename .".mediaexport"); +chomp($export_media); +$export_media = !($export_media eq "false"); + my $wiki_login = run_git("config --get remote.". $remotename .".mwLogin"); # Note: mwPassword is discourraged. Use the credential system instead. my $wiki_passwd = run_git("config --get remote.". 
$remotename .".mwPassword"); @@ -1068,6 +1073,11 @@ sub mw_push_file { $extension = ""; } if ($extension eq "mw") { + my $ns = get_mw_namespace_id_for_page($complete_file_name); + if ($ns && $ns == get_mw_namespace_id("File") && (!$export_media)) { + print STDERR "Ignoring media file related page: $complete_file_name\n"; + return ($oldrevid, "ok"); + } my $file_content; if ($page_deleted) { # Deleting a page usually requires @@ -1107,10 +1117,12 @@ sub mw_push_file { } $newrevid = $result->{edit}->{newrevid}; print STDERR "Pushed file: $new_sha1 - $title\n"; - } else { + } elsif ($export_media) { $newrevid = mw_upload_file($complete_file_name, $new_sha1, $extension, $page_deleted, $summary); + } else { + print STDERR "Ignoring media file $title\n"; } $newrevid = ($newrevid or $oldrevid); return ($newrevid, "ok"); @@ -1328,3 +1340,11 @@ sub get_mw_namespace_id { die "No such namespace $name on MediaWiki."; } } + +sub get_mw_namespace_id_for_page { + if (my ($namespace) = $_[0] =~ /^([^:]*):/) { + return get_mw_namespace_id($namespace); + } else { + return; + } +} From 93e92d4d7c8883b346c8975c636286d6f6479142 Mon Sep 17 00:00:00 2001 From: Matthieu Moy Date: Mon, 16 Jul 2012 21:46:38 +0200 Subject: [PATCH 04/10] git-remote-mediawiki: get rid of O(N^2) loop The algorithm to find a path from the local revision to the remote one was calling "git rev-list" and parsing its output N times. Run rev-list only once, and fill a hashtable with the result to optimize the body of the loop. 
Signed-off-by: Matthieu Moy Signed-off-by: Junio C Hamano --- contrib/mw-to-git/git-remote-mediawiki | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki index 8e46e4e7c7..fb1e9e09e4 100755 --- a/contrib/mw-to-git/git-remote-mediawiki +++ b/contrib/mw-to-git/git-remote-mediawiki @@ -1196,16 +1196,26 @@ sub mw_push_revision { if ($last_local_revid > 0) { my $parsed_sha1 = $remoteorigin_sha1; # Find a path from last MediaWiki commit to pushed commit + print STDERR "Computing path from local to remote ...\n"; + my @local_ancestry = split(/\n/, run_git("rev-list --boundary --parents $local ^$parsed_sha1")); + my %local_ancestry; + foreach my $line (@local_ancestry) { + if (my ($child, $parents) = $line =~ m/^-?([a-f0-9]+) ([a-f0-9 ]+)/) { + foreach my $parent (split(' ', $parents)) { + $local_ancestry{$parent} = $child; + } + } elsif (!$line =~ m/^([a-f0-9]+)/) { + die "Unexpected output from git rev-list: $line"; + } + } while ($parsed_sha1 ne $HEAD_sha1) { - my @commit_info = grep(/^$parsed_sha1/, split(/\n/, run_git("rev-list --children $local"))); - if (!@commit_info) { + my $child = $local_ancestry{$parsed_sha1}; + if (!$child) { + printf STDERR "Cannot find a path in history from remote commit to last commit\n"; return error_non_fast_forward($remote); } - my @commit_info_split = split(/ |\n/, $commit_info[0]); - # $commit_info_split[1] is the sha1 of the commit to export - # $commit_info_split[0] is the sha1 of its direct child - push(@commit_pairs, \@commit_info_split); - $parsed_sha1 = $commit_info_split[1]; + push(@commit_pairs, [$parsed_sha1, $child]); + $parsed_sha1 = $child; } } else { # No remote mediawiki revision. 
Export the whole From 5c481745fbed5aa0eb63726c9e05538ededa63bb Mon Sep 17 00:00:00 2001 From: Matthieu Moy Date: Mon, 16 Jul 2012 21:46:39 +0200 Subject: [PATCH 05/10] git-remote-mediawiki: use --force when adding notes When notes are created to record a push, they normally don't exist yet. However, when a push is interrupted and then restarted, it may happen that a commit already has notes attached, and we want to reflect the newly created remote revision, hence use 'git notes add -f' to override the existing ones. Signed-off-by: Matthieu Moy Signed-off-by: Junio C Hamano --- contrib/mw-to-git/git-remote-mediawiki | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki index fb1e9e09e4..517a4db3ff 100755 --- a/contrib/mw-to-git/git-remote-mediawiki +++ b/contrib/mw-to-git/git-remote-mediawiki @@ -1263,7 +1263,7 @@ sub mw_push_revision { } } unless ($dumb_push) { - run_git("notes --ref=$remotename/mediawiki add -m \"mediawiki_revision: $mw_revision\" $sha1_commit"); + run_git("notes --ref=$remotename/mediawiki add -f -m \"mediawiki_revision: $mw_revision\" $sha1_commit"); run_git("update-ref -m \"Git-MediaWiki push\" refs/mediawiki/$remotename/master $sha1_commit $sha1_child"); } } From f690ddfdb7d06eaf5f62d8deb969289d59bed9cd Mon Sep 17 00:00:00 2001 From: Matthieu Moy Date: Mon, 16 Jul 2012 21:46:40 +0200 Subject: [PATCH 06/10] git-remote-mediawiki: show progress information when listing pages Initial phases of push and pull with git-remote-mediawiki can be long on a large wiki. Let the user know what's going on. 
Signed-off-by: Matthieu Moy Signed-off-by: Junio C Hamano --- contrib/mw-to-git/git-remote-mediawiki | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki index 517a4db3ff..729a0bc842 100755 --- a/contrib/mw-to-git/git-remote-mediawiki +++ b/contrib/mw-to-git/git-remote-mediawiki @@ -371,6 +371,8 @@ sub get_mw_first_pages { sub get_mw_pages { mw_connect_maybe(); + print STDERR "Listing pages on remote wiki...\n"; + my %pages; # hash on page titles to avoid duplicates my $user_defined; if (@tracked_pages) { @@ -394,6 +396,7 @@ sub get_mw_pages { get_all_mediafiles(\%pages); } } + print STDERR (scalar keys %pages) . " pages found.\n"; return %pages; } From a393f48823ba619092f8057ed807a20358e0165c Mon Sep 17 00:00:00 2001 From: Matthieu Moy Date: Mon, 16 Jul 2012 21:46:41 +0200 Subject: [PATCH 07/10] git-remote-mediawiki: show progress information when getting last remote revision Signed-off-by: Matthieu Moy Signed-off-by: Junio C Hamano --- contrib/mw-to-git/git-remote-mediawiki | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki index 729a0bc842..8badff601c 100755 --- a/contrib/mw-to-git/git-remote-mediawiki +++ b/contrib/mw-to-git/git-remote-mediawiki @@ -579,6 +579,8 @@ sub get_last_remote_revision { my $max_rev_num = 0; + print STDERR "Getting last revision id on tracked pages...\n"; + foreach my $page (@pages) { my $id = $page->{pageid}; From ebd5fe1c15b8e576042017d6b07b7e0ed96eaf66 Mon Sep 17 00:00:00 2001 From: Matthieu Moy Date: Mon, 16 Jul 2012 21:46:42 +0200 Subject: [PATCH 08/10] git-remote-mediawiki: properly deal with invalid remote revisions Some wikis, including https://git.wiki.kernel.org/, have invalid revision numbers (i.e. the actual revision numbers are non-contiguous). Don't die when encountering one. 
Signed-off-by: Matthieu Moy Signed-off-by: Junio C Hamano --- contrib/mw-to-git/git-remote-mediawiki | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki index 8badff601c..5eab96b63c 100755 --- a/contrib/mw-to-git/git-remote-mediawiki +++ b/contrib/mw-to-git/git-remote-mediawiki @@ -910,6 +910,10 @@ sub mw_import_revids { my $last_timestamp = 0; # Placeholer in case $rev->timestamp is undefined foreach my $pagerevid (@$revision_ids) { + # Count page even if we skip it, since we display + # $n/$total and $total includes skipped pages. + $n++; + # fetch the content of the pages my $query = { action => 'query', @@ -924,6 +928,11 @@ sub mw_import_revids { die "Failed to retrieve modified page for revision $pagerevid"; } + if (defined($result->{query}->{badrevids}->{$pagerevid})) { + # The revision id does not exist on the remote wiki. + next; + } + if (!defined($result->{query}->{pages})) { die "Invalid revision $pagerevid."; } @@ -932,10 +941,6 @@ sub mw_import_revids { my $result_page = $result_pages[0]; my $rev = $result_pages[0]->{revisions}->[0]; - # Count page even if we skip it, since we display - # $n/$total and $total includes skipped pages. 
- $n++; - my $page_title = $result_page->{title}; if (!exists($pages->{$page_title})) { From fa316e090a58421c7caf50eefa58f6cb30c6df89 Mon Sep 17 00:00:00 2001 From: Matthieu Moy Date: Tue, 17 Jul 2012 16:05:59 +0200 Subject: [PATCH 09/10] git-remote-mediawiki: fix incorrect test usage in test Signed-off-by: Matthieu Moy Signed-off-by: Junio C Hamano --- contrib/mw-to-git/t/push-pull-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/mw-to-git/t/push-pull-tests.sh b/contrib/mw-to-git/t/push-pull-tests.sh index 6692a0f40f..9da2dc5ff0 100644 --- a/contrib/mw-to-git/t/push-pull-tests.sh +++ b/contrib/mw-to-git/t/push-pull-tests.sh @@ -104,7 +104,7 @@ test_push_pull () { git push ) && - test ! wiki_page_exist Foo + test_must_fail wiki_page_exist Foo ' test_expect_success 'Merge conflict expected and solving it' ' From 2272400c93332629e8c34dcf3b1207159352e539 Mon Sep 17 00:00:00 2001 From: Matthieu Moy Date: Tue, 17 Jul 2012 16:06:00 +0200 Subject: [PATCH 10/10] git-remote-mediawiki: allow page names with a ':' Traditionally, pages named Foo:Bar are page 'Bar' in namespace 'Foo'. However, it is also possible to call a page Foo:Bar if 'Foo' is not a namespace. In this case, the actual name of the page is 'Foo:Bar', in the main namespace. Since we can't tell with only the filename, query the wiki for a namespace 'Foo' in these cases, but deal with the case where no such namespace is found. Signed-off-by: Matthieu Moy Signed-off-by: Junio C Hamano --- contrib/mw-to-git/git-remote-mediawiki | 49 ++++++++++++++------- contrib/mw-to-git/t/t9362-mw-to-git-utf8.sh | 20 +++++++++ 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki index 5eab96b63c..dc137409b1 100755 --- a/contrib/mw-to-git/git-remote-mediawiki +++ b/contrib/mw-to-git/git-remote-mediawiki @@ -968,8 +968,11 @@ sub mw_import_revids { # Differentiates classic pages and media files. 
my ($namespace, $filename) = $page_title =~ /^([^:]*):(.*)$/; my %mediafile; - if ($namespace && get_mw_namespace_id($namespace) == get_mw_namespace_id("File")) { - %mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp}); + if ($namespace) { + my $id = get_mw_namespace_id($namespace); + if ($id && $id == get_mw_namespace_id("File")) { + %mediafile = get_mw_mediafile_for_page_revision($filename, $rev->{timestamp}); + } } # If this is a revision of the media page for new version # of a file do one common commit for both file and media page. @@ -1319,7 +1322,11 @@ sub get_mw_namespace_id { chomp(@temp); foreach my $ns (@temp) { my ($n, $id) = split(/:/, $ns); - $namespace_id{$n} = $id; + if ($id eq 'notANameSpace') { + $namespace_id{$n} = {is_namespace => 0}; + } else { + $namespace_id{$n} = {is_namespace => 1, id => $id}; + } $cached_mw_namespace_id{$n} = 1; } } @@ -1337,28 +1344,38 @@ sub get_mw_namespace_id { while (my ($id, $ns) = each(%{$result->{query}->{namespaces}})) { if (defined($ns->{id}) && defined($ns->{canonical})) { - $namespace_id{$ns->{canonical}} = $ns->{id}; + $namespace_id{$ns->{canonical}} = {is_namespace => 1, id => $ns->{id}}; if ($ns->{'*'}) { # alias (e.g. french Fichier: as alias for canonical File:) - $namespace_id{$ns->{'*'}} = $ns->{id}; + $namespace_id{$ns->{'*'}} = {is_namespace => 1, id => $ns->{id}}; } } } } - my $id = $namespace_id{$name}; + my $ns = $namespace_id{$name}; + my $id; - if (defined $id) { - # Store explicitely requested namespaces on disk - if (!exists $cached_mw_namespace_id{$name}) { - run_git("config --add remote.". $remotename - .".namespaceCache \"". $name .":". 
$id ."\""); - $cached_mw_namespace_id{$name} = 1; - } - return $id; - } else { - die "No such namespace $name on MediaWiki."; + unless (defined $ns) { + print STDERR "No such namespace $name on MediaWiki.\n"; + $ns = {is_namespace => 0}; + $namespace_id{$name} = $ns; } + + if ($ns->{is_namespace}) { + $id = $ns->{id}; + } + + # Store "notANameSpace" as special value for inexisting namespaces + my $store_id = ($id || 'notANameSpace'); + + # Store explicitely requested namespaces on disk + if (!exists $cached_mw_namespace_id{$name}) { + run_git("config --add remote.". $remotename + .".namespaceCache \"". $name .":". $store_id ."\""); + $cached_mw_namespace_id{$name} = 1; + } + return $id; } sub get_mw_namespace_id_for_page { diff --git a/contrib/mw-to-git/t/t9362-mw-to-git-utf8.sh b/contrib/mw-to-git/t/t9362-mw-to-git-utf8.sh index 8635878452..246d47d8fb 100755 --- a/contrib/mw-to-git/t/t9362-mw-to-git-utf8.sh +++ b/contrib/mw-to-git/t/t9362-mw-to-git-utf8.sh @@ -169,6 +169,26 @@ test_expect_failure 'special character at the begining of file name from mw to g test_path_is_file mw_dir_11/[char_2 ' +test_expect_success 'Pull page with title containing ":" other than namespace separator' ' + wiki_editpage Foo:Bar content false && + ( + cd mw_dir_11 && + git pull + ) && + test_path_is_file mw_dir_11/Foo:Bar.mw +' + +test_expect_success 'Push page with title containing ":" other than namespace separator' ' + ( + cd mw_dir_11 && + echo content >NotANameSpace:Page.mw && + git add NotANameSpace:Page.mw && + git commit -m "add page with colon" && + git push + ) && + wiki_page_exist NotANameSpace:Page +' + test_expect_success 'test of correct formating for file name from mw to git' ' wiki_reset && git clone mediawiki::'"$WIKI_URL"' mw_dir_12 &&