#!/usr/bin/perl
use strict;
use warnings;

# A simple example on how to migrate a TWiki to Google Sites automatically.
# Copyright (C) 2010 Ivan Zahariev (famzah)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

use LWP; # http://www.perl.com/pub/a/2002/08/20/perlandlwp.html
use HTML::Entities;
use URI::Escape;
use Data::Dumper;
use XML::Simple;
use Text::WikiFormat;

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

my $GoogleEmail      = '';
my $GooglePasswd     = '';
my $GoogleDomainName = 'site';
my $GoogleSiteName   = '';

# Directory holding one sub-directory per TWiki web, each with *.txt topics.
my $wikidata = '/home/famzah/Documents/wiki-data';

my @webs_to_import = qw/
    CppMysql
    Development
    Electronics
    Linux
    RealLife
/;

# Topics that are never migrated.  WebHome is listed here, but
# analyze_pages() renames it to '^webhome^' before consulting this list, so
# the home topic of every web is still imported.
my @ignore_pages = qw/
    WebAtom
    WebChanges
    WebHome
    WebIndex
    WebLeftBar
    WebNotify
    WebPreferences
/;

# Shared state, initialised in main().
my $ua;     # LWP::UserAgent used for every request
my $atoken; # ClientLogin "Auth" token returned by auth()

# ---------------------------------------------------------------------------
# http_req($type, $url, $form_values [, $headers, $content, $possible_error_codes])
#
# Performs one HTTP request through the shared user agent.
#
#   $type                 'POST' (form post; custom headers are not supported),
#                         'GET'  (form values are appended as a query string),
#                         'XML'  (POST of $content as application/atom+xml).
#   $form_values          hash ref of form fields (POST / GET).
#   $headers              array ref of name => value pairs (GET / XML).
#   $content              request body for 'XML'; also dumped on failure.
#   $possible_error_codes array ref of HTTP status codes that must NOT be
#                         treated as fatal.
#
# Returns ($http_code, \@body_lines, $response_object).
# Dies on an unknown $type, an odd-length header list, or any unsuccessful
# response whose code is not listed in $possible_error_codes.
# ---------------------------------------------------------------------------
sub http_req($$$;$$$) {
    my ($type, $url, $form_values, $headers, $content, $possible_error_codes) = @_;

    $headers              = [] if !defined($headers);
    $possible_error_codes = [] if !defined($possible_error_codes);

    if (scalar @{$headers} % 2 != 0) {
        die("Headers not even");
    }

    my $response;
    if ($type eq 'POST') { # no support for custom headers
        $response = $ua->post($url, $form_values);
    }
    elsif ($type eq 'GET') {
        my $query = join(
            '&',
            map { uri_escape($_) . '=' . uri_escape($form_values->{$_}) }
                sort keys %{$form_values}
        );
        my $full_url = $url;
        if (length($query)) {
            # Do not add a second '?' when the URL already has a query string.
            $full_url .= (($url =~ /\?/) ? '&' : '?') . $query;
        }
        $response = $ua->get($full_url, @{$headers});
    }
    elsif ($type eq 'XML') {
        my $request = HTTP::Request->new(POST => $url);
        $request->content_type('application/atom+xml');

        # Walk a copy, so the caller's header list is left untouched.
        my @pairs = @{$headers};
        while (my ($name, $value) = splice(@pairs, 0, 2)) {
            $request->header($name, $value);
        }

        $request->content($content);
        $response = $ua->request($request);
    }
    else {
        die("Bad request type: $type");
    }

    if (!$response->is_success) {
        my $code = $response->code;
        if (!grep({ $code == $_ } @{$possible_error_codes})) {
            print Dumper($content)."\n";
            die("$type request to '$url' failed: ".$response->status_line.": ".$response->content);
        }
    }

    my @lines = split(/\n/, $response->content);

    return ($response->code, \@lines, $response);
}

# ---------------------------------------------------------------------------
# auth()
#
# Logs in with $GoogleEmail / $GooglePasswd through Google ClientLogin and
# returns the value of the "Auth=" line of the response.
# Dies when the login is rejected (HTTP 403) or no "Auth=" line is present.
#
# NOTE(review): Google has since retired ClientLogin - confirm the endpoint
# is still usable before relying on this script.
# ---------------------------------------------------------------------------
sub auth() {
    my ($r_code, $r_lines) = http_req(
        'POST',
        'https://www.google.com/accounts/ClientLogin',
        {
            'accountType' => 'HOSTED_OR_GOOGLE',
            'Email'       => $GoogleEmail,
            'Passwd'      => $GooglePasswd,
            'service'     => 'jotspot', # http://code.google.com/intl/bg/apis/gdata/faq.html#clientlogin
            'source'      => 'famzah-twikimigrator-1.0',
        },
        [],    # headers
        [],    # content
        [403], # a rejected login is reported below with a friendlier message
    );

    if ($r_code == 403) {
        die(
            "Authentication failed, see http://code.google.com/intl/bg/apis/accounts/docs/AuthForInstalledApps.html ".
            "for more info on how to react:\n".Dumper($r_lines)
        );
    }

    foreach my $body_line (@{$r_lines}) {
        if ($body_line =~ /^Auth=(\S+)$/) {
            return $1;
        }
    }

    die("Authentication failed. HTTP code was successful but the body contains no 'Auth':\n".Dumper($r_lines));
}

# ---------------------------------------------------------------------------
# camelcase2dashes($page)
#
# Converts a TWiki WikiWord into a lower-case, dash-separated page name,
# e.g. "LinuxTips" -> "linux-tips".  "MySQL" and "OpenSSL" are kept as
# single words.
# ---------------------------------------------------------------------------
sub camelcase2dashes($) {
    my ($page) = @_;

    $page =~ s/([a-z0-9])([A-Z])/$1-$2/g;             # fooBar  -> foo-Bar
    $page =~ s/(\-[A-Z]+?)([A-Z][a-z0-9])/$1-$2/g;    # -ABCdef -> -AB-Cdef
    $page =~ s/My-SQL/MySQL/g;
    $page =~ s/Open-SSL/OpenSSL/g;

    return lc($page);
}

# ---------------------------------------------------------------------------
# render_wiki_to_html($page_content)
#
# Converts raw TWiki topic text to HTML: TWiki markup is first rewritten to
# the MediaWiki-like syntax understood by Text::WikiFormat, then formatted.
# Returns the resulting HTML string.
#
# NOTE(review): the copy of this file that was repaired here had every
# HTML-looking token stripped.  The literals '<hr />', '<br>', '<BR>',
# '<br />' and '<verbatim><p>' below are reconstructions inferred from the
# surviving closing-tag patterns - verify them against the upstream script.
# ---------------------------------------------------------------------------
sub render_wiki_to_html($) {
    my ($page_content) = @_;

    my @lines = split(/\n/, $page_content);
    foreach my $line (@lines) { # convert TWiki to MediaWiki tags, because that's what Text::WikiFormat expects
        if ($line =~ /^(\s+)(\*.+)$/) {
            my ($indent, $bullet) = ($1, $2);
            if (length($indent) % 3 == 0) {
                # TWiki indents bullets by 3 spaces per level; emit 4 per level.
                $line = (' ' x ((length($indent) / 3) * 4)) . $bullet;
            }
            else {
                warn "Bad ident: $line";
            }
        }

        if ($line =~ /^\Q---+\E (.+)$/) {
            $line = "= $1 =";
        }
        if ($line =~ /^\Q---++\E (.+)$/) {
            $line = "== $1 ==";
        }
        if ($line =~ /^\Q---+++\E (.+)$/) {
            $line = "=== $1 ===";
        }
        #$line =~ s/\*(.+?)\*/'''$1'''/g;

        if ($line =~ /^\-{3,}\s*$/) {
            $line = '<hr />';
        }

        # various tries to make this HTML compliant and accepted by Google API
        $line =~ s/<br>/<br \/>/g;
        $line =~ s/<BR>/<br \/>/g;
        $line =~ s/<verbatim>/<verbatim><p>/g;
        $line =~ s/<\/verbatim>/<\/p><\/verbatim>/g;
    }
    $page_content = join("\n", @lines);

    # The capture group keeps the delimiters in the list; every chunk that is
    # not itself a delimiter gets entity-encoded.
    my @chunks = split(/(<verbatim><p>|<\/p><\/verbatim>)/, $page_content);
    foreach my $chunk (@chunks) {
        if ($chunk eq '<verbatim><p>' || $chunk eq '</p></verbatim>') {
            next;
        }
        $chunk = HTML::Entities::encode_entities_numeric($chunk); # contents inside must be HTML escaped
    }
    $page_content = join('', @chunks);

    $page_content = Text::WikiFormat::format(
        $page_content,
        {
            strong_tag => qr/\*(.+?)\*/, # TWiki
        }
    );

    # FIXME(review): the original continued here with a step commented
    # "fix only URL links", which split the HTML on anchor elements
    # (the surviving fragment is: split(/(...[^<]+<\/a>)/, $page_content))
    # and rewrote them.  Its body was lost and is deliberately NOT guessed
    # at; restore it from the upstream script.

    return $page_content;
}

# ---------------------------------------------------------------------------
# analyze_pages()
#
# Reads every topic file of every web listed in @webs_to_import and returns
#
#   { $web => { $page => { filename, content, uploaded [, parent] } } }
#
# where $page is the dashed lower-case page name ('^webhome^' for the web's
# home topic), 'content' is the rendered HTML and 'uploaded' starts at 0.
# 'parent' is 'WebHome' or a dashed page name, so that it can be compared
# with the page keys by upload_pages().
#
# NOTE(review): the beginning of this sub (declarations and the loop over
# @webs_to_import) was lost in the repaired copy and has been reconstructed.
# ---------------------------------------------------------------------------
sub analyze_pages() {
    my $webpages = {};

    foreach my $web (@webs_to_import) {
        $webpages->{$web} = {};

        foreach my $pagefile (glob("$wikidata/$web/*.txt")) {
            if ($pagefile !~ /^.+\/\Q$web\E\/(.+)\.txt$/) {
                die("Unable to extract page name: $pagefile");
            }
            my $page = $1;

            if ($page eq 'WebHome') {
                $page = '^webhome^'; # keeps the home topic out of @ignore_pages
            }
            if (grep({ $page eq $_ } @ignore_pages)) {
                next;
            }

            print "\t$page => ";
            $page = camelcase2dashes($page);
            print "$page\n";

            my $info = { 'filename' => $pagefile };
            $webpages->{$web}->{$page} = $info;

            # Read the topic directly; no shell is involved, so unusual file
            # names cannot break (or inject into) a command line.
            open(my $fh, '<', $pagefile) or die("Unable to read '$pagefile': $!");
            my @lines = <$fh>;
            close($fh);

            my $content = '';
            foreach my $line (@lines) {
                if ($line =~ /^%META:TOPICPARENT\{name="(\S+)"\}%\s*$/) {
                    my $parent = $1;
                    # Page keys are dashed lower-case names, so the parent
                    # has to be normalised the same way or it never matches.
                    $parent =~ s/^\Q$web\E\.//; # "Web.Topic" inside this web
                    if ($parent ne 'WebHome') {
                        $parent = camelcase2dashes($parent);
                    }
                    $info->{'parent'} = $parent;
                    next;
                }
                if ($line =~ /^%META:(TOPICINFO|TOPICMOVED)\{.+%\s*$/) {
                    next;
                }
                if ($line =~ /^%META:FILEATTACHMENT\{.+%\s*$/) {
                    $content .= $line; # kept verbatim in the page body
                    next;
                }
                if ($line =~ /^%META:(.+)%\s*$/) {
                    warn "META not parsed: $1\n";
                }
                $content .= $line;
            }

            $info->{'content'}  = render_wiki_to_html($content);
            $info->{'uploaded'} = 0;
        }
    }

    return $webpages;
}

# ---------------------------------------------------------------------------
# get_created_pages()
#
# Walks the (paginated) content feed of the site and returns a hash ref
# mapping each page path (relative to the site root) to its entry ID.
# Dies when an entry has no parsable rel="alternate" link.
# ---------------------------------------------------------------------------
sub get_created_pages() {
    my %ret = ();
    my $req_url = "https://sites.google.com/feeds/content/$GoogleDomainName/$GoogleSiteName?kind=webpage";

    do {
        #print "REQUEST: $req_url\n";
        my ($r_code, $r_lines) = http_req(
            'GET',
            $req_url,
            {}, # we'll compose the GET parameters manually
            [
                'GData-Version' => '1.2',
                'Authorization' => "GoogleLogin auth=$atoken", # http://code.google.com/intl/bg/apis/gdata/docs/auth/overview.html#ClientLogin
            ],
            [], # content
        );

        # ForceArray keeps the structure stable when the feed contains a
        # single <entry> or a single <link>; without it XML::Simple returns a
        # plain hash there and the iterations below break.
        my $pages = XML::Simple->new->XMLin(
            join('', @{$r_lines}),
            ForceArray => ['entry', 'link'],
        );
        #print Dumper($pages);
        #print Dumper($pages->{'entry'});
        #print join('', @{$r_lines});

        # XML::Simple folds the entries into a hash keyed by their <id>.
        foreach my $entry (keys %{$pages->{'entry'}}) {
            if ($entry !~ /\/\Q$GoogleDomainName\E\/\Q$GoogleSiteName\E\/(\d+)$/) {
                #die("Unable to parse ID: $entry");
                next;
            }
            my $id = $1;

            my $page_url;
            foreach my $lobj (@{$pages->{'entry'}->{$entry}->{'link'}}) {
                next if $lobj->{'rel'} ne 'alternate';
                if ($lobj->{'href'} !~ /\/\Q$GoogleDomainName\E\/\Q$GoogleSiteName\E\/(.+)$/) {
                    die("Unable to parse URL: ".$lobj->{'href'});
                }
                $page_url = $1;
                last;
            }
            if (!defined($page_url)) {
                die("Unable to find URL: ".Dumper($pages->{'entry'}->{$entry}));
            }

            # A duplicate path silently overwrites the earlier ID.
            #die("Duplicate URL: $page_url") if exists($ret{$page_url});
            $ret{$page_url} = $id;
        }

        $req_url = undef;
        if (exists($pages->{'link'})) { # get next page
            foreach my $feed_link (@{$pages->{'link'}}) {
                if ($feed_link->{'rel'} eq 'next') {
                    $req_url = $feed_link->{'href'};
                    last;
                }
            }
        }
    } while (defined($req_url));

    return \%ret;
}

# ---------------------------------------------------------------------------
# create_page($webpage, $page_content [, $parentpage])
#
# Creates one page named $webpage with the HTML body $page_content.
# $parentpage, when given, is the site path of the NEW page
# ("parent/path/page"); its last component is dropped to find the parent,
# which must already exist on the site.
#
# A "Duplicate insert" conflict (HTTP 409) is reported with a warning and
# skipped.  Dies on a bad page name, a missing parent or any other failure.
#
# NOTE(review): both XML bodies below were lost in the repaired copy and are
# reconstructed from the entry format documented for the classic Google
# Sites Data API - verify them against the upstream script.
# ---------------------------------------------------------------------------
sub create_page($$;$) {
    my ($webpage, $page_content, $parentpage) = @_;

    if ($webpage !~ /^[a-zA-Z0-9_-]+\z/) {
        die("Bad page name: $webpage");
    }

    # Debug aid: uncomment to upload a single page only.
    #return if $webpage ne 'gps-control';

    #$page_content = HTML::Entities::encode_entities_numeric($page_content);

    my $parent_xml = '';
    if (defined($parentpage)) {
        if ($parentpage !~ /^(.+)\/([^\/]+)$/) {
            die("Unable to parse URL: $parentpage");
        }
        $parentpage = $1;

        # The feed is re-read on every call so that freshly created parents
        # are found.
        my $pages_info = get_created_pages();
        if (!exists($pages_info->{$parentpage})) {
            print Dumper($pages_info)."\n";
            die("Parent page '$parentpage' does not exist");
        }
        my $parent_id = HTML::Entities::encode_entities_numeric($pages_info->{$parentpage});

        $parent_xml = <<EOF;
<link rel="http://schemas.google.com/sites/2008#parent" type="application/atom+xml"
    href="https://sites.google.com/feeds/content/$GoogleDomainName/$GoogleSiteName/$parent_id"/>
EOF
    }

    my $req_content = <<EOF;
<entry xmlns="http://www.w3.org/2005/Atom">
<category scheme="http://schemas.google.com/g/2005#kind"
    term="http://schemas.google.com/sites/2008#webpage" label="webpage"/>
$parent_xml
<title>$webpage</title>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">$page_content</div>
</content>
</entry>
EOF

    my ($r_code, $r_lines, $response) = http_req(
        'XML',
        "https://sites.google.com/feeds/content/$GoogleDomainName/$GoogleSiteName",
        {},
        [
            'GData-Version' => '1.2',
            'Authorization' => "GoogleLogin auth=$atoken", # http://code.google.com/intl/bg/apis/gdata/docs/auth/overview.html#ClientLogin
        ],
        $req_content,
        [409],
    );

    if ($r_code == 409) {
        if ($response->content =~ /^Duplicate insert/) {
            warn "Duplicate page '$webpage', skipping.\n";
        }
        else {
            die("other err 409?");
        }
    }
}

# Uploads one child page: records its site path, logs it, creates it and
# marks it as uploaded.  $pages is the per-web hash built by analyze_pages().
sub _upload_child_page {
    my ($pages, $page, $url) = @_;
    my $info = $pages->{$page};

    $info->{'url'} = $url;
    printf("\tCreate page '%s' [%s]: %s\n", $page, $info->{'url'}, $info->{'parent'});
    create_page($page, $info->{'content'}, $info->{'url'});
    $info->{'uploaded'} = 1;
}

# ---------------------------------------------------------------------------
# upload_pages($webpages)
#
# Uploads everything collected by analyze_pages().  For each web a top-level
# page named after the web is created from its home topic; the remaining
# pages are then created level by level, parents first.  Pages whose parent
# could not be found are finally attached directly below the web's page.
# ---------------------------------------------------------------------------
sub upload_pages($) {
    my ($webpages) = @_;

    foreach my $web (keys %{$webpages}) {
        # Debug aid: uncomment to upload a single web only.
        #next if $web ne 'RealLife';

        my $pages = $webpages->{$web};

        print "Web '$web'\n";
        my $home_content = $pages->{'^webhome^'}->{'content'};
        create_page($web, defined($home_content) ? $home_content : '');
        delete $pages->{'^webhome^'};
        $pages->{'WebHome'}->{'url'} = $web; # pseudo entry: root of this web

        my @new_parents = ('WebHome');
        do {
            my @parents = @new_parents; # create pages by starting from the root parent
            @new_parents = ();

            foreach my $page (keys %{$pages}) {
                next if $page eq 'WebHome';
                my $info = $pages->{$page};

                if (!defined($info->{'parent'})) {
                    $info->{'parent'} = 'WebHome';
                }
                next if $info->{'uploaded'};

                my $parent = $info->{'parent'};
                if (grep({ $parent eq $_ } @parents)) {
                    push(@new_parents, $page);
                    _upload_child_page($pages, $page, lc($pages->{$parent}->{'url'}.'/'.$page));
                }
            }
        } while (scalar @new_parents);

        foreach my $page (keys %{$pages}) { # pages whose parents couldn't be found
            next if $page eq 'WebHome';
            next if $pages->{$page}->{'uploaded'};

            $pages->{$page}->{'parent'} = 'WebHome';
            _upload_child_page($pages, $page, lc("$web/$page"));
        }
    }
}

# ---------------------------------------------------------------------------
# enable_debug_sent_request()
#
# Debug aid: installs a 'request_send' handler that dumps each outgoing
# request and returns 1 instead of a response.
# ---------------------------------------------------------------------------
sub enable_debug_sent_request() {
    $ua->add_handler(
        'request_send' => sub {
            my ($request, $agent, $h) = @_;
            #print $request->as_string;
            print Dumper($request);
            return 1; # stop the request
        }
    );
}

# Entry point: set up the user agent, authenticate, migrate all webs.
sub main() {
    $ua = LWP::UserAgent->new;
    $ua->timeout(15);
    push(@{$ua->requests_redirectable}, 'POST');

    $atoken = auth();

    #enable_debug_sent_request();
    #create_page('parent-test', "parent page content");
    #create_page('child-page', "child page content", 'parent-test');
    #print Dumper(get_created_pages());

    upload_pages(analyze_pages());
}

main();