#!/usr/bin/perl
use strict;
use warnings;
# A simple example on how to migrate a TWiki to Google Sites automatically.
# Copyright (C) 2010 Ivan Zahariev (famzah)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
use LWP; # http://www.perl.com/pub/a/2002/08/20/perlandlwp.html
use HTML::Entities;
use URI::Escape;
use Data::Dumper;
use XML::Simple;
use Text::WikiFormat;
# --- Google account and target site (fill in before running) ---
my $GoogleEmail      = '';       # sent as 'Email' to ClientLogin
my $GooglePasswd     = '';       # sent as 'Passwd' to ClientLogin
my $GoogleDomainName = 'site';   # domain segment of the Sites content feed URL
my $GoogleSiteName   = '';       # site segment of the Sites content feed URL

# --- TWiki source data ---
# Directory that holds one sub-directory of *.txt topic files per web.
my $wikidata       = '/home/famzah/Documents/wiki-data';
my @webs_to_import = qw(CppMysql Development Electronics Linux RealLife);
my @ignore_pages   = qw(WebAtom WebChanges WebHome WebIndex WebLeftBar WebNotify WebPreferences);

# --- Shared state, initialised by main() ---
my ($ua, $atoken);   # LWP::UserAgent instance; ClientLogin authorization token
# Perform one HTTP request through the shared $ua and hand back its outcome.
#
# Arguments:
#   $type                 - 'POST' (form post), 'GET', or 'XML' (POST of an
#                           'application/atom+xml' body)
#   $url                  - target URL
#   $form_values          - hash ref of form fields (POST) or query parameters
#                           (GET); not used for 'XML'
#   $headers              - optional array ref of header name/value pairs;
#                           must have an even number of elements
#   $content              - optional request body; sent for 'XML' only
#   $possible_error_codes - optional array ref of HTTP status codes that are
#                           returned to the caller instead of being fatal
#
# Returns the list (status code, array ref of body lines, response object).
# Dies on an odd-length header list, an unknown $type, or an unsuccessful
# response whose status is not listed in $possible_error_codes.
sub http_req($$$;$$$) {
    my ($type, $url, $form_values, $headers, $content, $possible_error_codes) = @_;

    $headers              = [] if !defined($headers);
    $possible_error_codes = [] if !defined($possible_error_codes);
    $form_values          = {} if !defined($form_values);

    if (scalar(@{$headers}) % 2 != 0) {
        die("Headers not even");
    }

    # Work on a copy so the caller's header array is never modified.
    my @header_pairs = @{$headers};

    my $response;
    if ($type eq 'POST') {
        # Extra name/value pairs after the form are sent as request headers.
        $response = $ua->post($url, $form_values, @header_pairs);
    } elsif ($type eq 'GET') {
        my @query_parts;
        foreach my $key (sort keys %{$form_values}) {
            push(@query_parts, sprintf('%s=%s', uri_escape($key), uri_escape($form_values->{$key})));
        }
        my $full_url = $url;
        if (@query_parts) {
            # Extend an existing query string instead of starting a second one.
            my $separator = ($url =~ /\?/) ? '&' : '?';
            $full_url = $url.$separator.join('&', @query_parts);
        }
        $response = $ua->get($full_url, @header_pairs);
    } elsif ($type eq 'XML') {
        my $request = HTTP::Request->new(POST => $url);
        $request->content_type('application/atom+xml');
        for (my $i = 0; $i < scalar(@header_pairs); $i += 2) {
            $request->header($header_pairs[$i], $header_pairs[$i + 1]);
        }
        $request->content($content);
        $response = $ua->request($request);
    } else {
        die("Bad request type: $type");
    }

    if (!$response->is_success) {
        my $is_tolerated = grep { $response->code == $_ } @{$possible_error_codes};
        if (!$is_tolerated) {
            # Show what was sent before aborting, to ease debugging.
            print Dumper($content)."\n";
            die("$type request to '$url' failed: ".$response->status_line.": ".$response->content);
        }
    }

    my @lines = split(/\n/, $response->content);
    return ($response->code, \@lines, $response);
}
# Log in through Google ClientLogin with $GoogleEmail / $GooglePasswd and
# return the token found on the "Auth=" line of the response body.
# Dies when the server answers 403 or when no "Auth=" line is present.
sub auth() {
    my %login_form = (
        'accountType' => 'HOSTED_OR_GOOGLE',
        'Email'       => $GoogleEmail,
        'Passwd'      => $GooglePasswd,
        'service'     => 'jotspot', # http://code.google.com/intl/bg/apis/gdata/faq.html#clientlogin
        'source'      => 'famzah-twikimigrator-1.0',
    );

    # 403 is tolerated by http_req() so that it can be reported with a hint below.
    my ($status, $body_lines) = http_req(
        'POST',
        'https://www.google.com/accounts/ClientLogin',
        \%login_form,
        [], # headers
        [], # content
        [403],
    );

    if ($status == 403) {
        die(
            "Authentication failed, see http://code.google.com/intl/bg/apis/accounts/docs/AuthForInstalledApps.html ".
            "for more info on how to react:\n".Dumper($body_lines)
        );
    }

    # The first matching line wins.
    my ($token) = map { /^Auth=(\S+)$/ ? $1 : () } @{$body_lines};
    if (defined($token)) {
        return $token;
    }

    die("Authentication failed. HTTP code was successful but the body contains no 'Auth':\n".Dumper($body_lines));
}
# Turn a CamelCase TWiki topic name into a lower-case, dash-separated page
# name, e.g. "GpsControl" -> "gps-control". "MySQL" and "OpenSSL" are kept
# together as single words.
sub camelcase2dashes($) {
    my ($page) = @_;

    for ($page) {
        s{([a-z0-9])([A-Z])}{$1-$2}g;                 # lower/digit followed by upper
        s{(\-[A-Z]+?)([A-Z][a-z0-9])}{$1-$2}g;        # end of an acronym that follows a dash
        s{My-SQL}{MySQL}g;                            # undo the split of known names
        s{Open-SSL}{OpenSSL}g;
    }

    return lc($page);
}
# Convert the TWiki markup in $page_content to HTML.
#
# The text is first rewritten line by line into the markup that
# Text::WikiFormat expects, then formatted with Text::WikiFormat::format(),
# and finally its links are post-processed.
#
# NOTE(review): this region appears to have lost HTML-like text (tags inside
# string and regex literals). Several literals below are empty or span lines
# in a way that looks unintended, and the end of this sub is fused with the
# body of the sub that follows it. Restore the missing text from the upstream
# script before changing any code here.
sub render_wiki_to_html($) {
my ($page_content) = @_;
my @lines;
my $line;
@lines = split(/\n/, $page_content);
foreach $line (@lines) { # convert TWiki to MediaWiki tags, because that's what Text::WikiFormat expects
# Bullet items: TWiki indents by 3 spaces per level; re-indent with 4 per level.
if ($line =~ /^(\s+)(\*.+)$/) {
if (length($1) % 3 == 0) {
$line = ' 'x((length($1)/3)*4).$2;
} else {
warn "Bad ident: $line";
}
}
# Headings: "---+", "---++" and "---+++" become "=", "==" and "===".
if ($line =~ /^\Q---+\E (.+)$/) {
$line = "= $1 =";
}
if ($line =~ /^\Q---++\E (.+)$/) {
$line = "== $1 ==";
}
if ($line =~ /^\Q---+++\E (.+)$/) {
$line = "=== $1 ===";
}
#$line =~ s/\*(.+?)\*/'''$1'''/g;
# A line made only of three or more dashes is replaced by a fixed literal.
# NOTE(review): the literal looks truncated (presumably a horizontal-rule tag) - confirm.
if ($line =~ /^\-{3,}\s*$/) {
$line = '
';
}
# various tries to make this HTML compliant and accepted by Google API
# NOTE(review): the next three substitutions appear to have lost their tag text.
$line =~ s/
/
/g;
$line =~ s/
/
/g;
$line =~ s///g;
$line =~ s/<\/verbatim>/<\/p><\/verbatim>/g;
}
$page_content = join("\n", @lines);
# Split on the verbatim delimiters (kept in the list by the capture group) and
# HTML-escape every piece that is not itself a delimiter.
# NOTE(review): the first alternative of the split pattern is empty here - presumably lost text.
@lines = split(/(|<\/p><\/verbatim>)/, $page_content);
foreach $line (@lines) {
if ($line eq '' || $line eq '
') {
next;
}
$line = HTML::Entities::encode_entities_numeric($line); # contents inside must be HTML escaped
}
$page_content = join('', @lines);
# Format the text, overriding the strong (bold) pattern with TWiki's *bold* syntax.
$page_content = Text::WikiFormat::format(
$page_content,
{
strong_tag => qr/\*(.+?)\*/, # TWiki
}
);
# fix only URL links
@lines = split(/([^<]+<\/a>)/, $page_content);
foreach $line (@lines) {
# NOTE(review): the next line looks like two statements fused by lost text:
# the rest of the link-fixing loop above, and an assignment that initialises
# the per-web entry of $webpages inside a following sub (presumably
# analyze_pages(), which main() calls). The declarations of $webpages, $web,
# $pagefile, $page and $content are not visible here.
#
# What the remaining visible statements do for every "$wikidata/$web/*.txt":
#   * extract the topic name from the file path (dies if that fails);
#   * rename 'WebHome' to '^webhome^' before the @ignore_pages check, so the
#     home topic is kept although 'WebHome' is listed there;
#   * convert the name with camelcase2dashes() and record the file name;
#   * read the file through `cat`; %META:TOPICPARENT sets 'parent' (the raw
#     topic name as written in the file), TOPICINFO/TOPICMOVED lines are
#     dropped, FILEATTACHMENT lines are kept, any other %META line is kept
#     and reported with a warning;
#   * store the rendered content and mark the page as not yet uploaded.
# The collected $webpages structure is returned at the end.
if ($line =~ /^{$web} = {};
foreach $pagefile (glob("$wikidata/$web/*.txt")) {
if ($pagefile !~ /^.+\/\Q$web\E\/(.+)\.txt$/) {
die("Unable to extract page name: $pagefile");
}
$page = $1;
if ($page eq 'WebHome') {
$page = '^webhome^';
}
if (grep({$page eq $_} @ignore_pages)) {
next;
}
print "\t$page => ";
$page = camelcase2dashes($page);
print "$page\n";
$webpages->{$web}->{$page} = {
'filename' => $pagefile
};
@lines = `cat \"$pagefile\"`; # yes, lazy and lame...
$content = '';
foreach $line (@lines) {
if ($line =~ /^%META:TOPICPARENT{name="(\S+)"}%\s*$/) {
$webpages->{$web}->{$page}->{'parent'} = $1;
next;
}
if ($line =~ /^%META:(TOPICINFO|TOPICMOVED){.+%\s*$/) {
next;
}
if ($line =~ /^%META:FILEATTACHMENT{.+%\s*$/) {
$content .= $line;
next;
}
if ($line =~ /^%META:(.+)%\s*$/) {
warn "META not parsed: $1\n";
}
$content .= $line;
}
$webpages->{$web}->{$page}->{'content'} = render_wiki_to_html($content);
$webpages->{$web}->{$page}->{'uploaded'} = 0;
}
}
return $webpages;
}
# List every 'webpage' entry of the target Google Site.
#
# Requests the content feed of $GoogleDomainName/$GoogleSiteName and keeps
# following the feed's rel="next" link until no further result page exists.
#
# Returns a hash ref that maps each page's URL path below the site (taken
# from its rel="alternate" link) to the numeric ID at the end of its entry id.
# If the same path occurs more than once, the entry seen last wins.
# Dies when an entry has no parsable rel="alternate" link, or when http_req()
# fails.
sub get_created_pages() {
    my %id_of_path;
    my $req_url = "https://sites.google.com/feeds/content/$GoogleDomainName/$GoogleSiteName?kind=webpage";

    while (defined($req_url)) {
        my ($r_code, $r_lines) = http_req('GET',
            $req_url,
            {}, # the GET parameters are already part of $req_url
            [
                'GData-Version' => '1.2',
                'Authorization' => "GoogleLogin auth=$atoken", # http://code.google.com/intl/bg/apis/gdata/docs/auth/overview.html#ClientLogin
            ],
            [], # content
        );

        # ForceArray keeps the structure uniform: without it a feed holding a
        # single <entry> is not folded into a hash keyed by its id, and a
        # single <link> is a hash instead of a list of hashes.
        my $feed = XML::Simple->new->XMLin(
            join('', @{$r_lines}),
            ForceArray => ['entry', 'link'],
        );

        foreach my $entry_id (keys %{$feed->{'entry'}}) {
            # Keys that do not end in ".../<domain>/<site>/<digits>" are skipped.
            if ($entry_id !~ /\/\Q$GoogleDomainName\E\/\Q$GoogleSiteName\E\/(\d+)$/) {
                next;
            }
            my $id = $1;

            my $page_url;
            foreach my $link (@{$feed->{'entry'}->{$entry_id}->{'link'}}) {
                if (!defined($link->{'rel'}) || $link->{'rel'} ne 'alternate') {
                    next;
                }
                if ($link->{'href'} !~ /\/\Q$GoogleDomainName\E\/\Q$GoogleSiteName\E\/(.+)$/) {
                    die("Unable to parse URL: ".$link->{'href'});
                }
                $page_url = $1;
                last;
            }
            if (!defined($page_url)) {
                die("Unable to find URL: ".Dumper($feed->{'entry'}->{$entry_id}));
            }

            $id_of_path{$page_url} = $id;
        }

        # Continue with the next result page, if the feed announces one.
        $req_url = undef;
        foreach my $link (@{$feed->{'link'} || []}) {
            if (defined($link->{'rel'}) && $link->{'rel'} eq 'next') {
                $req_url = $link->{'href'};
                last;
            }
        }
    }

    return \%id_of_path;
}
# Create one web page on the target Google Site.
#
# Arguments:
#   $webpage      - page name; only letters, digits, '-' and '_' are accepted
#   $page_content - page body, inserted into the request as given
#   $parentpage   - optional URL path of the NEW page ("parent/path/page");
#                   the last path component is stripped and the remainder is
#                   looked up in get_created_pages() to find the parent's ID.
#                   A value without a '/' is rejected.
#
# The request is sent through http_req() in 'XML' mode. HTTP 409 is tolerated
# when the response body starts with "Duplicate insert" (the page is skipped
# with a warning); any other 409 is fatal.
# Dies on a bad page name, an unparsable $parentpage, or an unknown parent.
sub create_page($$;$) {
my ($webpage, $page_content, $parentpage) = @_;
my ($r_code, $r_lines);
my $req_content;
my $response;
my $parent_xml = '';
my $parent_id;
my $pages_info;
if ($webpage !~ /^[a-zA-Z0-9-_]+$/) {
die("Bad page name: $webpage");
}
if ($webpage ne 'gps-control') { # try only with this page, only for debug
#return;
}
#$page_content = HTML::Entities::encode_entities_numeric($page_content);
if (defined($parentpage)) {
# Split "parent/path/page" and keep only the parent part.
if ($parentpage !~ /^(.+)\/([^\/]+)$/) {
die("Unable to parse URL: $parentpage");
}
$parentpage = $1;
# The list of existing pages is fetched again on every call.
$pages_info = get_created_pages();
if (!exists($pages_info->{$parentpage})) {
print Dumper($pages_info)."\n";
die("Parent page '$parentpage' does not exist");
}
$parent_id = HTML::Entities::encode_entities_numeric($pages_info->{$parentpage});
# NOTE(review): the two here-documents below appear to have lost their
# introducer and their XML markup (presumably the parent link element and the
# Atom entry wrapping $parent_xml, $webpage and $page_content). Restore them
# from the upstream script; as shown here the statements are incomplete.
$parent_xml = <
EOF
}
$req_content = <
$parent_xml
$webpage
$page_content
EOF
($r_code, $r_lines, $response) = http_req('XML',
"https://sites.google.com/feeds/content/$GoogleDomainName/$GoogleSiteName",
{},
[
'GData-Version' => '1.2',
'Authorization' => "GoogleLogin auth=$atoken", # http://code.google.com/intl/bg/apis/gdata/docs/auth/overview.html#ClientLogin
],
$req_content,
[409],
);
if ($r_code == 409) {
if ($response->content =~ /^Duplicate insert/) {
warn "Duplicate page '$webpage', skipping.\n";
} else {
die("other err 409?");
}
}
}
# Translate a TWiki parent reference into the key under which that parent is
# stored in the per-web page hash.
sub _normalize_parent {
    my ($web, $parent) = @_;

    return 'WebHome' if !defined($parent);

    # A parent written as "<this web>.<Topic>" refers to a topic of this web.
    $parent =~ s/^\Q$web\E\.//;
    return 'WebHome' if $parent eq 'WebHome';

    # Pages are keyed by their dashed lower-case name, not by the topic name.
    return camelcase2dashes($parent);
}

# Announce, create and flag one page whose 'url' and 'parent' are already set.
sub _upload_one_page {
    my ($page, $info) = @_;

    printf("\tCreate page '%s' [%s]: %s\n", $page, $info->{'url'}, $info->{'parent'});
    create_page($page, $info->{'content'}, $info->{'url'});
    $info->{'uploaded'} = 1;
}

# Upload all collected pages, web by web.
#
# $webpages is the hash ref { web => { page => { content, parent, uploaded } } }.
# For every web the home page (stored under '^webhome^') is created first under
# the web's name. The other pages are then created level by level: first the
# children of 'WebHome', then the children of the pages just created, and so on.
# Pages that are still not uploaded afterwards (their parent was not found)
# are created directly below the web's home page.
#
# The hash is updated in place: 'parent' is normalised, 'url' and 'uploaded'
# are set, the '^webhome^' entry is removed and a 'WebHome' entry holding the
# root URL is added.
sub upload_pages($) {
    my ($webpages) = @_;

    foreach my $web (keys %{$webpages}) {
        my $pages = $webpages->{$web};

        print "Web '$web'\n";
        my $home_content = $pages->{'^webhome^'}->{'content'};
        $home_content = '' if !defined($home_content); # web without a WebHome topic
        create_page($web, $home_content);
        delete $pages->{'^webhome^'};
        $pages->{'WebHome'}->{'url'} = $web;

        # Parents are recorded as TWiki topic names while pages are keyed by
        # camelcase2dashes() names; bring both to the same form so that a
        # parent other than 'WebHome' can be matched at all.
        foreach my $page (keys %{$pages}) {
            next if $page eq 'WebHome';
            $pages->{$page}->{'parent'} = _normalize_parent($web, $pages->{$page}->{'parent'});
        }

        # Create pages by starting from the root parent.
        my @parents = ('WebHome');
        while (scalar @parents) {
            my %is_parent = map { $_ => 1 } @parents;
            my @new_parents;

            foreach my $page (keys %{$pages}) {
                next if $page eq 'WebHome';
                my $info = $pages->{$page};
                next if $info->{'uploaded'};
                next if !$is_parent{$info->{'parent'}};

                $info->{'url'} = lc($pages->{$info->{'parent'}}->{'url'}.'/'.$page);
                _upload_one_page($page, $info);
                push(@new_parents, $page);
            }

            @parents = @new_parents;
        }

        # Pages whose parents couldn't be found go directly below the home page.
        foreach my $page (keys %{$pages}) {
            next if $page eq 'WebHome';
            my $info = $pages->{$page};
            next if $info->{'uploaded'};

            $info->{'parent'} = 'WebHome';
            $info->{'url'}    = lc("$web/$page");
            _upload_one_page($page, $info);
        }
    }
}
# Debug aid: register a 'request_send' handler on the shared $ua that dumps
# every outgoing request object with Data::Dumper and returns a true value
# (which, per the original author's note, stops the request from being sent).
sub enable_debug_sent_request() {
    my $dump_request = sub {
        my ($request) = @_;
        #print $request->as_string;
        print Dumper($request);
        return 1; # stop the request
    };

    $ua->add_handler('request_send' => $dump_request);
}
# Entry point: set up the shared user agent, log in, then read the TWiki
# pages and upload them.
sub main() {
    $ua = LWP::UserAgent->new;
    $ua->timeout(15);

    # Allow redirects to be followed for POST requests as well.
    my $redirectable = $ua->requests_redirectable;
    push(@{$redirectable}, 'POST');

    $atoken = auth();

    # Manual experiments, kept for reference:
    #enable_debug_sent_request();
    #create_page('parent-test', "parent page content");
    #create_page('child-page', "child page content", 'parent-test');
    #print Dumper(get_created_pages());

    my $webpages = analyze_pages();
    upload_pages($webpages);
}

main();