#!/usr/bin/perl
#
# Database client access and a smattering of SQL knowledge are assumed
# All command line stuff is bash
#
# Remember, any damage resulting from the use of this code is the fault of the user
#
# Enjoy - MjM
# <mmrtnt@botaday.com> "Mike Martinet"
#
# Command line to run postx in one months directory of blog posts
# for f in `grep -e "a name='[^c]" *.html | grep -v comments | \
# sed -e "s/\(.*\):.*<a name='\([^']*\).*/\2 \1/" | sort | awk '{print $2}'`;
# do postx.pl $f;
# done
#
# /******************************************************************************/
# Queries to clean up a Drupal content database.
# !!! YOU DO NOT WANT TO RUN THESE ON A COMPLETED DATABASE !!!
# These are to clean out the database after a practice or failed run with postx
#
# delete from comments;
# delete from node;
# delete from node_revisions;
# alter table comments AUTO_INCREMENT 1;
# alter table node AUTO_INCREMENT 1;
# alter table node_revisions AUTO_INCREMENT 1;
#
# /******************************************************************************/
#
use strict;
use time::local;
use dbi;
use net::ftp;
my( $infile ) = @argv;
$infile = defined( $infile ) ? $infile : "*.html";
# not used
my $home=$env{ 'home' };
# database located on remote machine, db port tunneled back through ssh to 127.0.0.1:3366
my $dbhost = "127.0.0.1";
my $db = "mikemart_botaday";
my( $file, @postarr, $line, $name, @contarr, %cmthash, $cmtbody, $cmttime, $cmthp );
my( $pid, $timestamp, $img, $loc, @tmparr, $tmpstr, $cmtid, $cmtauth, $key, $idx );
my( $cmtcount, $maxquery, $nodequery, $revquery, $cmtquery, $msql, $nsql, $rsql, $csql );
my( @filearr, $nodeid, @cmtarr, $cmtsubj, $query, $sql, @imgarr, $ftp, $newline, %tmphash );
# Initiate mysql database connection
my $dbh = dbi->connect("dbi:mysql:database=$db;host=$dbhost:3366;user=mikemart_mmrtnt;password=")
or die print "\nerror:couldn't connect to $dbhost database: $db\n";
# query to insert the node in the drupal database
$nodequery = "insert into node ( type, title, uid, status, created, changed, comment, promote ) ";
$nodequery.= "values( ?, ?, ?, ?, ?, ?, ?, ? )" ;
$nsql = $dbh->prepare( $nodequery ) or die "unable to prepare $nodequery" . $dbh->errstr;
# query to insert the node revision - node revisions contain post body
$revquery = "insert into node_revisions ( nid, uid, title, body, timestamp, format ) ";
$revquery.= "values( ?, ?, ?, ?, ?, ? )";
$rsql = $dbh->prepare( $revquery ) or die "unable to prepare $revquery" . $dbh->errstr;
# query to insert comments
$cmtquery = "insert into comments ( nid, uid, subject, hostname, comment, timestamp, status, format, thread, name, homepage ) ";
$cmtquery.= "values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )";
$csql = $dbh->prepare( $cmtquery ) or die "unable to prepare $cmtquery" . $dbh->errstr;
# query to get latest nid
$maxquery = "select max(nid) from node";
$msql = $dbh->prepare( $maxquery ) or die "unable to prepare $maxquery" . $dbh->errstr;
# synchronize newly created nodes and node_revisions
$query = "update node n, node_revisions r set n.vid = r.vid where n.nid = r.nid";
$sql = $dbh->prepare( $query ) or die "unable to prepare $query" . $dbh->errstr;
# flags for the epoch converter
my $posttime = 1;
my $cmttime = 2;
# I was running this using my production site and a test site.
# the fromblog is the original blogger name of the blog
#### do not change ####
my $fromblog = "botaday";
#### ############# ####
# the toblog is the drupal blog name
my $toblog = "botaday";
# the toblogurl is the domain that the drupal system is running under
my $toblogurl = "botaday.com";
# where to send images. not currently used with ftp
my $imgloc = "public_html/$toblog/sites/$toblogurl/files/blogimg";
# dont do database updates if set to zero
# you can run this code with dodb set to 0 and debug set to 1 to just see what it's
# trying to do.
my $dodb = 0;
my $debug = 1;
my $host = "ftp.mikemartinet.com";
# ftp stuff
my $user = "botaday\@mikemartinet.com";
my $pass = '';
# alternate url password for testing - suggested, a test drupal install
#my $pass = '';
#
# if not a wildcard, then we are doing just one file
# a blogger directory/file brought down with wget looks like this:
# botaday.blogspot.com/2009/04/bubba-bot.html
if( $infile ne "*.html" )
{ @filearr = ( $infile ); }
else
{
# this makes an array of names of files that have post ids
# this should be the post
# this may not be true for all blogger configurations!
foreach $file ( < $infile > )
{
open fh, "$file"
or die "\n\nerror: $file could not be opened.\n\n";
@postarr = <fh>;
close fh;
while( $line = shift( @postarr ) )
{
if( $line =~ /^<a name='(?!c)/ )
{
( $pid ) = ( $line =~ /.*'(.*)'.*/ );
$tmphash{ $pid } = $file;
}
}
}
for $key ( sort keys %tmphash )
{ push( @filearr, $tmphash{ $key } ); }
}
# each html file in the array should be a post
foreach $file ( < @filearr > )
{
print "\nopening $file\n\n";
open fh, $file
or die "\n\nerror: $file could not be opened.\n\n";
@postarr = <fh>;
# walk through the post file a line at a time
while( $line = shift( @postarr ) )
{
$line =~ s/\n//g;
if( $debug > 1 )
{ print "line = $line\n"; }
# capture bloggers post id for this post
if( $line =~ /^<a name=(?!\'comments)/ )
{ ( $pid ) = ( $line =~ /.*'(.*)'.*/ ); }
elsif( $line =~ /^<a name=\'comments/ )
{;}
# get the post title as displayed
# example: <title>botaday: bubba bot</title>
elsif( $line =~ /^<title>/ )
{ ( $name ) = ( $line =~ /^<title>$fromblog: (.*)<\/title>/ ); }
# start of the post body
elsif( $line =~ /<div class=\'post-body entry-content\'>/ )
{
# capture lines until the end of the post body
while( $line !~ /<div class=\'post-footer\'>/ )
{
$line = shift( @postarr );
# get rid of unnecessary text
$line =~ s/<div style=\'clear: both;\'>//m;
$line =~ s/<\/div>$//g;
$line =~ s/\/>"$//m;
# push all remaining post body lines into the content array
if( $line !~ /<div class=\'post-footer\'>/ )
{ push( @contarr, $line ); }
}
}
# this means we have hit a comment
elsif( $line =~ /<dt class=\'comment-author/ )
{
# make a string out of lines containing comment data
# item-control is in the line at the end of the comment
while( $line !~ /item-control/ )
{
$line = shift( @postarr );
$tmpstr.= $line;
}
# various regexs to pull items out of string
# the comment id.
( $cmtid ) = ( $tmpstr =~ /.*<a name='(.*)'>.*/m );
# the comment author name
( $cmtauth ) = ( $tmpstr =~ /.*?<\/a>(.*)said.*/sm );
# example data
#<a href='http://www.blogger.com/profile/15112190511038404999' rel='nofollow'>catnapping</a>
# the comment author website - if available
( $cmthp ) = ( $cmtauth =~ /.*(http.*?)'.*/ );
# the actual comment
( $cmtbody ) = ( $tmpstr =~ /.*?<dd class='comment-body'>(.*)<\/dd>.*/sm );
# date and time
( $cmttime ) = ( $tmpstr =~ /.*permalink'>(.*)<\/a>.*/sm );
# remove carriage returns
$cmtid =~ s/\n//g;
$cmthp =~ s/\n//g;
$cmtauth =~ s/\n//g;
$cmtbody =~ s/\n//g;
$cmttime =~ s/\n//g;
if( $debug > 0 )
{ print "cmttime = $cmttime\n"; }
$cmttime = &epochtime( $cmttime, $cmttime );
$cmthash{ $cmtid } = [ $cmtauth, $cmthp, $cmtbody, $cmttime ];
if( $debug > 0 )
{
print "\ntmpstr = $tmpstr\n";
print "cmttime = $cmttime\n";
print "cmtid = $cmtid\n";
print "cmtauth = $cmtauth\n";
print "cmthp = $cmthp\n";
print "cmtbody = $cmtbody\n";
}
$tmpstr = "";
}
# this line is the post time stamp
# example data
# <abbr class='published' title='2005-09-30t08:14:00-07:00'>9/30/2005 # 08:14:00 am</abbr></a>
elsif( $line =~ /<abbr class=\'published\'/ )
{
( $timestamp ) = ( $line =~ /.*?>(.*)<\/abbr.*/ );
# drupal stores dates in epoch time. the epochtime function will convert the date/time
# string in the example above into epoch time
# it would be good to make sure that the timestamps from the blog are in the same timezone
# as the drupal destination
$timestamp = &epochtime( $timestamp, $posttime );
}
}
# are there images?
if( grep( /src/, @contarr ) )
{
# iterate through the post body and pull out images
# we are going to wget any images and then ftp them to the files/blogimg directory
while( $line = shift( @contarr ) )
{
if( $line =~ m/src=/ )
{
$tmpstr = $newline = "";
# onblur is javascript that blogger wraps image links in
# if it doesn't exist in your blog sources, you'll have to
# find another string to key off of
if( $line =~ /onblur/ )
{ ( $tmpstr ) = ( $line =~ /.*?<\/a>(.*)/ ); }
# location to copy image from
( $loc ) = ( $line =~ /.*?src="(.*?)".*/ );
# actual image name
( $img ) = ( $loc =~ /http:.*\/(.*).*/ );
# picasa (blogger) keeps images in s400/s800/s1600 directories.
# we want to get the highest resolution available
# change the lower resolution links to high
( $loc ) =~ s/s(4|8)00/s1600/;
if( $debug > 0 )
{
print "loc = $loc\n";
print "img = $img\n";
}
# if not updating the database, dont get images
if( $dodb > 0 )
{
print "getting $loc\n";
# this should be done with lwp::simple for cross-platform
`wget -qn $loc`;
push( @imgarr, $img );
}
# update link in line
$newline = "<?php \n";
$newline.= "print \"<a href='http://$toblogurl/sites/$toblogurl/files/blogimg/$img'>\";\n";
$newline.= "print theme('imagecache', 'post_img', 'sites/$toblogurl/files/blogimg/$img','','');\n";
$newline.= "print \"</a>\\n\"; ?><p>";
$newline.=$tmpstr;
$tmpstr = "";
}
$newline =~ s/\n//g;
if( length( $newline ) > 0 )
{ push( @tmparr, $newline ); }
$tmpstr = $newline = "";
}
}
else # do this for posts with no images
{ @tmparr = @contarr; }
close fh;
$cmtcount = ( keys %cmthash );
if( $debug > 0 )
{
print "name = $name\n";
print "timestamp = $timestamp\n";
print "cmtcount = $cmtcount\n";
print "tmparr = ".join( ", ", @tmparr )."\n\n";
}
# we got everything we need. do the databse update
&dbupdate( $name, $timestamp, $cmtcount, $sql, $nsql, $msql, $rsql, $csql, \%cmthash, \@tmparr );
undef @contarr;
undef @tmparr;
undef %cmthash;
}
if( scalar( @imgarr ) > 0 )
{
# send the image files to the server
# the image location directory is set up in
# the ftp preferences on the host
# it can be set locally if the permissions
# on the host allow for it.
$ftp = net::ftp->new( $host, debug => 0 )
or die "cannot connect to $host: $@";
$ftp->login( $user, $pass )
or die "cannot login ", $ftp->message;
$ftp->binary();
foreach $img ( @imgarr )
{
print "sending $img\n";
$ftp->put( $img ) or die $ftp->{errstr};
}
$ftp->quit;
}
undef @imgarr;
$nsql->finish;
$msql->finish;
$rsql->finish;
$csql->finish;
$dbh->disconnect;
#-------------------------------------------------------------------------------------------#
#-------------------------------------------------------------------------------------------#
# Database update subroutine
#-------------------------------------------------------------------------------------------#
sub dbUpdate
#-------------------------------------------------------------------------------------------#
{
my( $name, $timestamp, $cmtCount, $sql, $nsql, $msql, $rsql, $csql, $cmtHash, $tmpArr ) = @_;
$tmpStr = join( "", @$tmpArr );
if( $DEBUG > 0 )
{
print "\ndbUpdate.name = $name\n";
print "dbUpdate.timestamp = $timestamp\n";
print "dbUpdate.cmtCount = $cmtCount\n";
print "dbUpdate.tmpArr = ".join( " ", @$tmpArr )."\n";
}
if( $DODB > 0 )
{
# Insert node
$nsql->execute( 'blog', $name, 1, 1, $timestamp, $timestamp, $cmtCount, 1 )
or die "Unable to execute $nodeQuery. " . $nsql->errstr;
# Get new node nid
$msql->execute() or die "Unable to execute $maxQuery. " . $msql->errstr;
( $nodeID ) = $msql->fetchrow_array();
# Insert node revision
$rsql->execute( $nodeID, 1, $name, $tmpStr, $timestamp, 3 )
or die "Unable to execute $revQuery. " . $rsql->errstr;
}
if( $cmtCount > 0 )
{
# Updates comment thread count
$idx = 0;
for $key ( keys %$cmtHash )
{
# Format comment data for insertion
@cmtArr = @{$cmtHash{ $key }};
$cmtAuth = $cmtArr[ 0 ];
( $cmtAuth ) = $cmtAuth =~ /href/ ? ( $cmtAuth =~ /.*>(.*)<\/a>/ ) : $cmtAuth;
$cmtSubj = @cmtArr[ 2 ];
$cmtSubj =~ s/<.*?>//g;
$cmtSubj = substr( $cmtSubj, 0, 26 );
# tmpStr used for debug only
if( $DEBUG > 0 )
{
$tmpStr = "$nodeID, 1, $cmtSubj, 'localhost', ".$cmtArr[ 2 ].", ";
$tmpStr.= $cmtArr[ 3 ].", 1, 2, ".sprintf( "%02d/", $idx++ ).", $cmtAuth, ".$cmtArr[ 1 ];
print "\ndbUpdate.tmpStr = $tmpStr \n";
$idx--;
}
# Insert comments
if( $DODB > 0 )
{
$csql->execute( $nodeID, 0, $cmtSubj, 'localhost', $cmtArr[ 2 ], $cmtArr[ 3 ], 0, 2, sprintf( "%02d/", $idx++ ), $cmtAuth, $cmtArr[ 1 ] )
or die "Unable to execute $cmtQuery. " . $rsql->errstr;
}
}
}
# Synchronize vids (version ids) and nids (node ids)
if( $DODB > 0 )
{ $sql->execute() or die "Unable to execute $query. " . $sql->errstr; }
undef @$tmpArr;
return;
}
#-------------------------------------------------------------------------------------------#
#-------------------------------------------------------------------------------------------#
# returns epoch time in seconds
# the timestamp returned is in the timezone of the machine this code runs on
# that can be changed by looking up the documentation for timelocal()
#-------------------------------------------------------------------------------------------#
sub epochtime
#-------------------------------------------------------------------------------------------#
{
#-------------------------------------------------------------------------------------------#
# timestamp = 1/12/2012 07:14:00 pm
# commenttime = 27/9/06 04:44
my( $time, $flag ) = @_;
if( $debug > 0 )
{
print "\nepochtime.time = $time\n";
print "epochtime.flag = $flag\n";
}
my( $sec, $min, $hour, $mday, $mon, $year, $apm );
if( $flag == $posttime )
{
( $mon, $mday, $year, $hour, $min, $sec, $apm ) = ( $time =~ /(\d{1,2})\/(\d{1,2})\/(\d{4}) (\d{2}):(\d{2}):(\d{2}) (a|pm)/ );
$hour = $apm eq "pm" ? $hour += 12 : $hour;
$hour = $hour == 24 ? 0 : $hour;
if( $debug > 1 )
{
print "epochtime-post.mon = $mon\n";
print "epochtime-post.mday = $mday\n";
print "epochtime-post.year = $year\n";
print "epochtime-post.hour = $hour\n";
print "epochtime-post.min = $min\n";
print "epochtime-post.sec = $sec\n";
}
}
else
{
( $mday, $mon, $year, $hour, $min ) = ( $time =~ /(\d{1,2})\/(\d{1,2})\/(\d{2}) (\d{2}):(\d{2})/ );
$year += 2000;
$sec = 0;
if( $debug > 1 )
{
print "epochtime-cmt.mon = $mon\n";
print "epochtime-cmt.mday = $mday\n";
print "epochtime-cmt.year = $year\n";
print "epochtime-cmt.hour = $hour\n";
print "epochtime-cmt.min = $min\n";
print "epochtime-cmt.sec = $sec\n";
}
}
$mon -= 1;
return timelocal($sec,$min,$hour,$mday,$mon,$year);
}
#-------------------------------------------------------------------------------------------#
#-------------------------------------------------------------------------------------------#
# Various useful SQL
#-------------------------------------------------------------------------------------------#
#select nid, vid, type, title uid, status, created, changed, comment from node;
#select nid, vid, uid, title, log, timestamp, format from node_revisions;
#select cid, pid, nid, uid, subject, hostname, timestamp, status, format, thread, name, mail from comments;
#delete from node where nid > 4; alter table node AUTO_INCREMENT 1;
#-------------------------------------------------------------------------------------------#
#delete from node_revisions where nid > 4 ; alter table node_revisions AUTO_INCREMENT 1;
#delete from comments where cid > 2;