Postx

#!/usr/bin/perl
 #
 # Database client access and a smattering of SQL knowledge are assumed
 # All command line stuff is bash
 #
 # Remember, any damage resulting from the use of this code is the fault of the user
 #
 # Enjoy - MjM
 # <mmrtnt@botaday.com> "Mike Martinet"
 # 
 # Command line to run postx in one months directory of blog posts
 # for f in `grep -e "a name='[^c]" *.html | grep -v comments | \
 # sed -e "s/\(.*\):.*<a name='\([^']*\).*/\2 \1/" | sort | awk '{print $2}'`;
 # do postx.pl $f;
 # done
 #
 # /******************************************************************************/
 # Queries to clean up a Drupal content database.
 # !!! YOU DO NOT WANT TO RUN THESE ON A COMPLETED DATABASE !!!
 # These are to clean out the database after a practice or failed run with postx
 #
 # delete from comments; 
 # delete from node;
 # delete from node_revisions;
 # alter table comments AUTO_INCREMENT 1; 
 # alter table node AUTO_INCREMENT 1; 
 # alter table node_revisions AUTO_INCREMENT 1;
 #
 # /******************************************************************************/
 #
 
 use strict;
 use time::local;
 use dbi;
 use net::ftp;
 
 my( $infile ) = @argv;
 $infile = defined( $infile ) ? $infile : "*.html";
 
 # not used
 my $home=$env{ 'home' };
 
 # database located on remote machine, db port tunneled back through ssh to 127.0.0.1:3366 
 my $dbhost = "127.0.0.1";
 
 my $db = "mikemart_botaday";
 
 my( $file, @postarr, $line, $name, @contarr, %cmthash, $cmtbody, $cmttime, $cmthp );
 my( $pid, $timestamp, $img, $loc, @tmparr, $tmpstr, $cmtid, $cmtauth, $key, $idx );
 my( $cmtcount, $maxquery, $nodequery, $revquery, $cmtquery, $msql, $nsql, $rsql, $csql );
 my( @filearr, $nodeid, @cmtarr, $cmtsubj, $query, $sql, @imgarr, $ftp, $newline, %tmphash );
 
 # Initiate mysql database connection
 my $dbh = dbi->connect("dbi:mysql:database=$db;host=$dbhost:3366;user=mikemart_mmrtnt;password=")
	 or die print "\nerror:couldn't connect to $dbhost database: $db\n";
 
 # query to insert the node in the drupal database
 $nodequery = "insert into node ( type, title, uid, status, created, changed, comment, promote ) ";
 $nodequery.= "values( ?, ?, ?, ?, ?, ?, ?, ? )" ;
 $nsql = $dbh->prepare( $nodequery ) or die "unable to prepare $nodequery" . $dbh->errstr;
 
 # query to insert the node revision - node revisions contain post body
 $revquery = "insert into node_revisions ( nid, uid, title, body, timestamp, format ) ";
 $revquery.= "values( ?, ?, ?, ?, ?, ? )";
 $rsql = $dbh->prepare( $revquery ) or die "unable to prepare $revquery" . $dbh->errstr;
 
 # query to insert comments
 $cmtquery = "insert into comments ( nid, uid, subject, hostname, comment, timestamp, status, format, thread, name, homepage ) ";
 $cmtquery.= "values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )";
 $csql = $dbh->prepare( $cmtquery ) or die "unable to prepare $cmtquery" . $dbh->errstr;
 
 # query to get latest nid
 $maxquery = "select max(nid) from node";
 $msql = $dbh->prepare( $maxquery ) or die "unable to prepare $maxquery" . $dbh->errstr;
 
 # synchronize newly created nodes and node_revisions
 $query = "update node n, node_revisions r set n.vid = r.vid where n.nid = r.nid";
 $sql = $dbh->prepare( $query ) or die "unable to prepare $query" . $dbh->errstr;
 
 # flags for the epoch converter
 my $posttime = 1;
 my $cmttime = 2;
 
 # I was running this using my production site and a test site.
 # the fromblog is the original blogger name of the blog
 #### do not change ####
 my $fromblog = "botaday";
 #### ############# ####
 
 # the toblog is the drupal blog name 
 my $toblog = "botaday";
 
 # the toblogurl is the domain that the drupal system is running under
 my $toblogurl = "botaday.com";
 
 # where to send images. not currently used with ftp
 my $imgloc = "public_html/$toblog/sites/$toblogurl/files/blogimg";
 
 # dont do database updates if set to zero
 # you can run this code with dodb set to 0 and debug set to 1 to just see what it's 
 # trying to do.
 my $dodb = 0;
 my $debug = 1;
 
 my $host = "ftp.mikemartinet.com";
 
 # ftp stuff
 my $user = "botaday\@mikemartinet.com";
 my $pass = '';
 
 # alternate url password for testing - suggested, a test drupal install
 #my $pass = '';
 #
 # if not a wildcard, then we are doing just one file
 # a blogger directory/file brought down with wget looks like this:
 # botaday.blogspot.com/2009/04/bubba-bot.html
 if( $infile ne "*.html" )
	  { @filearr = ( $infile ); }
 else
 {
	  # this makes an array of names of files that have post ids
	  # this should be the post
	  # this may not be true for all blogger configurations!
	  foreach $file ( < $infile > )
	  {
			open fh, "$file"
				 or die "\n\nerror: $file could not be opened.\n\n";
			@postarr = <fh>;
			close fh;
			while( $line = shift( @postarr ) )
			{
				 if( $line =~ /^<a name='(?!c)/ )
				 {
					  ( $pid ) = ( $line =~ /.*'(.*)'.*/ );
					  $tmphash{ $pid } = $file;
				 }
			}
	  }
	  for $key ( sort keys %tmphash )
			{ push( @filearr, $tmphash{ $key } ); }
 }
 # each html file in the array should be a post
 foreach $file ( < @filearr > )
 {
	  print "\nopening $file\n\n";
	  open fh, $file
			or die "\n\nerror: $file could not be opened.\n\n";
	  @postarr = <fh>;
	  # walk through the post file a line at a time
	  while( $line = shift( @postarr ) )
	  {
			$line =~ s/\n//g;
			if( $debug > 1 )
				 { print "line = $line\n"; }
			# capture bloggers post id for this post
			if( $line =~ /^<a name=(?!\'comments)/ )
				 { ( $pid ) = ( $line =~ /.*'(.*)'.*/ ); }
			elsif( $line =~ /^<a name=\'comments/ )
				 {;}
			# get the post title as displayed
			# example: <title>botaday: bubba bot</title>
			elsif( $line =~ /^<title>/ )
				 { ( $name ) = ( $line =~ /^<title>$fromblog: (.*)<\/title>/ ); }
			# start of the post body
			elsif( $line =~ /<div class=\'post-body entry-content\'>/ )
			{
				 # capture lines until the end of the post body
				 while( $line !~ /<div class=\'post-footer\'>/ )
				 {
					  $line = shift( @postarr );
					  # get rid of unnecessary text
					  $line =~ s/<div style=\'clear: both;\'>//m;
					  $line =~ s/<\/div>$//g;
					  $line =~ s/\/>"$//m;
					  # push all remaining post body lines into the content array
					  if( $line !~ /<div class=\'post-footer\'>/ )
							{ push( @contarr, $line ); }
				 }
			}
			# this means we have hit a comment
			elsif( $line =~ /<dt class=\'comment-author/ )
			{
				 # make a string out of lines containing comment data
				 # item-control is in the line at the end of the comment
				 while( $line !~ /item-control/ )
				 {
					  $line = shift( @postarr );
					  $tmpstr.= $line;
				 }
				 # various regexs to pull items out of string
				 # the comment id. 
				 ( $cmtid ) = ( $tmpstr =~ /.*<a name='(.*)'>.*/m );
				 # the comment author name
				 ( $cmtauth ) = ( $tmpstr =~ /.*?<\/a>(.*)said.*/sm );
 
				 # example data
				 #<a href='http://www.blogger.com/profile/15112190511038404999' rel='nofollow'>catnapping</a>
				 # the comment author website - if available
				 ( $cmthp ) = ( $cmtauth =~ /.*(http.*?)'.*/ );
				 # the actual comment
				 ( $cmtbody ) = ( $tmpstr =~ /.*?<dd class='comment-body'>(.*)<\/dd>.*/sm );
				 # date and time
				 ( $cmttime ) = ( $tmpstr =~ /.*permalink'>(.*)<\/a>.*/sm );
				 # remove carriage returns
				 $cmtid =~ s/\n//g;
				 $cmthp =~ s/\n//g;
				 $cmtauth =~ s/\n//g;
				 $cmtbody =~ s/\n//g;
				 $cmttime =~ s/\n//g;
				 if( $debug > 0 )
					  { print "cmttime = $cmttime\n"; }
				 $cmttime = &epochtime( $cmttime, $cmttime );
				 $cmthash{ $cmtid } = [ $cmtauth, $cmthp, $cmtbody, $cmttime ];
				 if( $debug > 0 )
				 {
					  print "\ntmpstr = $tmpstr\n";
					  print "cmttime = $cmttime\n";
					  print "cmtid = $cmtid\n";
					  print "cmtauth = $cmtauth\n";
					  print "cmthp = $cmthp\n";
					  print "cmtbody = $cmtbody\n";
				 }
				 $tmpstr = "";
			}
			# this line is the post time stamp
			# example data
			# <abbr class='published' title='2005-09-30t08:14:00-07:00'>9/30/2005 # 08:14:00 am</abbr></a>
			elsif( $line =~ /<abbr class=\'published\'/ )
			{
				 ( $timestamp ) = ( $line =~ /.*?>(.*)<\/abbr.*/ );
				 # drupal stores dates in epoch time.  the epochtime function will convert the date/time
				 # string in the example above into epoch time
				 # it would be good to make sure that the timestamps from the blog are in the same timezone
				 # as the drupal destination
				 $timestamp = &epochtime( $timestamp, $posttime );
			}
	  }
	  # are there images?
	  if( grep( /src/, @contarr ) )
	  {
			# iterate through the post body and pull out images
			# we are going to wget any images and then ftp them to the files/blogimg directory
			while( $line = shift( @contarr ) )
			{
				 if( $line =~ m/src=/ )
				 {
					  $tmpstr = $newline = "";
					  # onblur is javascript that blogger wraps image links in
					  # if it doesn't exist in your blog sources, you'll have to
					  # find another string to key off of
					  if( $line =~ /onblur/ )
							{ ( $tmpstr ) = ( $line =~ /.*?<\/a>(.*)/ ); }
					  # location to copy image from
					  ( $loc ) = ( $line =~ /.*?src="(.*?)".*/ );
					  # actual image name
					  ( $img ) = ( $loc =~ /http:.*\/(.*).*/ );
					  # picasa (blogger) keeps images in s400/s800/s1600 directories.  
					  # we want to get the highest resolution available
					  # change the lower resolution links to high
					  ( $loc ) =~ s/s(4|8)00/s1600/;
					  if( $debug > 0 )
					  {
							print "loc = $loc\n";
							print "img = $img\n";
					  }
					  # if not updating the database, dont get images
					  if( $dodb > 0 )
					  {
							print "getting $loc\n";
							# this should be done with lwp::simple for cross-platform
							`wget -qn $loc`;
							push( @imgarr, $img );
					  }
					  # update link in line
					  $newline = "<?php \n";
					  $newline.= "print \"<a href='http://$toblogurl/sites/$toblogurl/files/blogimg/$img'>\";\n";
					  $newline.= "print theme('imagecache', 'post_img', 'sites/$toblogurl/files/blogimg/$img','','');\n";
					  $newline.= "print \"</a>\\n\"; ?><p>";
					  $newline.=$tmpstr;
					  $tmpstr = "";
				 }
			$newline =~ s/\n//g;
			if( length( $newline ) > 0 )
				 { push( @tmparr, $newline ); }
			$tmpstr = $newline = "";
			}
	  }
	  else # do this for posts with no images
			{ @tmparr = @contarr; }
	  close fh;
	  $cmtcount = ( keys %cmthash );
	  if( $debug > 0 )
	  {
			print "name = $name\n";
			print "timestamp = $timestamp\n";
			print "cmtcount = $cmtcount\n";
			print "tmparr = ".join( ", ", @tmparr )."\n\n";
	  }
	  # we got everything we need.  do the databse update
	  &dbupdate( $name, $timestamp, $cmtcount, $sql, $nsql, $msql, $rsql, $csql, \%cmthash, \@tmparr );
	  undef @contarr;
	  undef @tmparr;
	  undef %cmthash;
 }
 if( scalar( @imgarr ) > 0 )
 {
	  # send the image files to the server
	  # the image location directory is set up in
	  # the ftp preferences on the host
	  # it can be set locally if the permissions
	  # on the host allow for it.
	  $ftp = net::ftp->new( $host, debug => 0 )
			or die "cannot connect to $host: $@";
	  $ftp->login( $user, $pass )
			or die "cannot login ", $ftp->message;
	  $ftp->binary();
	  foreach $img ( @imgarr )
	  {
			print "sending $img\n";
			$ftp->put( $img ) or die $ftp->{errstr};
	  }
	  $ftp->quit;
 }
 undef @imgarr;
 $nsql->finish;
 $msql->finish;
 $rsql->finish;
 $csql->finish;
 $dbh->disconnect;
 
 #-------------------------------------------------------------------------------------------#
 #-------------------------------------------------------------------------------------------#
 # Database update subroutine
 #-------------------------------------------------------------------------------------------#
 sub dbUpdate
 #-------------------------------------------------------------------------------------------#
 {
	  my( $name, $timestamp, $cmtCount, $sql, $nsql, $msql, $rsql, $csql, $cmtHash, $tmpArr ) = @_;
	  $tmpStr = join( "", @$tmpArr );
	  if( $DEBUG > 0 )
	  {
			print "\ndbUpdate.name = $name\n";
			print "dbUpdate.timestamp = $timestamp\n";
			print "dbUpdate.cmtCount = $cmtCount\n";
			print "dbUpdate.tmpArr = ".join( " ", @$tmpArr )."\n";
	  }
	  if( $DODB > 0 )
	  {
			# Insert node
			$nsql->execute( 'blog', $name, 1, 1, $timestamp, $timestamp, $cmtCount, 1 )
				 or die "Unable to execute $nodeQuery. " . $nsql->errstr;
			# Get new node nid
			$msql->execute() or die "Unable to execute $maxQuery. " . $msql->errstr;
			( $nodeID ) = $msql->fetchrow_array();
			# Insert node revision
			$rsql->execute( $nodeID, 1, $name, $tmpStr, $timestamp, 3 )
				 or die "Unable to execute $revQuery. " . $rsql->errstr;
	  }
	  if( $cmtCount > 0 )
	  {
			# Updates comment thread count
			$idx = 0;
			for $key ( keys %$cmtHash )
			{
				 # Format comment data for insertion
				 @cmtArr = @{$cmtHash{ $key }};
				 $cmtAuth = $cmtArr[ 0 ];
				 ( $cmtAuth ) = $cmtAuth =~ /href/ ? ( $cmtAuth =~ /.*>(.*)<\/a>/ ) : $cmtAuth;
				 $cmtSubj = @cmtArr[ 2 ];
				 $cmtSubj =~ s/<.*?>//g;
				 $cmtSubj = substr( $cmtSubj, 0, 26 );
				 # tmpStr used for debug only
				 if( $DEBUG > 0 )
				 {
					  $tmpStr = "$nodeID, 1, $cmtSubj, 'localhost', ".$cmtArr[ 2 ].", ";
					  $tmpStr.= $cmtArr[ 3 ].", 1, 2, ".sprintf( "%02d/", $idx++ ).", $cmtAuth, ".$cmtArr[ 1 ];
					  print "\ndbUpdate.tmpStr = $tmpStr \n";
					  $idx--;
				 }
				 # Insert comments
				 if( $DODB > 0 )
				 {
					  $csql->execute( $nodeID, 0, $cmtSubj, 'localhost', $cmtArr[ 2 ], $cmtArr[ 3 ], 0, 2, sprintf( "%02d/", $idx++ ), $cmtAuth, $cmtArr[ 1 ] )
							or die "Unable to execute $cmtQuery. " . $rsql->errstr;
				 }
			}
	  }
	  # Synchronize vids (version ids) and nids (node ids)
	  if( $DODB > 0 )
			{ $sql->execute() or die "Unable to execute $query. " . $sql->errstr; }
	  undef @$tmpArr;
	  return;
 }
 #-------------------------------------------------------------------------------------------#
 
 #-------------------------------------------------------------------------------------------#
 # returns epoch time in seconds
 # the timestamp returned is in the timezone of the machine this code runs on
 # that can be changed by looking up the documentation for timelocal()
 #-------------------------------------------------------------------------------------------#
 sub epochtime
 #-------------------------------------------------------------------------------------------#
 {
 #-------------------------------------------------------------------------------------------#
	  # timestamp = 1/12/2012 07:14:00 pm
	  # commenttime = 27/9/06 04:44
	  my( $time, $flag ) = @_;
	  if( $debug > 0 )
	  {
			print "\nepochtime.time = $time\n";
			print "epochtime.flag = $flag\n";
	  }
	  my( $sec, $min, $hour, $mday, $mon, $year, $apm );
	  if( $flag == $posttime )
	  {
			( $mon, $mday, $year, $hour, $min, $sec, $apm ) = ( $time =~ /(\d{1,2})\/(\d{1,2})\/(\d{4}) (\d{2}):(\d{2}):(\d{2}) (a|pm)/ );
			$hour = $apm eq "pm" ? $hour += 12 : $hour;
			$hour = $hour == 24 ? 0 : $hour;
			if( $debug > 1 )
			{
				 print "epochtime-post.mon = $mon\n";
				 print "epochtime-post.mday = $mday\n";
				 print "epochtime-post.year = $year\n";
				 print "epochtime-post.hour = $hour\n";
				 print "epochtime-post.min = $min\n";
				 print "epochtime-post.sec = $sec\n";
			}
	  }
	  else
	  {
			( $mday, $mon, $year, $hour, $min ) = ( $time =~ /(\d{1,2})\/(\d{1,2})\/(\d{2}) (\d{2}):(\d{2})/ );
			$year += 2000;
			$sec = 0;
			if( $debug > 1 )
			{
				 print "epochtime-cmt.mon = $mon\n";
				 print "epochtime-cmt.mday = $mday\n";
				 print "epochtime-cmt.year = $year\n";
				 print "epochtime-cmt.hour = $hour\n";
				 print "epochtime-cmt.min = $min\n";
				 print "epochtime-cmt.sec = $sec\n";
			}
	  }
	  $mon -= 1;
	  return timelocal($sec,$min,$hour,$mday,$mon,$year);
 }
 #-------------------------------------------------------------------------------------------#
 
 #-------------------------------------------------------------------------------------------#
 # Various useful SQL
 #-------------------------------------------------------------------------------------------#
 #select nid, vid, type, title uid, status, created, changed, comment from node;
 #select nid, vid, uid, title, log, timestamp, format from node_revisions;
 #select cid, pid, nid, uid, subject, hostname, timestamp, status, format, thread, name, mail from comments;
 #delete from node where nid > 4; alter table node AUTO_INCREMENT 1;
 #-------------------------------------------------------------------------------------------#
 #delete from node_revisions where nid > 4 ; alter table node_revisions AUTO_INCREMENT 1;
 #delete from comments where cid > 2;