<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">#!/usr/bin/perl -w
use LWP::UserAgent;
use Date::Parse;
use HTML::Filter;
use CGI;

my $query;
my $debug = $ENV{'DEBUG'};
my $doc;
my $base;

# CGI-mode hooks
if ( $ENV{'REMOTE_ADDR'}) {
  $| = 1;
  $query = new CGI;
  print $query-&gt;header;
  if ( $ENV{'REMOTE_USER'}) {
	print "&lt;p&gt;snorq is generating your page, please wait...&lt;/p&gt;&lt;pre&gt;";
  } else {
	print "&lt;p&gt;You'll have to log in to run snorq!&lt;/p&gt;";
	exit;
  }
}

chdir( "$ENV{'HOME'}/public_html/misc/avantgo" );

my $stylesheet_link = 
  '&lt;link rel="stylesheet" href="../../waider.css" type="text/css"&gt;';

my %pages;

#$pages{'Cringely'} =
#  [
#   'http://www.pbs.org/cgi-registry/cringely/thisweek.pl?pulpit'
#  ];

#$filters{'Cringely'} =
#  [
#   '&lt;!--========================== Content between these lines ==========================--&gt;',
#   '&lt;!--========================== Content between these lines ==========================--&gt;',
#   '^\s*', '&lt;html&gt;&lt;head&gt;&lt;title&gt;Cringely&lt;/title&gt;&lt;/head&gt;&lt;body&gt;',
#   '$', '&lt;/body&gt;'
#  ];

$pages{'jwzrants'} =
  [
   'http://www.jwz.org/gruntle',
   '&lt;TR&gt;&lt;TD VALIGN=TOP ALIGN=RIGHT&gt;&lt;B&gt;&lt;A HREF="([^"]+)"'
  ];

$pages{'Dilbert'} =
  [
   'http://www.dilbert.com/comics/dilbert/archive/',
   'img.*?src="(/comics/dilbert/archive/images/dilbert\d+.gif)"'
  ];

$pages{'TomTomorrow'} =
  [
   'http://www.workingforchange.com/column_lst.cfm?AuthrId=43',
   'HREF="(article.cfm\?ItemID=\d+)"',
   'src="(http://workingforchange.speedera.net/www.workingforchange.com/webgraphics/wfc/TM.*?)"'
  ];

# Alas, Bobbins is no more.
#$pages{'Bobbins'} =
#  [
#   'http://www.bobbins.org/',
#   'img.*?src="(/comics/\d+.*?\.(png|gif))"'
#  ];

# But hoorah, there is scarygoround!
$pages{'ScaryGoRound'} =
  [
   'http://www.scarygoround.com/',
   'img src="(strips/.*?\.png)"'
  ];

# $pages{'StateSecrets'} =
#  [
#   'http://www.statesecrets.com/homepage.html',
#   '&lt;a href="(\d{4}/\d{2}/\d{2}/index.html)"',
#   '"(.*?daypage.html)"'
#  ];

#  $filters{ 'StateSecrets' } =
#    [
#     '\&lt;!--title--\&gt;',
#     '\&lt;!--end story block--\&gt;',

#     # And now, the hairy stuff
#     '^', '&lt;html&gt;&lt;head&gt;&lt;title&gt;StateSecrets&lt;/title&gt;&lt;/head&gt;&lt;body&gt;&lt;h2&gt;',
#     '\&lt;/font.*?\&lt;!--credit--\&gt;', "&lt;/h2&gt;\n&lt;!--credit--&gt;", # title

#     '\&lt;!--credit--\&gt;\s?\&lt;.*?\&gt;by \&lt;a.*?\&gt;', '&lt;h3&gt;',
#     '\&lt;/a.*\&lt;!--text here--\&gt;', "&lt;/h3&gt;\n&lt;!--text--&gt;", # credit

#     '\&lt;!--text--\&gt;.*?&lt;p&gt;', '&lt;p&gt;', # Text block

#     # These are all to trim garbage off the end. No idea why it took
#     # this much effort, but it now works, so I'm not going to touch it.
#     '&lt;/?font.*?&gt;', '', # no font tags, thanks
#     '&lt;/?t[dra].*?&gt;', '', # lose the tables
#     '&lt;img.*?&gt;', '', # images
#     '\&lt;!--filler here--\&gt;.+?$', '', # cut the tail of the page off

#     "\n+", "\n" # blank lines  # '$', '&lt;/body&gt;&lt;/html&gt;'
#    ];

#$pages{'Slashdot'} =
#  [
#   'http://slashdot.org/index.pl?light=1&amp;noboxes=1'
#  ];

#$filters{'Slashdot'} =
#  [
#   # Start after...
#   'faq.*?\&lt;h2\&gt;',
#   # Stop after...
#   '\&lt;p\&gt;\&lt;p\&gt;\[', # arbitrary, but it works!
#   # Add in a H2 at the top.
#   '^', '&lt;html&gt;&lt;head&gt;&lt;title&gt;Slashdot&lt;/title&gt;&lt;/head&gt;&lt;body&gt;&lt;h2&gt;',
#   '$', '&lt;/body&gt;&lt;/html&gt;'
#  ];

#$pages{'NTK'} =
#  [
#   'http://www.ntk.net/'
#  ];

#$filters{'NTK'} =
#  [
#   '\&lt;pre\&gt;',
#   '\&lt;/pre\&gt;',
#   '^.*? _', '&lt;html&gt;&lt;head&gt;&lt;title&gt;NTK&lt;/title&gt;&lt;/head&gt;&lt;body&gt;&lt;pre&gt; _',
#                                  # restuff. Don't ask about the " _", really.
#   '(&lt;/pre&gt;\s+)?$', '&lt;/pre&gt;&lt;/body&gt;&lt;/html&gt;' # ditto. Makes extra, but screw it.
#  ];

$pages{'RedMeat'} =
  [
   'http://www.redmeat.com/redmeat/current/index.html',
   'img src="(index\-\d+.gif)"'
  ];

# $pages{'DNALounge'} =
#  [
#   'http://www.dnalounge.com/backstage/log/latest.html'
#  ];

# $filters{'DNALounge'} =
#  [
#   '&lt;!-- %%SUBHEADING%% --&gt;',
#   '&lt;P&gt;&lt;BR CLEAR=BOTH&gt;',
#   '&lt;/?font.*?&gt;', '',
#   'bgcolor="#......"', ''
#  ];

# Hah. This is fun.
my $t1 = scalar( localtime( time - ( 60 * 60 * 24 * 30 )));
my $t2 = scalar( localtime( time ));
my @bits1 = split( /\s+/, $t1 );
my @bits2 = split( /\s+/, $t2 );

# I sold my shares. Actually, they were compulsorily
# acquired. Bastards. DIE DIE FORNICATE DIE 666 DIE, as Jerrell would
# say.

#$pages{'Eircom'} =
#  [
#   "http://www.ise.ie/php3/graph_make.php3" .
#   "?CompID=189&amp;Index=None&amp;CompID_A=189&amp;CompID_B=None" .
#   "&amp;Start=" . $bits1[ 2 ] . "+" . $bits1[ 1 ] . "+" . $bits1[ 4 ] .
#   "&amp;End=" . $bits2[ 2 ] . "+" . $bits2[ 1 ] . "+" . $bits2[ 4 ]
#  ];

#$pages {'Goats'} =
#  [
#   'http://www.goats.com/',
#   'img src="(/comix/\d+/goats\d+\.gif)"'
#  ];

#$pages {'JerkCity'} =
#  [
#   'http://www.jerkcity.com/',
#   'img [^&gt;]*src="/(jerkcity\d+.gif)'
#  ];

#$pages {'Loadza' } =
#  [
#   'http://www.unison.ie/loadza/display_results.php3?cat_id=258'
#  ];

#$pages {'Register'} =
#  [
#   'http://www.theregister.co.uk/'
#  ];

#$filters{'Register'} =
#  [
#   '&lt;DIV&gt;&lt;DIV',
#   'Register Services',
#   '^', '&lt;DIV',
#   '&lt;table.*?$', '',
#   ' class=".*?"', '',
#   '&lt;/?font.*?&gt;',''
#  ];

# also no more
#$pages {'Tatemae'} =
#  [
#   'http://www.cloudiness.com/tatemae/'
#  ];

#$pages{'Doonesbury'} =
#  [
#   'http://www.doonesbury.com/strip/dailydose/',
#   'img.*?src="[^"]*(/comics/db/\d+/db\d+.gif)"',
#  ];

# ----------------------------- page layout, such as it is --------------------
my ( $host, $now );
$now = scalar( localtime );
$host = `/bin/hostname`;

my $compilation =&lt;&lt;"COMP";
&lt;html&gt;
  &lt;head&gt;
    &lt;meta name="generator" content="snorq"&gt;
    &lt;meta name="HandheldFriendly" content="True"&gt;
    &lt;title&gt;Compilation Page for $ENV{'LOGNAME'}&lt;/title&gt;
    &lt;link rel="shortcut icon" HREF="/favicon.ico" TYPE="image/x-icon"&gt;
    $stylesheet_link
  &lt;/head&gt;
  &lt;body&gt;
    &lt;h1&gt;Compilation Page for $ENV{'LOGNAME'}&lt;/h1&gt;
    &lt;table border="0" cellpadding="0"&gt;
      &lt;tr&gt;
        &lt;td&gt;&lt;!-- feed me jwzrants --&gt;&lt;/td&gt;
      &lt;/tr&gt;
    &lt;/table&gt;
    &lt;!-- feed me text --&gt;
    &lt;!-- feed me pix --&gt;
    &lt;hr&gt;
    &lt;address&gt;Generated by &lt;a href="http://www.waider.ie/hacks/snorq.pl"&gt;snorq&lt;/a&gt; on $host at $now&lt;/address&gt;
  &lt;/body&gt;
&lt;/html&gt;
COMP

# ------------------------------- end of setup --------------------------------

my $ua = new LWP::UserAgent;
$ua-&gt;agent( "Snorq/0.1" . $ua-&gt;agent );
my ( $req, $res );
$ua-&gt;env_proxy();

for my $page ( sort keys %pages ) {
  if ( $#ARGV != -1 ) {
	next unless grep /$page/i, @ARGV;
  }

  print "$page\n" if $debug;

  # Figure out what we're getting!
  my $content = "";
  my $contenttype = "";
  my $numrules = $#{$pages{$page}};
  my $n = -1; # gack
  my $url;

 RULE:
  for my $rule ( @{$pages{$page}} ) {

	# increment rule number
	$n++;

	print "   rule ", $n + 1, " of ", $numrules + 1, " : $rule\n" if $debug;

	if ( !$content ) {
	  # First rule is always a URL
	  $url = $rule;
	} else {
	  ( $url ) = $content =~ m/$rule/si;
	  if (!defined( $url )) {
		print "   error extracting $rule\n" if $debug;
		$content = undef;
		last RULE;
	  }
	}

	# Patch in base and stuff
	if ( defined $base ) {
	  $uri = new URI $url;
	  if ( !defined( $uri-&gt;scheme ) or !$uri-&gt;scheme ) {
		$uri = new URI $url, $base-&gt;scheme;
	  }

	  # Gack! relative URL!
	  if ( $uri-&gt;path !~ m|^/| ) {
		local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1; # gack gack
		$uri = URI-&gt;new($url)-&gt;abs( $base );
	  }

	  if ( !defined( $uri-&gt;host )) {
		$uri-&gt;scheme( $base-&gt;scheme );
		$uri-&gt;host( $base-&gt;host );
	  }

	  $url = $uri-&gt;as_string;
	}

	print "   fetching $url\n" if $debug;

	$cached = 0;

	# if this is the terminal rule, try a HEAD instead of a GET
	if ( $n == $numrules ) {
	  $req = new HTTP::Request
		HEAD =&gt; $url;

	  $res = $ua-&gt;request( $req );

	  if ( $res-&gt;is_success ) {
		my $utime;

		# See if we get a datestamp
		$date = $res-&gt;headers-&gt;header( 'Last-Modified' );
		if ( defined( $date )) {
		  print "   Last Mod: $date\n" if $debug;
		  $utime = str2time( $date );
		} else {
		  $utime = 0;
		}
		$contenttype = $res-&gt;content_type;

		# And this is what we call a "hack"
		$filename = "${page}_$contenttype";
		$filename =~ s|/|.|g;

		if ( -f $filename ) {
		  (undef, undef, undef, undef, undef, undef, undef, undef,
		   undef, $mtime, undef, undef, undef ) = stat( $filename );
		  if ( $mtime &gt; $utime ) {
			$cached = 1;
		  } else {
			$cached = 0;
		  }
		}
	  } else {
		print "   head failed, for some reason.\n" if $debug;
	  }
	}

	# Screw caching, since it seems not to work.
	$cached = 0;

	if ( $cached ) {
	  print "   cached, not fetching\n" if $debug;
	} else {
	  $req = new HTTP::Request
		GET =&gt; $url;

	  $res = $ua-&gt;request( $req );

	  if ( $res-&gt;is_success ) {
		$content = $res-&gt;content;
		$contenttype = $res-&gt;content_type;

		# And this is what we call a "hack"
		$filename = "${page}_$contenttype";
		$filename =~ s|/|.|g;

		$base = $res-&gt;base;
	  } else {
		print "   error fetching data\n" if $debug;
		$page = $res-&gt;as_string;
		undef $content;
		last RULE;
	  }
	}

	next if !defined( $content );
	next if $n &lt; $numrules;

	print "   Item $page, content type $contenttype successfully fetched.\n" if $debug;

	# Now, filter the page.
	if ( defined( $filters{$page})) {
	  print "   filtering it: " if $debug;

	  print "start..." if $debug;
	  my @filters = reverse @{$filters{$page}};

	  my $filter = pop @filters;
	  $content =~ s/^.*?$filter//si;

	  print "end..." if $debug;
	  $filter = pop @filters;
	  $content =~ s/$filter.*?$//si;

	  if ( $#filters != -1 ) {
		print "body..." if $debug;

		while ( $#filters != -1 ) {
		  my $search = pop @filters;
		  my $replace = pop @filters;

		  $content =~ s/$search/$replace/sgie;
		}
	  }
	  print "done.\n" if $debug;
	}

  }

  # Don't bother doing more if we couldn't get the page
  next unless $content;

  # Fix up URLs
  if ( $contenttype =~ /^text\/html/i ) {
	print "   Repatching URLs to $base\n" if $debug;
	$doc = "";
	my $parser = HTML::Parser-&gt;new( api_version =&gt; 3,
									start_h =&gt; [\&amp;p_start,
												"tagname, text, attr"],
									default_h =&gt;
									[ sub { $doc .= shift }, "text"]
								  );
	$parser-&gt;parse( $content );
	$parser-&gt;eof;
	$content = $doc;
  }

  # Save the damn thing
  open( PAGE, "&gt;$filename" );
  binmode(PAGE); # GRR.
  print PAGE $content;
  close( PAGE );

  # Figure out the link type, and add it.
  if ( $contenttype =~ /^image/i ) {
	print "   Slicing image... [$page/$contenttype]" if $debug;
	$new = carve_image( $page, $contenttype );
	unlink( $filename ); # don't leave the old image lying around
	print "done.\n" if $debug;

	# See if it's got a place of its own to go into.
	if (!( $compilation =~
		   s|(&lt;!-- feed me $page --&gt;)|$new\n|)) {
	  $compilation =~ s|(&lt;!-- feed me pix --&gt;)|$new\n$1|;
	}
  } else {
	my $srcurl = "";
	$srcurl = " (&lt;a href=\"$url\"&gt;from $url&lt;/a&gt;)&lt;br&gt;" if $debug;
	$srcurl .= " ($date)" if $date;
	if (!(	$compilation =~
			s|(&lt;!-- feed me $page --&gt;)|&lt;a href="$filename"&gt;$page&lt;/a&gt;$srcurl\n|
)) {
	  $compilation =~
		s|(&lt;!-- feed me text --&gt;)|&lt;a href="$filename"&gt;$page&lt;/a&gt;$srcurl\n$1|;
	}
  }
}

open( PAGE, "&gt;index.html" );
print PAGE $compilation;
close( PAGE );

if ( defined( $query )) {
  print "&lt;/pre&gt;&lt;p&gt;All done. Redirecting...&lt;/p&gt;\n";
  print &lt;&lt;"EOT";
&lt;script&gt;document.location = "http://www.waider.ie/avantgo/misc/"&lt;/script&gt;
EOT
}

# This is ghastly, but noone seems to have a nice image processing
# module for Perl that I could use instead.
sub carve_image {
  my ( $name, $type ) = @_;
  my $html = "";

  my  $filename = "${name}_$type";
  $filename =~ s|/|.|g;

  # Make directory FIXME nuke it if it exists
  mkdir $name, 0755 unless -d $name;

  # Convert to a pnm
  if ( $type eq "image/png" ) {
	`pngtopnm $filename &gt; $name/$filename 2&gt;/dev/null`;
  } else {
	`anytopnm $filename &gt; $name/$filename 2&gt;/dev/null`;
  }

  # Get dimensions (use Image::Info for this!)
  $pnmfile = `pnmfile $name/$filename`;
  ( $wide, $high ) = $pnmfile =~ m/:.*?,\s(\d+)\sby\s(\d+).*?/i;

  return qq( &lt;pre&gt;$pnmfile&lt;/pre&gt;\n)
	if !defined( $wide ) or !defined( $high );


  $html = qq(&lt;table cellpadding="0" cellspacing="0" border="0"&gt;\n);

  for ( $y = 0; $y &lt; $high; $y += 140 ) {
	if ( $y + 140 &gt; $high ) {
	  $h = $high - $y;
	} else {
	  $h = 140;
	}

	$html.="&lt;tr&gt;";

	for ( $x = 0; $x &lt; $wide; $x += 150 ) {
	  if ( $x + 150 &gt; $wide ) {
		$w = $wide - $x;
	  } else {
		$w = 150;
	  }

	  `pnmcut $x $y $w $h $name/$filename | ppmquant 256 2&gt;/dev/null | ppmtogif &gt; $name/$ {name}_$ {x}_$ {y}.gif 2&gt;/dev/null`;
	  $html .= qq(&lt;td&gt;&lt;img src="$name/$ {name}_$ {x}_$ {y}.gif" width="$w" height="$h"&gt;&lt;/td&gt;);
	}
	$html.="&lt;/tr&gt;\n";
  }
  $html .= "&lt;/table&gt;\n";

  # Cleanup
  unlink( "$name/$filename" );

  return $html;
}

sub patchurl
  {
	my $base = shift;
	my $url = shift;

	my $uri = new URI $url;

	eval {
	  if ( !defined( $uri-&gt;scheme ) or !$uri-&gt;scheme ) {
		$uri = new URI $url, ($base-&gt;scheme || 'http'); # what the hell?
	  }

	  # Gack! relative URL!
	  if ( $uri-&gt;path !~ m|^/| ) {
		local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1; # gack gack
		$uri = URI-&gt;new($url)-&gt;abs( $base );
	  }

	  if ( !defined( $uri-&gt;host )) {
		$uri-&gt;scheme( $base-&gt;scheme || 'http' );
		$uri-&gt;host( $base-&gt;host );
	  }
	};

	$uri-&gt;scheme( 'http' ) unless $uri-&gt;scheme; # thanks, slashdot

	return $url if $@; # bail out if there's an error.

	$uri-&gt;as_string;
  }

sub p_start
  {
	my $tag = $_[1];
	if (( $_[0] eq "a" ) || ( $_[0] eq "img" ) || ( $_[0] eq "link" ) ||
		( $_[0] eq "script" ) || ( $_[0] eq "form" ) || ( $_[0] eq "input" )) {
	  $tag = "&lt;$_[0]";
	  for my $a ( keys %{$_[2]} ) {
		my $t = $_[2]-&gt;{$a};
		if ( $a =~ /^href|src|action$/i ) {
		  $t = patchurl( $base, $t );
		  $tag .= qq( $a="$t" );
		} else {
		  $tag .= qq( $a="$t" );
		}
	  }
	  $tag =~ s/\s+$//; # just in case
	  $tag .= "&gt;";
	}
	$doc .= $tag;
  }

</pre></body></html>