#!/usr/bin/perl
#
# All the cool kids are doing RSS. ME TOO!
#
use lib qw( /sw/lib/perl5 );
use lib qw( /sw/lib/perl5/site_perl );
use XML::RSS;
use XML::Atom::Feed;
use LWP::UserAgent;
use Date::Parse;
use POSIX;
use Digest::MD5 qw( md5_hex );
use Data::Dumper;
use strict;
use HTML::Entities;
use Unicode::String qw( utf8 );
use DBI;
use DBD::mysql;
use Storable qw( freeze );
use Encode qw( encode_utf8 decode_utf8 );
use HTML::TokeParser;
use Getopt::Long;

BEGIN {
    # web user will not have HOME set, so be cheeky and assume it's my
    # own homedir that's required.
    if ( !( $ENV{HOME}||"" )) {
        my ( $name, $passwd, $uid, $gid, $quota, $comment, $gcos, $dir, $shell,
             $expire ) = getpwnam( "waider" );
        $ENV{HOME} = $dir;
    }
}
use lib "$ENV{HOME}/src/perl";
use WaiderDotIe qw( getconfig );

# IWBNI:
# * Users, with passwords and preferences and what not.
# * Clean up cache dir/database if a feed is removed
# * Clean out expired items

my $conf = getconfig();

# connect to the database
my $dbh = DBI->connect( "DBI:mysql:database=$conf->{database};host=$conf->{host};port=$conf->{port}" . ( $conf->{ssl} ? ';mysql_ssl=1' : '' ), $conf->{dbuser}, $conf->{dbpass} )
  or die $DBI::errstr;

# in case of flakiness
$dbh->{mysql_auto_reconnect} = 1;

# get the list of feeds + urls
my @urls;
my $feeds = $dbh->selectall_arrayref( 'SELECT name, url, feedid, error FROM feeds' )
  or die $DBI::errstr;
@urls = @{$feeds};

my $debug = 0;
my $reparse = 0;
my $refresh = 0;

GetOptions( 'debug!' => \$debug,
            'url=s' => sub {
                my $url = $_[1];
                my @newurls;
                if ( @newurls = grep  { $_->[1] eq $url } @urls ) {
                    @urls = @newurls;
                } else {
                    $dbh->do( 'INSERT INTO feeds(url) VALUES(?)', undef, $url );
                    my $feedid = $dbh->last_insert_id( undef, undef, undef, undef );
                    @urls = ( [ "New feed", $url, $feedid ] );
                }
            },
            'refresh!' => \$refresh,
            'reparse!' => \$reparse ) or die;
my $only = shift;

$XML::RSS::AUTO_ADD = 1; # force acceptance of extra namespaces

my $ua = new LWP::UserAgent;
$ua->agent( "RSS/0.1 " . $ua->agent );
$ua->env_proxy();

# Piping hot!
$| = 1;

my $fileprefix; # boo. global.

for my $feed ( @urls ) {
    my ( $title, $url, $feedid, $lasterror ) = @{$feed};
    my ( $rss, $channel, $res, $req, $content );
    my $lastupdated = time;

    if ( defined( $only )) {
        print STDERR "> $feedid: $title\n" if $debug;
        next unless $title eq $only;
    }

    # see what we have in the database
    my $feeddata =
      $dbh->selectall_arrayref( 'SELECT feedid, name, url, ' .
                                'UNIX_TIMESTAMP(lastupdate), rawfeed, ' .
                                'etag, modified, cleanfeed, charset ' .
                                'FROM feeds WHERE feedid=?', undef,
                                $feedid );

    # feed failure, just skip to the next one.
    if ( !$feeddata ) {
        next;
    }

    my %thisfeed =
      (
       feedid     => $feeddata->[0]->[0],
       name       => $feeddata->[0]->[1],
       url        => $feeddata->[0]->[2],
       lastupdate => $feeddata->[0]->[3],
       rawfeed    => $feeddata->[0]->[4],
       etag       => $feeddata->[0]->[5],
       modified   => $feeddata->[0]->[6],
       cleanfeed  => $feeddata->[0]->[7],
       charset    => $feeddata->[0]->[8],
      );

    if ( $refresh or !$thisfeed{rawfeed}) {
        $thisfeed{lastupdate} = undef;
        $thisfeed{rawfeed} = undef;
        $thisfeed{etag} = undef;
        $thisfeed{modified} = undef;
        $thisfeed{cleanfeed} = undef;
    }

    print STDERR "Doing '$thisfeed{name}' ($feedid)\n" if $debug;
    $fileprefix = "/var/tmp/";
    if  ( $title ) {
        my $t = $title;
        $t =~ s@[^a-z0-9]@_@gi;
        $fileprefix .= $t;
    } else {
        $fileprefix .= $feedid;
    }

    # First, check against the cachefile whether we need to update or
    # not. This includes checking how often the cachefile itself says we
    # should update. NB this operates on the *cleaned* feed.
    if ( $thisfeed{cleanfeed} ) {
        my $updateFreq = 1800;  # minimum 30 minutes between refreshes
        $rss = new XML::RSS;
        eval {
            $rss->parse( $thisfeed{cleanfeed} ) or warn $!;
        };
        # check if it needs to be refreshed
        $channel = $rss->{channel};
        if ( defined( $channel )) {
            my $syn = $channel->{syn};
            if ( defined( $syn )) {
                $updateFreq = $syn->{updateFrequency};
                my $updatePeriod = $syn->{updatePeriod};

                if ( $updatePeriod eq 'hourly' ) {
                    $updateFreq *= ( 60 * 60 );
                } elsif ( $updatePeriod eq 'daily' ) {
                    $updateFreq *= ( 60 * 60 * 24 );
                } else {
                    warn "No idea what to do with $updatePeriod\n";
                    $updateFreq *= ( 60 * 60 * 24 );
                }
            }
        }

        # now check against the file date
        my $mtime = $thisfeed{lastupdate};
        $lastupdated = $mtime;
        if ( time > $mtime + $updateFreq ) {
            undef $rss;         # forces reload
        } else {
            print STDERR "  not due for update yet\n" if $debug;
            next unless $only or $refresh or $reparse;
        }
    } else {
        print STDERR "  no clean feed found\n" if $debug;
    }

    $lasterror = '' if $refresh;

    if ( !$rss ) {
        $url =~ s/^feed:/http:/;
        print STDERR "  Fetching $url...\n" if $debug;

        $req = new HTTP::Request GET => $url;

        # Support ETag and If-Modified-Since
        # No point if the cache file is missing, mind you.
        if ( $thisfeed{cleanfeed}||'' ) {
            if ( $thisfeed{etag}||'' ) {
                $req->push_header( 'If-None-Match', $thisfeed{etag} );
            }
            if ( $thisfeed{modified}||'' ) {
                $req->push_header( 'If-Modified-Since', $thisfeed{modified} );
            }
        }

        $res = $ua->request( $req );

        # check if we're being redirected...
        my $final_url = $res->base;

        if ( $res->is_success ) {
            $content = $res->content;
            $lastupdated = time;
            my ( $save, $saveres, @saveparams );

            if ( !$final_url->eq( $url )) {
                print STDERR "  looks like it moved to " . $final_url . "\n" if $debug;
                $save = $dbh->prepare( 'UPDATE feeds SET url=?,rawfeed=?,error=NULL WHERE feedid=?' );
                push @saveparams, $final_url;
            } else {
                $save = $dbh->prepare( 'UPDATE feeds SET rawfeed=?,error=NULL WHERE feedid=?' );
            }
            push @saveparams, $content, $thisfeed{feedid};
            $saveres = $save->execute( @saveparams );

            if ( !defined( $saveres )) {
                die $DBI::errstr;
            }

            # save the ETag/Last-Modified bits
            for my $hdr ( "ETag", "Last-Modified", "Content-Type" ) {
                if ( $debug ) {
                    print STDERR "   $hdr: " . ( $res->headers->header( $hdr ) ||
                                          "unset" ) . "\n";
                }
                if ( my $val = $res->headers->header( $hdr )) {
                    if ( $hdr eq "ETag" ) {
                        $save = $dbh->prepare( 'UPDATE feeds SET etag=? WHERE feedid=?' );
                    } elsif ( $hdr eq "Last-Modified" ) {
                        $save = $dbh->prepare( 'UPDATE feeds SET modified=? WHERE feedid=?' );

                        # snag the last-modified date if it's present
                        my $tval = str2time( $val );
                        $lastupdated = $tval if $tval;
                    } else {
                        $save = $dbh->prepare( 'UPDATE feeds SET charset=? WHERE feedid=?' );
                        $thisfeed{charset} = $val;
                    }

                    $saveres = $save->execute( $val, $thisfeed{feedid} );
                    if ( !defined( $saveres )) {
                        warn "saving headers: $DBI::errstr";
                    }
                }
            }
        } else {
            # Not modifed
            if ( $res->code == 304 ) {
                print STDERR "  Page not modified.\n" if $debug;
                next unless $reparse;
            } else {
                print STDERR "  Failed to fetch page: " . $res->code . " " . $res->message . "\n" if $debug;
                next;
            }
        }

        $reparse = 1;
    }

    if ( $reparse ) {
        # if the content is undefined, then snork it up from the cache file.
        if ( !defined( $content ) or $content eq "" ) {
            print STDERR "  Using cached version\n" if $debug;
            if ( $thisfeed{rawfeed}||"" ) {
                $content = $thisfeed{rawfeed};
            } else {
                print STDERR "  No cached feed for $title!\n"
                  if $debug;
            }
        }

        # If we still have no content, flee the premises.
        if ( !defined( $content )) {
            print STDERR "No content\n" if $debug;
            next;
        }

        my ( $base ) = $url =~ m@^(.*://?.+)/@;
        my ( $site ) = $url =~ m@^(.*://?[^/]+)/@;
        my $original = $content;
        my $preparsed;

        my ( $charset, $contenttype );
        if ( $contenttype = $thisfeed{charset} ) {
            if ( $contenttype =~ /\bcharset=([^ ;]+)/ ) {
                print STDERR "  Charset: $1\n" if $debug;
                $charset = $1;
            } else {
                $charset = "";
            }
            $contenttype =~ s/;.*$//;
            print STDERR "  Cleaned Type: $contenttype\n" if $debug;
        }

        if (( $contenttype ne "text/xml" ) &&
            # is it REALLY html?
            ( $content =~ /^<\?xml/ )) {
            $contenttype = "text/xml";
            print STDERR "  Fixed content type to $contenttype\n" if $debug;
        }

        if ( $contenttype ne "text/xml" and $contenttype ne "application/xml" and $contenttype ne "application/atom+xml" ) {
            logerror( "content-type '$contenttype' incorrect", $lasterror, $thisfeed{feedid}, $url, $title, $content );
            next;
        }

        # if it's an Atom feed, switch it to RSS. This is horribly rough.
        if ( $content =~
             m@(http://purl.org/atom|xmlns=["']?http://www.w3.org/2005/Atom)@si ) {
            print STDERR "  Converting to RSS..." if $debug;
            eval {
                $preparsed = atom_to_rss( $content, $url );
            };
            if ( $@ ) {
                logerror( "Atom conversion: " . $@, $lasterror, $feedid, $url, $title||"<unknown>", $content );
                next;
            }
            print STDERR "done.\n" if $debug;
        }

        # throw away everything before the first XML declaration so we
        # can get a clean parse if at all possible.
        $content =~ s/^.*?(<\?xml)/$1/s;

        # should be a one-off, but.
        $content =~ s/&pound/\&amp;pound/gs;

        # debugging
        if ( $debug ) {
            open( PREPARSE, ">$fileprefix.preparse" ) or warn $!;
            binmode( PREPARSE );
            print PREPARSE $content;
            close( PREPARSE );
        }

        print STDERR "  Parsing RSS " if $debug;
        eval {
            # clean up fail-files
            unlink( "$fileprefix.failed" );
            unlink( "$fileprefix.parsed" );
            unlink( "$fileprefix.new" );

            if ( !defined( $preparsed )) {
                $rss = new XML::RSS( version => "1.0", encoding => 'UTF-8',
                                     encode_output => 1 );
                if ( !$rss->parse( $content )) {
                    my $err = "";
                    $err = $@ if $@;
                    $err .= " ($!)" if $! and $! ne "Success";
                    $err ||= "unknown error";
                    # Eval will catch
                    print STDERR " $err\n" if $debug;
                    die "\nparser failed: $err. File in $fileprefix.failed";
                }
            } else {
                $rss = $preparsed;
            }

            print STDERR $rss->{version} . "\n" if $debug;

            if ( $debug ) {
                open( PARSED, ">$fileprefix.parsed" ) or warn $!;
                binmode( PARSED );
                print PARSED Dumper( $rss );
                close( PARSED );
            }

            # pubDate needs to be in strict Mail date format for RSS2
            $channel = $rss->{channel};
            if ( defined( $channel->{pubDate})) {
                print STDERR "  Fixing pubDate for channel: " if $debug;
                my $fixdate = str2time( $channel->{pubDate});
                if ( defined( $fixdate )) {
                    $channel->{pubDate} = strftime( "%a, %d %b %Y %H:%M:%S %z",
                                                    localtime( $fixdate ));
                } else {
                    # best we can do
                    $channel->{pubDate} =~ s/ (\d?\d:\d\d) / $1:00 /;
                }
                print STDERR $channel->{pubDate} . "\n" if $debug;
            }

            # XML::RSS barfs on feed images without titles. But it
            # barfs if there's no image tag, too. What a piece of
            # shit.
            delete $rss->{image}->{url}
            if defined( $rss->{image} );

            # Fix a variety of per-item bogosity
            map {
                # trim title
                $_->{title} =~ s/\s+$//s;
                $_->{title} =~ s/^\s+$//s;

                # ARGH. XML::RSS won't save items with blank titles.
                $_->{title} = "(untitled)" if !( $_->{title});

                # RTE: wtf?
                if ( exists( $_->{item} )) {
                    delete $_->{item};
                }

                # and the fucking stupidity with the entities
                #$_->{title} =~ s/&(?!(amp|gt|lt))/&amp;/gs;

                # RSS 2.0 gives a content:encoded block which contains
                # a formatted version of the post. Ideally I'd like to
                # use this. Both description and content:encoded have
                # entity escapes, though.
                my $ctag = $_->{content};
                if ( defined( $ctag ) and ref $ctag eq "HASH" and
                     defined( $ctag->{encoded})) {
                    $_->{description} = $ctag->{encoded};
                }

                # cope with 2.0 RSS feed.
                if ( defined( $_->{guid})) {
                    # don't overwrite good links
                    $_->{link} = $_->{guid} unless $_->{link};
                }

                # Some of the pubDate stuff is messed up.
                if ( defined( $_->{pubDate} )) {
                    my $fixdate = str2time( $_->{pubDate});
                    if ( defined( $fixdate )) {
                        $_->{pubDate} = strftime( "%a, %d %b %Y %H:%M:%S %z",
                                                  localtime( $fixdate ));
                    } else {
                        # best we can do
                        $_->{pubDate} =~ s/ (\d?\d:\d\d) / $1:00 /;
                    }
                }

                # HURGH. I don't know whose fault this is, but I don't like it.
                #$_->{description} =~ s/=&amp;quot;(.+?)&amp;quot;/="$1"/gis;

                # DIE PUNY HUMANS, so to speak. Turns out Warren ain't
                # alone. also, I should probably do this with a
                # parser.
                $_->{description} =~ s@(href|src|data)="/@$1="$site/@gs;
                $_->{description} =~
                  s@(href|src|data)="(?!(http|ftp))@$1="$base/@gs;

                # final cleanup: nuke leading/trailing space
                $_->{description} =~ s/^\s+//;
                $_->{description} =~ s/\s+$//;
            } @{$rss->{items}};

            if ( @{$rss->{items}}) {
                $rss->{charset} = $charset;
                timestamp( $title, $rss, $lastupdated, $thisfeed{feedid} );
            } else {
                die "No items in feed";
            }

            # this tosspot saves entites as unicode, causing further
            # irritation.
            $rss->{channel}->{title} =~ s/&(?!(amp|gt|lt))/&amp;/gs;
        };
        if ( $@ ) {
            logerror( "RSS parse: " . $@, $lasterror, $thisfeed{feedid}, $url, $title, $content,  );
            next;
        }

        # now check if it changed at all.
        my $newfeed = "";
        eval {
            $newfeed = $rss->as_string;
        };

        if ( $@ ) {
            warn "$title ($url): " . $@;
            if ( $@ =~ /not well-formed.*byte (\d+)|mismatched tag at line \d+, column \d+, byte (\d+)/ ) {
                print STDERR "Excerpt:\n";
                print STDERR substr( $content, $1 - 20, 20 ) . "=>" . substr( $content, $1, 1 ) . "<=" . substr( $content, $1 + 1, 20 );
                print STDERR "\n";
            } else {
                print STDERR Dumper( $rss );
            }
            open( SAVED, ">$fileprefix.failed" ) or warn $!;
            binmode( SAVED );
            print SAVED $content;
            close( SAVED );

            next;
        }

        if (( $thisfeed{cleanfeed}||'') ne $newfeed ) {
            my $save = $dbh->prepare( 'UPDATE feeds SET name=?,cleanfeed=? WHERE feedid=?' );
            my $saveres = $save->execute( $rss->{channel}->{title}||$title,
                                          $newfeed, $thisfeed{feedid} )
              or warn "Saving $title: $DBI::errstr";
        }

        # otherwise clean up any existing failed files
        unlink( "$fileprefix.failed" );

        if ( $debug ) {
            open( SAVED, ">$fileprefix.new" ) or warn $!;
            binmode( SAVED );
            print SAVED Dumper( $newfeed );
            close( SAVED );
        }

        # housekeeping: the hdr file is invalid if we couldn't
        # successfully parse the RSS feed. On the other hand, if we did
        # parse the feed we don't need the fetched file.
        # no real database equivalent for this... check if there are
        # any items in the new feed, perhaps. We can also erase the
        # rawfeed column.
    } else {
        print STDERR "  Using data from " . scalar( localtime( $lastupdated )) .
            "\n" if $debug;
    }

    # make the timestamp on the file correct.
    $dbh->do( 'UPDATE feeds SET lastupdate=FROM_UNIXTIME(?) WHERE feedid=?',
              undef, $lastupdated, $thisfeed{feedid} ) or
                warn( "timestamp update for $title failed ($DBI::errstr)" );
}

# Attempt to attach timestamps to untimestamped feeds. Won't do
# anything useful the first time through.
sub timestamp {
    my ( $feed, $rss, $lastupdated, $feedid ) = @_;
    my $expire = $dbh->do( 'UPDATE items SET active=0 WHERE feedid=' .
                           $feedid );

    print STDERR "  Timestamping $feed\n" if $debug;

    for my $item (@{$rss->{items}}) {
        my ( $itemid, $ts, $date );

        # fixme: this should be configurable
        if ( defined( $item->{dc} ) and defined( $item->{dc}->{date})) {
            $date = rss_date_to_unix( $item->{dc}->{date} );
            print STDERR "   Using dc date " .
              scalar( localtime( $date )) . "\n" if $debug;
        } elsif ( defined( $item->{pubDate})) {
            $date = rss_date_to_unix( $item->{pubDate});
            print STDERR "   Using pubdate " .
              scalar( localtime( $date )) . "\n" if $debug;
        } else {
            my ( $y, $m, $d );
            my $guid = $item->{guid}||$item->{link};
            my $title = $item->{title};
            my $desc = $item->{description};

            if ( defined( $guid )) {
                # dnalounge
                ( $y, $m, $d ) = $guid =~
                  m{(\d{4})/(\d{2}).html#(\d{2})};
                goto GLOM if $y and $m and $d;
            }

            if ( defined( $title )) {
                # arcamax doonesbury feed
                ( $m, $d, $y ) = $title =~
                  m{Doonesbury (\d+)/(\d+)/(\d+)};
                goto GLOM if $y and $m and $d;

                # doonesbury, I hate you
                next if $title eq "Past Stories";

                # jerkcity
                ( $d, $m, $y ) = $title =~
                  m{\b(\d{1,2})-(\w{3})-(\d{4})\b};
                goto GLOM if $y and $m and $d;

                # NTK
                ( $y, $m, $d ) = $title =~
                  m{\b(\d{4})-(\d{2})-(\d{2})};
                goto GLOM if $y and $m and $d;
            }

            if ( defined( $desc )) {
                # john shirley
                ( $d, $m, $y ) = $desc =~
                  m{\b(\d{1,2})-(\w{3})-(\d{4})\b};
                goto GLOM if $y and $m and $d;
            }

          GLOM:
            $date = sprintf( "%04d-%02d-%02d", $y, $m, $d )
              if $y and $m and $d;

            if ( defined( $date )) {
                $date = str2time( $date );
            }
        }

        if ( !defined( $date )) {
            my $feedfunc = $feed;
            $feedfunc =~ s/[^a-zA-Z]//gs;
            eval '$date = ts_' . $feedfunc . '( $item );';
            print STDERR "   Faked date from item: $date\n" if
              ( $debug && defined( $date ));
            $date = rss_date_to_unix( $date ) if $date;
        }

        # stuff dc->date
        if ( defined( $date ) and !defined( $item->{dc})) {
            my %dc;
            # stupid W3CDTF can't handle timezones :(
            $dc{date} = strftime( "%Y-%m-%dT%H:%M:%S GMT", gmtime( $date ));
            $item->{dc} = \%dc;
            print STDERR "   adding dc:date $date => " . $dc{date} . "\n"
              if $debug;
            $ts = $date;
        }

        # need to fake up a datestamp, catering for potential in-line utf8.
        # rawarticlehash is quite obviously a poor name at this point.
        my $text = $item->{description};
        my $p = new HTML::TokeParser( \$text );
        my $c = "";
        while ( my $t = $p->get_token()) {
            next unless $t->[0] eq "T";
            $c .= $t->[1];
        }

        my $digest =
          md5_hex( encode_utf8(( squish( $item->{title}||'' )) .
                               ( squish( $c )) .
                               ( $item->{link}||'' )));

        print STDERR "   Calculated digest $digest\n" if $debug;
# nice and all as this is, it's not preserved by the stupid conversion
#        if ( defined( $item->{guid} )) {
#            $digest = $item->{guid};
#            print "   using GUID\n" if $debug;
#        }
        my $itemdata = $dbh->selectall_arrayref( 'SELECT itemid, unix_timestamp(ts) FROM items WHERE feedid=? AND rawarticlehash=?', undef, $feedid, $digest );

        if ( defined( $itemdata ) and @{$itemdata} ) {
            $itemid = $itemdata->[0]->[0];
            $ts = $itemdata->[0]->[1] unless $ts;
        } else {
            $ts = $lastupdated unless $ts;
        }

        if ( defined( $itemid )) {
            print STDERR "   wait, this is already in the database!\n"
              if $debug;
        }

        my $stamp;
        if ( !defined( $itemid )) {
            print STDERR "   generating timestamp for " .
              encode_utf8( $item->{title} ) . "\n"
              if $debug;
            $stamp = $dbh->prepare( 'REPLACE INTO items(feedid,rawarticle,rawarticlehash,subject,body,url,ts,active) VALUES(?,?,?,?,?,?,FROM_UNIXTIME(?),1)' );
            #, { mysql_is_blob => [ 0, 1, 0, 0 ] });
            # existing date overrides
            if ( defined( $date )) {
                print STDERR "   Using item date " . scalar( localtime( $date )) . "\n" if $debug;
                $ts = $date;
            }
            my $frozen = freeze( $item );
            my $res = $stamp->execute( $feedid, $frozen, $digest,
                                       $item->{title},
                                       $item->{description},
                                       $item->{link}, $ts );
        } else {
            $stamp = $dbh->prepare( 'UPDATE items set feedid=?,rawarticle=?,rawarticlehash=?,subject=?,body=?,url=?,ts=FROM_UNIXTIME(?),active=1 WHERE itemid=?' );
            my $frozen = freeze( $item );
            my $res = $stamp->execute( $feedid, $frozen, $digest,
                                       $item->{title},
                                       $item->{description},
                                       $item->{link},
                                       $ts, $itemid );
        }

        my $newid = $dbh->last_insert_id( "", "rss", "items", "itemid" );
        print STDERR "   inserted as item $newid\n" if $debug and $newid and
          $newid != ( $itemid || 0 );

        if ( !defined( $item->{dc})) {
            my %dc;
            #$dc{date} = strftime( '%Y-%m-%dT%H:%M+0000', gmtime( $ts )); # XXXX
            # stupid W3CDTF can't handle timezones :(
            $dc{date} = strftime( "%Y-%m-%dT%H:%M:%S GMT", gmtime( $ts ));
            $item->{dc} = \%dc;
            print STDERR "   patching in time $ts => " . $dc{date} . "\n"
              if $debug;
        }
        print STDERR "\n" if $debug;
    }

    # now clean up anything that's not active
    #my $expire = $dbh->do( 'DELETE FROM items WHERE active=0' );
}

# convert rss date to unix time_t
sub rss_date_to_unix {
    my $date = shift;
    my $inp = $date;

    # RSS 2.0 uses a totally different date format. On the plus side,
    # Date::Parse should be able to handle it unchanged.
    if ( $date =~ /^\d+-/ ) {
        # Blogger's variation: 2003-07-16 17:44:13Z
        $date =~ s/[T ](\d+:\d+):\d+Z$/T$1+00:00/; # Z = Zulu time = GMT
    }

    # str2date vs ISO8601
    $date =~ s/(\d)T(\d)/$1 $2/;
    $date =~ s/(\d\d):(\d\d)$/$1$2/;

    my $outp = str2time( $date );
    print STDERR "   in: $inp out: $outp\n" if $debug;

    $outp;
}

# per-feed timestamps
# if a sub called ts_FeedName exists, it'll get called to try and
# extract a usable timestamp from an RSS item.
sub ts_Cloudiness {
    my $item = shift;
    my ( $date ) = $item->{link} =~ m|.*blog/(.+)#|;
    if ( defined( $date )) {
        $date =~ s@/@-@g;
        $date =~ s@$@T00:00-0600@;
    }
    $date;
}

sub ts_KevLyda {
    my $item = shift;
    my ( $date ) = $item->{link} =~ m|.*blog/(.+)#|;
    if ( defined( $date )) {
        $date =~ s@/@-@g;
        $date =~ s@$@T00:00+0000@;
    }
    $date;
}

# maybe it's time I just folded all these into ts_Bloxsom and added a
# bloxsom detector...
sub ts_Nanocrew {
    my $item = shift;
    my ( $date ) = $item->{link} =~ m|.*blog/(.+)#|;
    if ( defined( $date )) {
        $date =~ s@/@-@g;
        $date =~ s@$@T00:00+0000@;
    }
    $date;
}

sub ts_DNALounge {
    my $item = shift;
    my $date = $item->{title};
    if ( defined( $date )) {
        $date =~ s/ \(.*\)//;
        $date = str2time( $date );
        if ( defined( $date )) {
            $date = strftime( '%Y-%m-%dT%H:%M-0800', gmtime( $date ));;
        }
    }
    $date;
}

sub ts_LungFish {
    my $item = shift;
    my ( $date ) = $item->{link} =~ m/^.*\.(\d+)$/;
    if ( defined( $date )) {
        $date = strftime( '%Y-%m-%dT%H:%M-0800', gmtime( $date ));;
    }
    $date;
}

sub ts_RedMeat {
    my $item = shift;
    my ( $date ) = $item->{link} =~ m/redmeat\/(\d{4}-\d{2}-\d{2})/;
    if ( defined( $date )) {
        $date = str2time( $date );
        if ( defined( $date )) {
            $date = strftime( '%Y-%m-%dT%H:%M-0000', localtime( $date ));;
        }
    }
    return undef;
}

sub atom_to_rss {
    my $feed = shift;
    my $url = shift;

    $feed =~ s/^.*?<\?xml/<?xml/gs;

    # thanks, guys, that'll do nicely
    $feed =~ s@<title(\s+type=["']text['"])>(.*?)</title>@"<title$1>". encode_entities($2) ."</title>"@egs;

    print STDERR "  Parsing feed...\n" if $debug;
    my $atom = XML::Atom::Feed->new( \$feed ) or die "new(): " . $!;
    return undef if !defined( $atom->title );

    my $rss = new XML::RSS(
                           version => '1.0',
                           encoding => 'UTF-8',
                           encode_output => 1,
                          );
    my @links = $atom->link;
    while ( @links ) {
        last if $links[0]->type eq "text/html";
        shift @links;
    }

    print STDERR "  building XML::RSS object\n" if $debug;

    $rss->channel
      (
       title => $atom->title,
       link => ( defined( $links[0] ) ? $links[0]->href :
                 $url ),
       description => $atom->subtitle,
       dc => {
              date => $atom->modified || $atom->updated,
              language => $atom->language,
             }
      );

    for my $entry ( $atom->entries ) {
        print STDERR "  adding entry\n" if $debug;

        if ( !defined( $entry->content )) {
            if ( !defined( $entry->summary )) {
                $entry->content( "no content" );
            } else {
                $entry->content( $entry->summary );
            }
        }

        my %item = (
            title => $entry->title,
            link => $entry->link->href,
            description => $entry->content->body,
            dc => {
                date => $entry->issued || $entry->updated,
            },
            );

        if ( $entry->author ) {
            $item{dc}->{creator} = $entry->author->name;
        }

        $rss->add_item(  %item );
    }

    return $rss;
}

sub squish {
    my $text = shift;

    $text =~ s/[^[:word:][:space:]]//gs;

    # whitespace cleanup
    $text =~ s/ +/ /gs;
    $text =~ s/^\s+//gs;
    $text =~ s/\s+$//gs;

    $text;
}

sub logerror {
    my $newerror = shift;
    my $lasterror = shift;
    my $feed = shift;
    my $url = shift;
    my $title = shift;
    my $content = shift;

    my $errst = $dbh->prepare( "UPDATE feeds SET error=? WHERE feedid=?" );

    if ( $newerror ) {
        my $res = $errst->execute( substr( $newerror, 0, 255 ), $feed );
        if ( !$res ) {
            die $DBI::errstr;
        }

        # did this break already?
        if ( !$lasterror ) {
            chomp( $newerror );
            warn "$title ($url): " . $newerror;
            if ( $newerror =~ /not well-formed.*byte (\d+)|at line \d+, column \d+, byte (\d+)/ ) {
                my $offset = ( $1 || $2 );
                print STDERR "Excerpt around byte $offset:\n";
                print STDERR substr( $content, $offset - 20, 20 ) . "=>" . substr( $content, $offset, 1 ) . "<=" . substr( $content, $offset + 1, 20 );
                print STDERR "\n";
            }
        } else {
            print STDERR "  feed failed and already logged:\n" if $debug;
            print STDERR "  Last Error:\n  $lasterror\n  New Error:\n  $newerror\n" if $debug;
        }
        open( SAVED, ">$fileprefix.failed" ) or warn $!;
        binmode( SAVED );
        print SAVED $content;
        close( SAVED );
    } else {
        my $res = $errst->execute( "", $feed );
        die $DBI::errstr unless $res;
    }
}

