#!/usr/bin/perl -w

#################################################################################
# 
# Web Secretary
#
# Retrieves a list of web pages and send the pages via email to
# a designated recipient. It can optionally compare the page with a
# previously retrieved page, highlight the differences and send the
# modified page to the recipient instead.
#
# Copyright (C) 1998  Chew Wei Yih
# Copyright (C) 2004,2005 Baruch Even <baruch@ev-en.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#################################################################################

my $feature_compress = 1;

use HTTP::Status;
use HTTP::Date;
use LWP::UserAgent;
use URI;
eval { require Compress::Zlib; } or $feature_compress=0;
use POSIX qw(strftime);
use File::Spec;
use Getopt::Long;
use Pod::Usage;
use File::Temp qw/tempfile/;
use File::Copy;

my $version = "1.9.0";

# Print introduction
print "Web Secretary Ver $version\n";
print "By Chew Wei Yih Copyleft (c) 1998\n\n";

# Initialize parameters

$help = 0;
$man  = 0;
$urllist = "url.list";
$base = "";

# Parse command line options
GetOptions(
    "urllist=s" => \$urllist,
    "help|?" => \$help,
    "man"    => \$man,
    "base=s" => \$base
);

pod2usage(1) if $help;
pod2usage( -exitstatus => 0, -verbose => 2 ) if $man;

if ($base eq "") {
    if ( -e $urllist ) {
        $base = ".";
    } else {
        $base = $ENV{HOME} . "/.websec";
    }
}

# Remove trailing slash from base, we will add it ourself everywhere needed
$base =~ s/\/$//;

# Prepare pathnames.
$archive = "$base/archive";
mkdir $base,    0750 if !-d $base;
mkdir $archive, 0750 if !-d $archive;
($current_fh, $page_current) = tempfile(DIR=>"$base", UNLINK=>1, SUFFIX=>".html");
$current_fh=-1;

# Red Hat has sendmail in /usr/sbin/sendmail, but LWP::Protocol::mailto
# expects it in /usr/lib/sendmail.  Revert to the Red Hat location if there
# is no executable at the expected location.
use vars qw($LWP::Protocol::mailto::SENDMAIL);
$LWP::Protocol::mailto::SENDMAIL = "/usr/sbin/sendmail"
  unless -x "/usr/lib/sendmail";

# Location of webdiff, if it's in the same directory as websec, use it,
# this enables simply opening the archive and using the program inplace.
use FindBin;
$webdiffbin = "$FindBin::Bin/webdiff";
if ( !-e $webdiffbin ) {
    $webdiffbin = "webdiff";
}

$htmldiffbin = "$FindBin::Bin/htmldiff";
if ( !-e $htmldiffbin ) {
  $htmldiffbin = "./htmldiff";
  if ( !-e $htmldiffbin ) {
      $htmldiffbin = "htmldiff";
  }
}

# prepare digest
@digest = ();
@htmldigest = ();

# Set default values
local %defaults = (
    url        => "",
    auth       => "none",
    name       => "",
    prefix     => "",
    diff       => "webdiff",
    hicolor    => "blue",
    asciimarker => 0,
    ignore     => "none",
    ignoreurl  => "none",
    email      => "",
    emaillink  => "",
    emailerror => 1,
    program    => "",
    programdigest => "",
    proxy      => "",
    proxyauth  => "none",
    randomwait => 0,
    retry      => 3,
    retrywait  => 0,
    timeout    => 20,
    tmin       => 0,
    tmax       => 99999,
    addsubject => "",
    digest     => "false",
    useragent  => "WebSec/$version",
    datefmt    => " - %d %B %Y (%a)",
    mailfrom   => "",
);
%siteinfo = %defaults;

# Default return code
$rc = 0;

my $urllistfile = $base . "/" . "$urllist";
if (! -e $urllistfile) {
    print STDERR "Missing config file $urllistfile, Exiting.\n";
    exit 1;
}

open ARGV, "$urllistfile" unless exists $ARGV[0];

# Loop through input file and process all sites listed
while (<>) {
    chop $_;
    s/^\s*//;
    s/\s*$//;

    # Ignore comments
    next if (m/^#/);
    # Stop with a finish marker
    last if (m/^__END__/);

    # Handle non-empty lines
    if ( length != 0 ) {
        $rc = &HandleInput();
        if ( $rc != 0 ) { last; }
        next;
    }

    # Handle line separators
    $rc = &HandleSite();
    if ( $rc != 0 ) { last; }
    %siteinfo = %defaults;
}

# Process last site if available
if ( $rc == 0 && $siteinfo{url} ne "" ) { $rc = &HandleSite(); }

# Delete temp files
unlink($page_current);

if (@digest) {
    $linkmsg =
      "The contents of the following URLs have changed:\n\n"
      . join ( "\n", @digest ) . "\n";
    $subj = "$addsubject$today";
    &MailMessage( $linkmsg, $subj, $digestEmail, $siteinfo{mailfrom} );
}

if (@htmldigest) {
    ($OUTPAGE, $pagename) = tempfile(DIR=>"$base", UNLINK=>1, SUFFIX=>".html");
    print OUTPAGE "<HTML><BODY>The contents of the following URLs has changed:<P><P>";
    foreach (@htmldigest) { print OUTPAGE "$_<P>\n"; }
    print OUTPAGE "<P></BODY></HTML>";
    close(OUTPAGE);

    &ShowDocument( $program, $pagename);
}


# End of main program
exit $rc;

# Handle setting of parameters
# Params: none
sub HandleInput() {

    # Get keyword, value pair
    ( $keyword, $value ) = split ( /=/, $_, 2 );

    if (not defined $value) {
        print STDERR "Keyword '$keyword' has no value.\n";
        exit 1;
    }
    
    $keyword =~ s/^\s*(.*?)\s*$/$1/;
    $keyword =~ tr/A-Z/a-z/;
    $value   =~ s/^\s*\"?(.*?)\"?\s*$/$1/;

    # Check if valid keyword
    if ( not defined $defaults{$keyword} ) {
        print qq(Unrecognized keyword in line $.: "$_". Keyword="$keyword".\n);
        return -1;
    }

    # Allow values from the environment
    while ($value =~ m/\${([^}]+)}/) {
        if (not exists $ENV{$1}) {
            print STDERR "Used environment variable '$1' but it is not defined, aborting.\n";
            exit 1;
        }
        $value =~ s/\${([^}]+)}/$ENV{$1}/;
    }

    $siteinfo{$keyword} = $value;
    return 0;
}

# Handle downloading, highlighting and mailing of each site.
# Params: none
# Returns: 0 => OK, -1 => Error
sub HandleSite() {

    # Get parameter values for this page
    $url        = $siteinfo{url};
    $name       = $siteinfo{name};
    $prefix     = $siteinfo{prefix};
    $diff       = $siteinfo{diff};
    $hicolor    = $siteinfo{hicolor};
    $ignore     = $siteinfo{ignore};
    $ignoreurl  = $siteinfo{ignoreurl};
    $email      = $siteinfo{email};
    $emailLink  = $siteinfo{emaillink};
    $program    = $siteinfo{program};
    $programdigest = $siteinfo{programdigest};
    $proxy      = $siteinfo{proxy};
    $randomwait = $siteinfo{randomwait};
    $retry      = $siteinfo{retry};
    $retrywait  = $siteinfo{retrywait};
    $timeout    = $siteinfo{timeout};
    $tmin       = $siteinfo{tmin};
    $tmax       = $siteinfo{tmax};
    $addsubject = $siteinfo{addsubject};
    $digest     = $siteinfo{digest};
    $useragent  = $siteinfo{useragent};
    $datefmt    = $siteinfo{datefmt};

    # Get today's date in the format we want.
    $today = strftime $datefmt, localtime;

    # If block without URL, assume parameter setting block and update default
    # values
    if ( $url eq "" ) {
        %defaults = %siteinfo;
        return 0;
    }

    # If essential parameters are not present, abort with error
    if ( $name eq ""
        || $prefix eq ""
        || ( $email eq "" && $emailLink eq "" && $program eq "" ) )
    {
        print "Name, prefix, program or email info missing from URL: $url.\n";
        return -1;
    }

    # Prepare for downloading this page
    print "Processing => $url ($name) ...\n";
    $pagebase            = "$archive/$prefix";
    $page_previous       = "$pagebase.html";
    $page_archive        = "$pagebase.old.html";
    (undef, $outgoing)     = tempfile( SUFFIX => '.html' );
    $page_previousExists = 1;
    open( FILE, $page_previous ) or $page_previousExists = 0;
    # get modification time
    if ($page_previousExists) { $mtime = (stat(FILE))[9]; } 
    close(FILE);
    $subj    = "$addsubject $name$today - $url";
    $webdiff =
"$webdiffbin --basedir=$base --archive=$page_previous --current=$page_current --out=$outgoing "
      . "--hicolor=$hicolor --ignore=$ignore --ignoreurl=$ignoreurl --tmin=$tmin --tmax=$tmax";
     $htmldiff = "$htmldiffbin $page_previous $page_current > $outgoing";

    if ($siteinfo{asciimarker}) {
        $webdiff .= " --asciimarker";
    }

    # Download URL using LWP
    $ua = new LWP::UserAgent;
    $ua->agent($useragent);
    $ua->timeout($timeout);
    $ua->env_proxy;
    if ( $proxy ne "" ) { $ua->proxy( http => $proxy ); }
    $req = PrepareRequest($url);

    # set If-Modified-Since to the modification time of the archive file
    if ($page_previousExists) { $req->header('If-Modified-Since' => time2str($mtime)); }

    # Try up to '$retry' times to download URL
    $counter = 0;
    srand;
    while ( $counter < $retry ) {
        $resp = $ua->request($req);

        if ($resp->code == 304) { 
            print "Document not changed.\n";
            return 0;
        }

        if ( ! $resp->is_success ) {
            $counter++;
            if ( $randomwait > 0 ) {
                $random = int( rand $randomwait ) + 1;
                sleep $random;
            }
            else { sleep $retrywait; }
            next;
        }

        # Leave if there is no refresh header
        if (!$resp->header("Refresh")) { last; }

        # Handle it if the refresh is for zero seconds
        ( $time, $refresh_url ) = split(/;/, $resp->header("Refresh"), 2);
        if ($time > 0) { last; }
        
        # Convert to absolute URL and refetch the page
        ( undef, $refresh_to ) = split(/=/, $refresh_url, 2);
        $newurl = URI->new_abs($refresh_to, $url)->as_string();

        $req = PrepareRequest($newurl);
        # Don't reset the counter, we still want to protect from endless loops
    }

    # If URL is successfully downloaded
    if ( $resp->is_success ) {
        # Check if the data is gzip compressed, decompress if it is.
        if (($resp->content_encoding || "") =~ /gzip/) {
            my $new_content;

            if ($feature_compress) {
                $new_content = Compress::Zlib::memGunzip($resp->content);
            } else {
                $new_content = "Server sent gzip compressed data, and we are missing Compress::Gzip";
            }
            if (defined $new_content) {
                $resp->content($new_content);
                $resp->content_length(length $new_content);
                $resp->content_encoding("");
            }
        }
    
        open( HTML_FILE, ">$page_current" );
        print HTML_FILE "<!-- X-URL: ", $resp->base, " -->\n";
        print HTML_FILE "<BASE HREF= \"", $resp->base, "\">\n";
        my ($type, $charset) = $resp->content_type;
        if ($charset) {
            print HTML_FILE "<meta http-equiv=\"Content-Type\" content=\"", $type, "; ", $charset ,"\"/>\n";
        }
        print HTML_FILE $resp->content;
        close HTML_FILE;

        # set the modification date for later retrieval
        $mtime = $resp->header("Last-Modified");
        if ($mtime) {
            $mtime = str2time($mtime);
            if ($mtime) { # Make sure the time was in a legal format
                    utime($mtime, $mtime, $page_current);
            }
        }

        if ( $diff eq "webdiff" ) {
            if ( $page_previousExists == 1 ) {
                print
"Highlighting differences from previous version of webpage ...\n";
                $rc = system($webdiff);
                if ( $rc != 0 ) {
                    if ( $email ne "" ) {
                        print "Sending highlighted page to $email ...\n";
                        MailDocument( $outgoing, $subj, $email,
                            $siteinfo{mailfrom} );
                    }
                    if ( $emailLink ne "" ) {
                        print "Sending link to $emailLink ...\n";
                        if ( ( $digest ne "no" ) && ( $digest ne "false" ) ) {
                            push @digest, $url;
                            ($digestEmail) or ( $digestEmail = $emailLink );
                        }
                        else {
                            my $filepath = File::Spec->rel2abs($page_previous);
                            $linkmsg =
"The contents of the following URL has changed:\n$url\n\nIt can also be found at:\nfile://$filepath\n";
                            MailMessage(
                                $linkmsg,   $subj,
                                $emailLink, $siteinfo{mailfrom}
                            );
                        }
                    }
                    if ( $program ne "" ) {
                        if ( $programdigest ne "true" ) {
                            ShowDocument( $program, $outgoing );
                        }
                        else {
                            push @htmldigest, "<A HREF=\"".$outgoing."\">Changes for ".$name."</A>".
                                    "&nbsp;<A HREF=\"".$page_archive."\">previous page</A>".
                                    "&nbsp;<A HREF=\"".$page_previous."\">current page</A>".
                                    "&nbsp;<A HREF=\"".$url."\">current page on the net</A><P><P>";
                        }
                    }
                }
                else {
                    print "No changes were detected.\n";
                }
                move $page_previous, $page_archive;
                move $page_current,  $page_previous;
            }
            else {
                print
                  "No previous version for this page. Storing in archive ...\n";
                move $page_current, $page_previous;
            }
        }
        elsif ( $diff eq "htmldiff" )
        {
            if ( $page_previousExists == 1 ) {
                print
"Highlighting differences from previous version of webpage using htmldiff...\n";
                $rc = system($htmldiff);
                if ( $rc != 0 ) {
                    if ( $email ne "" ) {
                        print "Sending highlighted page to $email ...\n";
                        MailDocument( $outgoing, $subj, $email );
                    }
                    if ( $emailLink ne "" ) {
                        print "Sending link to $emailLink ...\n";
                        if ( ( $digest ne "no" ) && ( $digest ne "false" ) ) {
                            push @digest, $url;
                            ($digestEmail) or ( $digestEmail = $emailLink );
                        }
                        else {
                            my $filepath = File::Spec->rel2abs($page_previous);
                            $linkmsg =
"The contents of the following URL has changed:\n$url\n\nIt can also be found at:\nfile://$filepath\n";
                            MailMessage( $linkmsg, $subj, $emailLink );
                        }
                    }
                    if ( $program ne "" ) {
                        if ( $programdigest ne "true" ) {
                            ShowDocument( $program, $outgoing );
                        }
                        else {
                            push @htmldigest, "<A HREF=\"".$outgoing."\">Changes for ".$name."</A>".
                                    "&nbsp;<A HREF=\"".$page_archive."\">previous page</A>".
                                    "&nbsp;<A HREF=\"".$page_previous."\">current page</A>".
                                    "&nbsp;<A HREF=\"".$url."\">current page on the net</A><P><P>";
                        }
                    }
                }
                else {
                    print "No changes were detected.\n";
                }
                move $page_previous, $page_archive;
                move $page_current,  $page_previous;
            }
            else {
                print
                  "No previous version for this page. Storing in archive ...\n";
                move $page_current, $page_previous;
            }
        }
        else {
            if ( $email ne "" ) {
                MailDocument( $page_current, $subj, $email,
                    $siteinfo{mailfrom} );
            }
            if ($page_previousExists) { move $page_previous, $page_archive; }
            rename $page_current, $page_previous;
        }
    }

    # If unable to download URL
    else {
        print "Unable to retrieve page.\n";
        $errmsg =
          "Unable to retrieve $name ($url).\n\n"
          . "Detailed error as follows:\n"
          . $resp->error_as_HTML;

        if ( $email ne "" && $siteinfo{emailerror} ) {
            MailMessage( $errmsg, $subj, $email, $siteinfo{mailfrom} );
        }
        if ( $emailLink ne "" && $siteinfo{emailerror} ) {
            if ( ( $digest ne "no" ) && ( $digest ne "false" ) ) {
                push @digest, "Unable to retrieve: $url";
                ($digestEmail) or ( $digestEmail = $emailLink );
            }
            else {
                MailMessage( $errmsg, $subj, $emailLink, $siteinfo{mailfrom} );
            }
        }
    }

    unlink($outgoing);

    return 0;
}

sub PrepareRequest() {
    my $url = shift (@_);

    $req = new HTTP::Request( 'GET', $url );

    my $auth = $siteinfo{auth};
    if ( $auth ne "none" ) { $req->authorization_basic( split ( /:/, $auth, 2 ) ); }

    my $proxyAuth = $siteinfo{proxyauth};
    if ( $proxyAuth ne "none" ) { $req->proxy_authorization_basic( split ( /:/, $proxyAuth, 2 ) ); }
    
    #$req->push_header("Accept" => "text/html, text/plain, text/*, */*");
    
    my $compress_options = "identity";
    if ($feature_compress) { $compress_options = "gzip, $compress_options"; }
    $req->push_header("Accept-Encoding" => $compress_options);

    return $req;
}

# Mail message
# Params: message, subject, recipient
# Returns: none
sub MailMessage() {
    my $message    = shift (@_);
    my $subject    = shift (@_);
    my @recipients = split /,/, shift (@_);
    my $from       = shift (@_);

    foreach $email (@recipients) {
        $req = HTTP::Request->new( POST => "mailto:" . $email );
        if ( $from ne "" ) {
            $req->header( "From",   $from );
            $req->header( "Sender", $from );
        }
        $req->header( "Subject",      $subject );
        $req->header( "Content-type", "text/plain; charset=us-ascii" );
        $req->header( "Content-Transfer-Encoding", "7bit" );
        $req->header( "MIME-Version",              "1.0" );
        $req->content($message);

        $ua = new LWP::UserAgent;
        $ua->request($req);
    }
}

# Mail HTML document.
# Params: filename, subject, recipient
# Returns: none
sub MailDocument() {
    my $filename   = shift (@_);
    my $subject    = shift (@_);
    my @recipients = split /,/, shift (@_);
    my $from       = shift (@_);
    my $tmpstr     = $/;

    undef $/;
    open( FILE, "$filename" ) or die "Cannot open $filename: $!\n";
    my $content = <FILE>;
    close(FILE);

    foreach $email (@recipients) {
        $req = HTTP::Request->new( POST => "mailto:" . $email );
        if ( $from ne "" ) {
            $req->header( "From",   $from );
            $req->header( "Sender", $from );
        }
        $req->header( "Subject",                   $subject );
        $req->header( "Content-type",              "text/html" );
        $req->header( "Content-Transfer-Encoding", "7bit" );
        $req->header( "MIME-Version",              "1.0" );
        $req->content($content);

        $ua = new LWP::UserAgent;
        my $resp = $ua->request($req);
        die "Error mailing document: ".$resp->message()."\n" if $resp->is_error;
    }

    $/ = $tmpstr;
}

sub ShowDocument() {
    my ( $program, $outgoing ) = @_;
    my $status;

    # special handling for mozilla, try to use remoting...
    if ( $program eq "mozilla" ) {
        $status = system("mozilla -remote \"ping()\"");

        # print "Status after ping: ".$status."\n";

# if ping() returns ne 0, mozilla is not running, we cannot use openurl()
        if ( $status ne 0 ) {
            $status = system( "mozilla", $outgoing );
            if ( $status ne 0 ) {
                print "Running mozilla returned status: " . $status . "\n";
            }
        }
        else {
            $status =
              system(
                "mozilla -remote \"openurl(file:" . $outgoing . ",new-tab)\"" );
            if ( $status ne 0 ) {
                print "Running mozilla returned status: " . $status . "\n";
            }
        }
    }
    elsif ($program eq "konqueror") {
        # konqueror from KDE has a small client application that helps with opening urls
        # run 'kfmclient --commands' for help about the available commandline options
        $status = system( "kfmclient openURL ".$outgoing." text/html" );
        if ( $status ne 0 ) {
            print "Displaying URL in konqueror returned status: " . $status . "\n";
        }
    }
    else {

        # other applications are currently just started
        $status = system( $program, $outgoing );
        if ( $status ne 0 ) {
            print "Application " . $program
              . " returned status: " . $status . "\n";
        }
    }
}

__END__

=head1 NAME

websec - Web Secretary

=head1 SYNOPSIS

websec [options]


=head1 OPTIONS

=over 8

=item B<--help>

Print a brief help message and exits.

=item B<--man>

Prints the manual page and exits.

=item B<--base>

Base directory for configuration (~/.websec by default)

=item B<--urllist>

Use another file for the url list, by default it is "url.list".

=back

=head1 DESCRIPTION

B<websec> is a web page monitoring software.  It will send you a changed web
page with the contents highlighted.

The base directory is the place from which B<websec> will read the config files
and in which it will store its data.

When called without an argument, B<websec> will look for a base directory.
If the current directory has url.list it will use it, otherwise it will try to
use I<$HOME/.websec/>. You can also override this process with the I<--base>
option.

You can add a line like I<AddSubject = [websec]> to url.list, websec will add
I<[websec]> to every subject as a first word when mail is sent. You can then
easily detect this line by a mail filter.

The keywords I<Retry>, I<Retrywait>, and I<Timeout> in url.list lets you specify
the number of times to retry, time to wait between retries, and a timeout
setting.

B<Websec> waits for a random number of seconds between retries up to the value
specified by the I<Randomwait> keyword. This is to prevent websec from being
blocked by websites that perform log analysis to find time similarities between
requests.


=head1 SEE ALSO

/usr/share/doc/websec/README.gz, L<url.list(5)>, L<ignore.list(5)>, L<webdiff(1)>.


=head1 AUTHOR

Victor Chew is the original author of this software,
Baruch Even is continuing the maintenance and
Joop Stakenborg <pa3aba@debian.org> provided this man page, 

=cut

vim:set et ts=4:
