#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# urlwatch is a minimalistic URL watcher written in Python
#
# Copyright (c) 2008-2009 Thomas Perl <thp@thpinfo.com>
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

"""Watch web pages and arbitrary URLs for changes"""

# Package name; it is reused for the per-user directory name, the logger
# name and the User-agent string.
pkgname = 'urlwatch'

# Module metadata (printed in the report footer and by --version)
__author__ = 'Thomas Perl <thp@thpinfo.com>'
__copyright__ = 'Copyright 2008-2009 Thomas Perl'
__license__ = 'BSD'
__homepage__ = 'http://thpinfo.com/2008/urlwatch/'
__version__ = '1.7'

# Value sent as the "User-agent" header with every request
user_agent = '%s/%s (+http://thpinfo.com/2008/urlwatch/info.html)' % (pkgname, __version__)

# Configuration section
# Include download errors in the report? This is only the default; the
# -e / --display-errors command line option switches it on.
display_errors = False
# Width (in characters) of the separator lines used in the report
line_length = 75


# File and folder paths
import sys
import os.path

# Per-user locations: everything lives below ~/.<pkgname>
urlwatch_dir = os.path.expanduser(os.path.join('~', '.'+pkgname))
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Split the directory that contains the running script into its parent
# directory ("prefix") and its own name ("bindir"); a script living in a
# directory called "bin" is taken to be a system-wide installation.
(prefix, bindir) = os.path.split(
        os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir != 'bin':
    # Not installed yet: the bundled modules and the examples are expected
    # right next to the script, so make the former importable.
    sys.path.append(os.path.join(prefix, bindir, 'lib'))
    examples_dir = os.path.join(prefix, bindir, 'examples')
else:
    # Installed: the examples are looked up in <prefix>/share/<pkgname>
    examples_dir = os.path.join(prefix, 'share', pkgname, 'examples')

# Templates that get copied into the user's directory on first run
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')

# Code section

# Pick the SHA-1 implementation: "hashlib" where it exists (Python 2.5 and
# above), otherwise the older "sha" module. "have_hashlib" records which
# one got imported so the hashing code can call the right API.
try:
    import hashlib
except ImportError:
    # "sha" is deprecated since Python 2.5 (throws a warning in Python 2.6)
    # Thanks to Frank Palvölgyi for reporting the warning in Python 2.6
    import sha
    have_hashlib = False
else:
    have_hashlib = True

import shutil
import os
import urllib2
import difflib
import datetime
import optparse
import logging
import imp

# Package-wide logger. The logger itself lets every level through (DEBUG
# and up); what actually gets emitted depends on the attached handlers.
log = logging.getLogger(pkgname)
log.setLevel(logging.DEBUG)

class NullHandler(logging.Handler):
    """Logging handler that throws away every record it receives."""

    def emit(self, record):
        """Ignore *record*; nothing is written anywhere."""
        return None

# Keep a do-nothing handler attached so the logger always has a handler,
# even when verbose mode (which adds a console handler) is off.
log.addHandler(NullHandler())

def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Build the report lines for one URL

    type    -- kind of message (e.g. 'changed'); shown in upper case
    url     -- the URL the message is about
    content -- optional body of the message (may span several lines);
               it is converted with str() before use
    summary -- optional list; when given, one entry describing this
               message is appended to it (including the size of the
               content if there is any)
    c, n    -- separator lines consist of the string c repeated n times

    Returns a list of strings, one item per output line; the list ends
    with two empty strings.
    """
    headline = ': '.join([type.upper(), url])
    has_content = content is not None

    if summary is not None:
        entry = headline
        if has_content:
            entry = '%s (%d bytes)' % (headline, len(str(content)))
        summary.append(entry)

    separator = c*n
    lines = [separator, headline]
    if has_content:
        lines.extend([separator, str(content)])
    lines.extend([separator, '', ''])

    return lines


# Script entry point. Steps, in order:
#   1. parse the command line (may override the urls.txt / hooks.py paths)
#   2. create the per-user directories and make sure urls.txt exists
#   3. load the optional filter function from hooks.py
#   4. download every URL, diff it against its cached copy and refresh
#      the cache
#   5. print the summary and the detailed report
if __name__ == '__main__':
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    (options, args) = parser.parse_args(sys.argv)

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    # An explicitly given urls.txt / hooks.py must exist; otherwise give up
    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Create all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        # The example files are copied next to where the real files are
        # expected, keeping their ".example" file names
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print 'Error: You need to create a urls.txt file first.'
        print ''
        print 'Place it in %s' % (urls_txt)
        print 'An example is available in %s' % (urls_txt_fn)
        print ''
        if not options.hooks:
            print 'You can also create %s' % (hooks_py)
            print 'An example is available in %s' % (hooks_py_fn)
            print ''
        # Never overwrite an example that has been copied before
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    headers = {
            'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0

    # The content filter is called as f(url, data) and its return value is
    # what gets compared and cached. Without a usable hooks.py the data is
    # passed through unchanged.
    content_filter = lambda x, y: y

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            content_filter = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')

    # Read the complete URL list up front and close the file right away
    # instead of leaving the open handle to the garbage collector
    urls_file = open(urls_txt)
    try:
        url_lines = urls_file.read().splitlines()
    finally:
        urls_file.close()

    for url in url_lines:
        # Lines starting with "#" are comments; blank lines are ignored
        if url.startswith('#') or url.strip() == '':
            continue

        log.info('processing URL: %s' % url)
        # The cache file of a URL is named after the SHA-1 hex digest of
        # the URL string
        if have_hashlib:
            sha_hash = hashlib.new('sha1')
            sha_hash.update(url)
        else:
            sha_hash = sha.new(url)
        filename = os.path.join(cache_dir, sha_hash.hexdigest())
        try:
            request = urllib2.Request(url, None, headers)
            response = urllib2.urlopen(request)
            try:
                raw_data = response.read()
            finally:
                response.close()
            data = content_filter(url, raw_data)
            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                cache_file = open(filename)
                try:
                    old_data = cache_file.read()
                finally:
                    cache_file.close()
                diff = ''.join(difflib.unified_diff(old_data.splitlines(1), data.splitlines(1)))
                if len(diff) > 0:
                    log.info('%s has changed - adding diff' % url)
                    details += foutput('changed', url, diff, summary)
                else:
                    log.info('%s has not changed' % url)
            else:
                log.info('%s does not exist - url is considered "new"' % filename)
                details += foutput('new', url, None, summary)
            log.info('writing current content of %s to %s' % (url, filename))
            # Close explicitly so the new content is flushed to disk here
            cache_file = open(filename, 'w')
            try:
                cache_file.write(data)
            finally:
                cache_file.close()
        except urllib2.HTTPError, error:
            log.error('got HTTPError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        except urllib2.URLError, error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', url, error, summary)
        # Failed URLs are counted as "watched", too
        count += 1

    end = datetime.datetime.now()

    # Output everything
    # The summary is skipped when it has fewer than two entries
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for index, line in enumerate(summary):
            print '%02d. %s' % (index+1, line)
        print '-'*line_length
        print '\n\n\n'
    else:
        log.info('summary is too short - not printing')
    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
    else:
        log.info('no details collected - not printing')

