#!/usr/bin/env python
# Copyright (C) 2010, 2011 Linaro
#
# Author: James Tunnicliffe <james.tunnicliffe@linaro.org>
#
# This file is part of Linaro Image Tools.
#
# Linaro Image Tools is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# Linaro Image Tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with Linaro Image Tools; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301,
# USA.

import os
import re
import urlparse
import logging
import bz2
import linaro_image_tools.fetch_image

RELEASES_WWW_DOCUMENT_ROOT = "/srv/releases.linaro.org/www"
RELEASE_URL = "http://releases.linaro.org/"
OLD_RELEASES_WWW_DOCUMENT_ROOT = "/srv/releases.linaro.org/www/platform"
OLD_RELEASE_URL = "http://releases.linaro.org/platform/"
SNAPSHOTS_WWW_DOCUMENT_ROOT = "/srv/snapshots.linaro.org/www/"
SNAPSHOTS_URL = "http://snapshots.linaro.org/"

class ServerIndexer():
    """Build a database of the files published on the Linaro image servers.

    Directories to index are registered with add_directory_parse_list();
    crawl() then walks each one on the local file system, turns every
    matching ".gz" file into its public URL and records that URL in the
    database. close_and_bzip2() finally produces a compressed copy of the
    database file for download by the image creation tools.
    """

    def reset(self):
        """Forget every directory registered for crawling."""
        self.url_parse = []

    def __init__(self):
        """Start with no registered directories and open the index DB."""
        self.reset()
        self.db_file_name = "server_index"
        self.db = linaro_image_tools.fetch_image.DB(self.db_file_name)

    def regexp_list_matches_some(self, to_search, list):
        """Return True if at least one regexp in *list* is found in
        *to_search* (re.search semantics), False otherwise.

        *list* must not be empty.
        """
        assert len(list), "empty list passed"

        return any(re.search(item, to_search) for item in list)

    def regexp_list_matches_all(self, to_search, list):
        """Return True if every regexp in *list* is found in *to_search*
        (re.search semantics), False otherwise.

        *list* must not be empty.
        """
        assert len(list), "empty list passed"

        return all(re.search(item, to_search) for item in list)

    def crawl(self):
        """Index every registered directory.

        The registration list is first handed to the database, then each
        entry is crawled in registration order; the position of the entry
        in that list is what gets stored alongside each recorded URL.
        """
        log = logging.getLogger("linaro_image_tools")

        self.db.set_url_parse_info(self.url_parse)
        log.info(self.url_parse)

        for index, info in enumerate(self.url_parse):
            table = info["table"]

            log.info("%s %s %s %s %s",
                     info["base_dir"], info["base_url"], table,
                     info["url_validator"], info["url_chunks"])

            self.go(info, table, index)

            log.info("")

    def go(self, info, table, index):
        """Walk one registered directory and record the URLs it publishes.

        info  -- registration dictionary built by add_directory_parse_list().
        table -- name of the destination table (not used by this method; kept
                 so existing callers keep working).
        index -- position of *info* in self.url_parse, stored with each URL.

        Only files whose name ends in ".gz" are considered. A file is
        recorded when its path relative to the base directory matches all
        of the "to match" regexps and none of the "not match" regexps of
        info["url_validator"]; an empty list imposes no constraint.
        The database is committed once the walk is finished.
        """
        log = logging.getLogger("linaro_image_tools")
        root_url = info["base_url"]
        root_dir = info["base_dir"]

        # The validator is the same for every file, so look it up once.
        to_match = info["url_validator"][0]
        not_match = info["url_validator"][1]

        for root, _sub_folders, files in os.walk(root_dir):
            for name in files:
                if not re.search(r"\.gz$", name):
                    continue  # Only compressed artifacts are indexed.

                # Strip the base directory with a path operation rather
                # than a regexp substitution: the directory name contains
                # characters ('.', possibly others) that are regexp
                # metacharacters, and only the leading prefix must go.
                relative_location = os.path.relpath(
                    os.path.join(root, name), root_dir)
                # URLs always use forward slashes.
                relative_location = relative_location.replace(os.sep, "/")

                if to_match and not self.regexp_list_matches_all(
                        relative_location, to_match):
                    continue  # URL doesn't match the validator. Ignore.

                if not_match and self.regexp_list_matches_some(
                        relative_location, not_match):
                    continue  # URL hits an exclusion pattern. Ignore.

                url = urlparse.urljoin(root_url, relative_location)
                url = urlparse.urljoin(url, name)

                log.info(url)
                self.db.record_url(url, index)

        self.dump()

    def dump(self):
        """Commit pending changes to the database."""
        self.db.commit()

    def close_and_bzip2(self):
        """Close the database and write a bzip2 copy next to it.

        The compressed copy is named "<db_file_name>.bz2" and exists to
        make downloads of the index more efficient.
        """
        self.db.close()

        # Open the source first so that a missing database does not leave
        # an empty archive behind, and read it as bytes: it is a binary
        # file, not text.
        with open(self.db_file_name, "rb") as db_file:
            bz2_db_file = bz2.BZ2File(self.db_file_name + ".bz2", "w")
            try:
                # Copy in chunks so the whole database is never held in
                # memory at once.
                for chunk in iter(lambda: db_file.read(1024 * 1024), b""):
                    bz2_db_file.write(chunk)
            finally:
                bz2_db_file.close()

    def add_directory_parse_list(self,
                                 base_dir,
                                 base_url,
                                 url_validator,
                                 db_columns,
                                 table,
                                 url_chunks):
        """Register a directory for crawling and create its table.

        base_dir      -- local directory to walk.
        base_url      -- public URL corresponding to base_dir.
        url_validator -- pair (to_match, not_match) of regexp lists applied
                         to each file's path relative to base_dir.
        db_columns    -- column names handed to the database when the
                         table is created.
        table         -- name of the table to create.
        url_chunks    -- stored with the registration and passed to the
                         database by crawl(); entries are strings or
                         (name, regexp) tuples. Not interpreted here.
        """
        # Every call registers a new entry; the same directory may be
        # registered several times with different validators and tables.
        self.url_parse.append({"base_dir": base_dir,
                               "base_url": base_url,
                               "url_validator": url_validator,
                               "db_columns": db_columns,
                               "url_chunks": url_chunks,
                               "table": table})
        logging.getLogger("linaro_image_tools").info(base_dir)

        self.db.create_table_with_name_columns(table, db_columns)

    def clean_removed_urls_from_db(self):
        """Ask the database to drop URLs that are no longer present."""
        self.db.clean_removed_urls_from_db()

if __name__ == '__main__':
    crawler = ServerIndexer()

    # Console logging for the "linaro_image_tools" logger: bare messages,
    # CRITICAL level on both the logger and the handler.
    logger = logging.getLogger("linaro_image_tools")
    console = logging.StreamHandler()
    console.setLevel(logging.CRITICAL)
    console.setFormatter(logging.Formatter("%(message)s"))
    logger.setLevel(logging.CRITICAL)
    logger.addHandler(console)

    # One entry per directory layout to index, in registration order:
    # (base_dir, base_url, (to_match, not_match), db_columns, table,
    #  url_chunks). The trailing comment on each entry shows an example of
    # the layout it targets.
    parse_list = [
        # linaro-n/ubuntu-desktop/11.09
        # NOTE: "leb" appears twice in the exclusion list; it is kept as-is
        # because the list is handed to the database unchanged.
        (OLD_RELEASES_WWW_DOCUMENT_ROOT,
         OLD_RELEASE_URL,
         ([], ["platform/", "old/", "hwpack",
               "alpha", "beta", "final", "leb",
               "leb", "release-candidate"]),
         ["platform", "image", "build=final"],
         "release_binaries",
         ["", "image", "platform"]),

        # linaro-n/hwpacks/11.09
        (OLD_RELEASES_WWW_DOCUMENT_ROOT,
         OLD_RELEASE_URL,
         (["/hwpacks/"],
          ["alpha", "beta", "final", "leb", "release-candidate"]),
         ["platform", "hardware", "build=final"],
         "release_hwpacks",
         ["", "", "platform", ("hardware", r"hwpack_linaro-(.*?)_")]),

        # 11.10/ubuntu/oneiric-images/ubuntu-desktop/
        # NOT images/...
        (RELEASES_WWW_DOCUMENT_ROOT,
         RELEASE_URL,
         ([r"\d+\.\d+", "ubuntu", "oneiric-images"],
          ["latest/", "platform/", "old/", "hwpack", "^images/"]),
         ["platform", "image", "build=final"],
         "release_binaries",
         ["platform", "", "", "image"]),

        # 11.10/ubuntu/oneiric-hwpacks/
        (RELEASES_WWW_DOCUMENT_ROOT,
         RELEASE_URL,
         ([r"\d+\.\d+", "ubuntu", "oneiric-hwpacks"],
          ["latest/", "platform/", "old/", "^images/"]),
         ["platform", "hardware", "build=final"],
         "release_hwpacks",
         ["platform", "", "", ("hardware", r"hwpack_linaro-(.*?)_")]),

        # oneiric/linaro-o-alip/20111026/0/images/tar/
        (SNAPSHOTS_WWW_DOCUMENT_ROOT,
         SNAPSHOTS_URL,
         (["^oneiric/"], ["/hwpack"]),
         ["platform", "image", "date", "build"],
         "snapshot_binaries",
         ["platform", "image", "date", "build"]),

        # oneiric/lt-panda-oneiric/20111026/0/images/hwpack/
        (SNAPSHOTS_WWW_DOCUMENT_ROOT,
         SNAPSHOTS_URL,
         (["^oneiric/", "/hwpack"], []),
         ["platform", "hardware", "date", "build"],
         "snapshot_hwpacks",
         ["platform", "hardware", "date", "build"]),
    ]

    for parse_entry in parse_list:
        crawler.add_directory_parse_list(*parse_entry)

    # Index everything, prune stale URLs, commit, then publish the
    # compressed copy of the database.
    crawler.crawl()
    crawler.clean_removed_urls_from_db()
    crawler.dump()
    crawler.close_and_bzip2()
