#!/usr/bin/python
# encoding: utf-8

import os, sys, getopt, signal, time, socket, select, traceback, re, gzip, StringIO

# Version string of this program
VERSION="1.2.6p5"

# Configuration of the sites, keyed by site name. The proxy code in this file
# reads the entries "channels", "connect_retry", "socket", "heartbeat",
# "channel_timeout" and "query_timeout" of each site.
sites = {}
dump_interval = 5 # dump state once every five seconds (None: never dump)

#   .--Proxy---------------------------------------------------------------.
#   |                      ____                                            |
#   |                     |  _ \ _ __ _____  ___   _                       |
#   |                     | |_) | '__/ _ \ \/ / | | |                      |
#   |                     |  __/| | | (_) >  <| |_| |                      |
#   |                     |_|   |_|  \___/_/\_\\__, |                      |
#   |                                          |___/                       |
#   +----------------------------------------------------------------------+
#   |  The actual proxy code                                               |
#   '----------------------------------------------------------------------'

# Runtime state of all sites, keyed by site name (same keys as `sites`)
g_sites = {}

# Example for this dictionary:
# 'mysite': {'channels': [{'since': 1370420497.907061,
#                        'socket': <socket._socketobject object at 0x7f5d0adc39f0>,
#                        'state': 'ready'},
#                       {'since': 1370420502.930155,
#                        'socket': <socket._socketobject object at 0x7f5d0adc3c20>,
#                        'state': 'busy'},
#                       {'since': 1370420492.883395,
#                        'socket': <socket._socketobject object at 0x7f5d0adc3ec0>,
#                        'state': 'heartbeat'}],
#          'clients': [{'since': 1370420648.132495,
#                        'socket': <socket._socketobject object at 0x7f2cc9b68fa0>,
#                        'state': 'idle'},
#                      {'channel': {'client': <Recursion on dict with id=29691200>,
#                        'since': 1370420695.609493,
#                        'socket': <socket._socketobject object at 0x7f2cc9b68a60>,
#                        'state': 'busy'}],
#          'heartbeat': {'channel': None,
#                        'count': 3,
#                        'since': 1370420507.965436},
#          'last_failed_connect': 0,
#          'last_reset': 1370420492.879927,
#          'last_reload' : 1370420492,
#          'last_inventory_update' : 1370420492.879927,
#          'inventory_pid' : None,
#          'cache' : {},
#          'socket': <socket._socketobject object at 0x7f5d0adc3910>,
#          'state': 'ready'}}
#



# The main loop of the daemon goes here

# Request flags checked at the start of every main loop cycle: when one is
# set, liveproxyd_run() calls do_restart() / do_reload() and clears it again.
# NOTE(review): presumably set by signal handlers defined elsewhere - confirm
g_need_restart = False
g_need_reload = False

def liveproxyd_run():
    """Run the main loop of the daemon. Does not return on its own.

    Opens the client sockets once and then repeats one processing cycle
    forever. An exception raised during a cycle is logged and ignored,
    unless opt_debug is set, in which case it is re-raised.
    """
    global g_need_restart, g_need_reload

    open_client_sockets()
    last_dump = 0

    while True:
        try:
            # Pending restart/reload requests are served before anything else
            if g_need_restart:
                do_restart()
                g_need_restart = False
            if g_need_reload:
                do_reload()
                g_need_reload = False

            # Outgoing side: build up channels and keep them alive
            initiate_connections()
            do_heartbeats()

            # Wait at most 0.2 seconds for activity on any of our sockets
            readable, writable = do_select(0.2)

            complete_connections(writable)
            accept_new_clients(readable)
            get_new_requests(readable)
            distribute_requests()
            get_responses(readable) # also heartbeats
            send_responses(writable)
            handle_client_timeouts()
            garbage_collect_sockets()
            collect_inventory_updates()

            # Periodically write the state dump (disabled if interval is None)
            now = time.time()
            if dump_interval is not None and now - last_dump > dump_interval:
                dump_state()
                last_dump = now

        except Exception as e:
            if opt_debug:
                raise

            out_of_fds = "Too many open files" in str(e)
            if out_of_fds:
                log("Too many open files! Please increase the ulimit for 'nofiles'!")
                log("Waiting for 5 seconds...")
                time.sleep(5)
                log("Restarting, in the hope that files will the again be enough for a while.")
                do_restart()

            log("Ignoring exception: %s: %s" % (e, traceback.format_exc()))
            time.sleep(1) # Avoid CPU loop in case of permanent error

def initiate_connections():
    """Start building up missing channels to the target sites. Nonblocking!

    For every site that has fewer channels than configured, and whose last
    failed connection attempt is at least "connect_retry" seconds ago, one
    new TCP connection is initiated per call. The new channel is appended
    in state "connecting"; complete_connections() finishes the job.

    A failed attempt is logged (or re-raised if opt_debug is set) and is
    remembered in "last_failed_connect" of the site. It does not prevent
    connection attempts to the remaining sites.
    """
    import errno # not imported at module level

    for sitename, siteconf in sites.items():
        sitestate = g_sites[sitename]
        channels = sitestate["channels"]
        if len(channels) >= siteconf["channels"]:
            continue # all configured channels already exist
        if time.time() - sitestate["last_failed_connect"] < siteconf["connect_retry"]:
            continue # still in the waiting period after a failure

        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.setblocking(0)
        try:
            s.connect(siteconf["socket"])
        except Exception as e:
            # A nonblocking connect() regularly "fails" with EINPROGRESS
            # ("Operation now in progress"). That is the good case. Not
            # every exception carries an errno, so fetch it defensively.
            in_progress = isinstance(e, socket.error) \
                          and getattr(e, "errno", None) == errno.EINPROGRESS
            if not in_progress:
                sitestate["last_failed_connect"] = time.time()
                s.close() # do not leak the filedescriptor
                if opt_debug:
                    raise
                log("Error: cannot create TCP channel to %s:%d: %s" % (siteconf["socket"] + (e,)))
                continue # go on with the next site

        channels.append({ "socket" : s, "state" : "connecting", "since": time.time()})


def do_heartbeats():
    """Send due heartbeats and detect heartbeat timeouts for all sites.

    Sites without any channel are skipped. The heartbeat configuration of
    a site is a pair (rate, timeout). If a heartbeat is outstanding for
    timeout seconds or more, or if no heartbeat could be sent for twice
    the rate, then the site is disconnected.
    """
    now = time.time()
    for sitename, sitestate in g_sites.items():
        if not sitestate["channels"]:
            continue

        rate, timeout = sites[sitename]["heartbeat"]
        heartbeat = sitestate["heartbeat"]
        pending_channel = heartbeat["channel"]
        elapsed = now - heartbeat["since"]

        # A heartbeat is on its way: we wait for the response
        if pending_channel:
            if elapsed >= timeout:
                log("Heartbeat timeout after %.1f sec on channel %s/%d" % (
                    elapsed, sitename, pending_channel["socket"].fileno()))
                sitestate["last_failed_connect"] = now
                disconnect_from_site(sitename)
            continue

        # No heartbeat on its way: send the next one as soon as it is due
        if elapsed < rate:
            continue
        if send_heartbeat(sitename, sitestate):
            continue
        if elapsed >= 2 * rate:
            log("Could not send heartbeat to %s for %d secs - no free channel" % ( sitename, (2 * rate)))
            disconnect_from_site(sitename)



def send_heartbeat(sitename, sitestate):
    """Send a heartbeat query on the longest unused free channel.

    Returns True if the query has been sent. The channel is then in state
    "heartbeat" and is registered as heartbeat channel of the site. Returns
    False if no channel is in state "ready". A channel where sending fails
    is set to state "error" and the next free channel is tried.
    """
    now = time.time()
    while True:
        free_channels = [ c for c in sitestate["channels"] if c["state"] == "ready" ]
        if not free_channels:
            return False

        # Of several equally old channels min() takes the first one
        channel = min(free_channels, key = lambda c: c["since"])
        try:
            channel["socket"].send("GET status\nKeepAlive: on\nColumns: program_start mk_inventory_last\n\n")
            channel["state"] = "heartbeat"
            channel["since"] = now
            sitestate["heartbeat"]["since"] = now
            sitestate["heartbeat"]["channel"] = channel
            return True
        except Exception as e:
            log("Cannot send heartbeat to channel %s/%d: %s" % (sitename, channel["socket"].fileno(), e))
            channel["state"] = "error" # not "ready" anymore -> next round takes another one


# Close all connections. Site is considered to be dead
def disconnect_from_site(sitename):
    """Reset the state of a site that is considered to be dead.

    All channels and the response cache are dropped, the heartbeat state
    is reset and the site goes back to state "starting". Clients that were
    waiting for a response are put back into the queue for a channel.
    """
    now = time.time()
    log("Site %s is considered dead. Closing all connections." % sitename)

    sitestate = g_sites[sitename]
    sitestate.update({
        "channels"    : [],
        "cache"       : {},
        "state"       : "starting",
        "heartbeat"   : { "since" : now, "channel" : None, "count" : 0},
        "last_reload" : now,
        "last_reset"  : now,
    })

    # The request is still stored in the client, so it can be sent again
    orphaned = [ c for c in sitestate["clients"] if c["state"] == "wait_for_response" ]
    for client in orphaned:
        client["state"] = "wait_for_channel"


def complete_connections(writable):
    """Finish the connection build-up of channels that became writable.

    writable is the list of filedescriptors reported as writable by
    do_select(). A connecting channel in that list is tested by sending
    an empty string. On success it becomes "ready", otherwise "error".
    """
    for sitename, sitestate in g_sites.items():
        for channel in sitestate["channels"]:
            if channel["state"] != "connecting":
                continue
            sock = channel["socket"]
            if sock.fileno() not in writable:
                continue

            try:
                sock.send("")
                sock.setblocking(1) # avoid signals from interrupting us
                log("Channel %s/%d successfully connected" % (sitename, sock.fileno()))
                channel["state"] = "ready"
            except Exception as e:
                log("Failed to connect channel %s/%d: %s" % (sitename, sock.fileno(), e))
                channel["state"] = "error"
                sitestate["last_failed_connect"] = time.time()

# Master/Mega/Central poll(). We thought we were the select() master. But select()
# is limited to 1024 file descriptors in most Python versions. So we rather use
# poll(). If you have many sites and many users you will easily get more
# file descriptors...
def do_select(timeout):
    """Wait for activity on all sockets we are currently interested in.

    timeout is given in seconds. Returns a pair (readable, writable) of
    lists of filedescriptor numbers. If poll() fails the error is logged
    and two empty lists are returned.
    """
    poller = select.poll()

    for sitestate in g_sites.values():
        for channel in sitestate["channels"]:
            state = channel["state"]
            if state == "connecting":
                # outgoing connections currently building up
                poller.register(channel["socket"], select.POLLOUT)
            elif state in ( "busy", "heartbeat" ):
                # responses from channels, also heartbeat responses
                poller.register(channel["socket"], select.POLLIN)

        # new client connections
        poller.register(sitestate["socket"], select.POLLIN)

        for client in sitestate["clients"]:
            state = client["state"]
            if state == "idle":
                # new requests from existing clients
                poller.register(client["socket"], select.POLLIN)
            elif state == "response":
                # clients ready to receive a response
                poller.register(client["socket"], select.POLLOUT)

    try:
        events = poller.poll(timeout * 1000) # poll() wants milliseconds
    except select.error as e:
        log("Error during poll(): %s" % e)
        return [], []

    readable = [ fd for fd, event in events if event & select.POLLIN ]
    writable = [ fd for fd, event in events if event & select.POLLOUT ]
    return readable, writable



def accept_new_clients(readable):
    """Accept one new client on each site whose listening socket is readable.

    readable is the list of filedescriptors reported as readable by
    do_select(). New clients start in state "idle". A failing accept()
    is logged, or re-raised if opt_debug is set.
    """
    for sitename, sitestate in g_sites.items():
        listener = sitestate["socket"]
        if listener.fileno() not in readable:
            continue

        try:
            client_socket, addrinfo = listener.accept()
            client_socket.setblocking(1)
            new_client = {
                "socket" : client_socket,
                "state"  : "idle",
                "since"  : time.time(),
            }
            sitestate["clients"].append(new_client)
        except Exception as e:
            if opt_debug:
                raise
            log("Failed to accept new client for %s: %s" % (sitename, e))

def get_new_requests(readable):
    """Read and classify the next request of all idle clients with input.

    A client has input if its socket is readable or if a previous read
    left an already received follow-up request in "nextrequest". Depending
    on the request the client is closed, gets an error response, is
    answered from the cache or is queued for a free channel.
    """
    for sitename, sitestate in g_sites.items():
        for client in sitestate["clients"]:
            if client["state"] != "idle":
                continue
            if not (client["socket"].fileno() in readable or client.get("nextrequest")):
                continue

            try:
                request = receive_request(sitename, client)
                if not request:
                    # Client has closed the connection
                    client["state"] = "closed"

                elif sitestate["state"] == "starting":
                    respond_client_with_error(client, "Site is currently not reachable.")

                elif not request.startswith(("GET", "COMMAND")):
                    log("Invalid request [%s] from client %s/%d" % (
                        request.replace("\n", "\\n"), sitename, client["socket"].fileno()))
                    client["state"] = "closed"

                elif not request.startswith("COMMAND") and "ResponseHeader: fixed16\n" not in request:
                    respond_client_with_error(client,
                        "Invalid request, you must specify ResponseHeader: fixed16.")

                # Try to find a matching response in our cache
                elif not respond_from_cache(sitename, sitestate, client, request):
                    client["state"] = "wait_for_channel"
                    client["since"] = time.time()
                    # The Cache: header is for us, the site must not see it
                    client["request"] = cache_regex.sub("", request)

            except Exception as e:
                if opt_debug:
                    raise
                log("Cannot read request from client %s/%d: %s" %
                    (sitename, client["socket"].fileno(), e))

# Matches a "Cache:" header line of a request. Group 1 is the value of
# the header. Also used for removing the header from the request.
cache_regex = re.compile("\nCache: *([^\n]*)")
# Matches a "Localtime:" header line of a request
localtime_regex = re.compile("\nLocaltime:[^\n]*")

def respond_from_cache(sitename, sitestate, client, request):
    """Try to answer a request from the response cache of the site.

    Returns True if a cached response has been found. The client is then
    in state "response". Otherwise returns False and client["add_to_cache"]
    tells whether the response is to be cached later: it is False for
    requests without a Cache: header and the cache key otherwise.
    """
    if cache_regex.search(request) is None:
        client["add_to_cache"] = False # do not cache the response
        return False

    # The value of the Cache: header is either a number of seconds (not
    # implemented) or the word "reload", which means: cache until the
    # configuration of the site changed due to a reload. Since "reload"
    # is all we know, the value is currently not looked at.

    # The Localtime: header contains a timestamp and would make every
    # request unique. So it must not be part of the cache key.
    cache_key = localtime_regex.sub("", request)

    cached_response = sitestate["cache"].get(cache_key)
    if not cached_response:
        client["add_to_cache"] = cache_key # get response into cache
        return False

    client["response"] = cached_response
    client["response_offset"] = 0
    client["state"] = "response"
    return True

def distribute_requests():
    """Hand the requests of waiting clients over to free channels.

    Only sites in state "ready" are handled. The clients are served in the
    order of their waiting time. If not more than one channel is ready or
    doing a heartbeat, then the site is set to "busy" and nothing is
    forwarded.
    """
    for sitename, sitestate in g_sites.items():
        if sitestate["state"] != "ready":
            continue

        # Sort after waiting time, we should be fair to all...
        queue = sorted(
            ( client for client in sitestate["clients"]
              if client["state"] == "wait_for_channel" ),
            key = lambda client: client["since"])

        # one channel must always be kept for heartbeat
        channels = sitestate["channels"]
        usable = sum(1 for channel in channels
                     if channel["state"] in ( "ready", "heartbeat" ))
        if usable <= 1:
            sitestate["state"] = "busy"
            continue

        for channel in channels:
            if not queue:
                break
            if channel["state"] == "ready":
                forward_request(sitename, queue.pop(0), channel)

def forward_request(sitename, client, channel):
    """Send the stored request of a client to the site via a channel.

    After a query the client waits for the response and client and channel
    are linked with each other. Commands have no response: the client goes
    back to "idle" immediately and the channel is left untouched. If
    sending fails, the client gets an error response.
    """
    site_socket = channel["socket"]
    try:
        request = client["request"]
        site_socket.send(request)
        if request.startswith("COMMAND"):
            client["request"] = ""
            client["state"] = "idle"
        else:
            client["state"] = "wait_for_response"
            channel["state"] = "busy"
            client["channel"] = channel
            channel["client"] = client
    except Exception as e:
        if opt_debug:
            raise
        log("Error: %s" % e)
        respond_client_with_error(client, str(e))


def respond_client_with_error(client, message):
    """Send an error response with status code 400 to a client.

    The response consists of a fixed16 header and the message. A link
    between the client and a channel is removed and the stored request is
    dropped. Afterwards the client is "idle" again. If the response cannot
    be sent, this is logged (or re-raised if opt_debug is set) and the
    client ends up in state "closed", so that it gets garbage collected.
    """
    response = "400%12d\n%s\n" % (len(message) + 1, message)
    send_failed = False
    try:
        while True:
            try:
                client["socket"].send(response)
                break
            except socket.error as e:
                if e.errno != 4: # 4: Interrupted system call -> simply retry
                    raise
    except Exception as e:
        if opt_debug:
            raise
        log("Cannot send error message to client %d: %s" % (
            client["socket"].fileno(), e))
        send_failed = True

    # Detach the client from its channel. The channel keeps its state. A
    # response that arrives later has no client anymore.
    channel = client.get("channel")
    if channel:
        channel.pop("client", None)
        del client["channel"]

    client["request"] = ""
    # Do not resurrect a client whose connection is broken
    if send_failed:
        client["state"] = "closed"
    else:
        client["state"] = "idle"



# TODO: one malicious client can hang the whole proxy. In order
# to prevent this we'd need partial requests...
def receive_request(sitename, client):
    """Read the next complete request from a client. This blocks!

    A request ends with an empty line. Returns the request including that
    empty line, or None if the client closed or reset the connection
    before the request was complete. Data received beyond the end of the
    request is kept in client["nextrequest"] for the next call.

    Raises socket.error for all errors other than a connection reset and
    an interrupted system call.
    """
    import errno # not imported at module level

    # Note: Multisite can send several requests at once. For example
    # a command and a wait query (reschedule button)
    request = client.get("nextrequest", "")
    client["nextrequest"] = ""
    while "\n\n" not in request:
        try:
            chunk = client["socket"].recv(65536)
        except socket.error as e:
            if e.errno == errno.ECONNRESET: # Connection reset by peer
                return None
            if e.errno == errno.EINTR: # Interrupted system call
                continue # nothing has been read, simply try again
            raise

        if not chunk:
            # log("Client %s/%d closed connection." % (sitename, client["socket"].fileno()))
            return None
        request += chunk

    end = request.index("\n\n") + 2
    client["nextrequest"] = request[end:]
    return request[:end]


def get_responses(readable):
    """Read from all channels whose socket is readable.

    readable is the list of filedescriptors reported as readable by
    do_select(). Data on a "busy" channel is (a part of) the response to a
    client query. Data on any other channel is handled as the response to
    a heartbeat.
    """
    for sitename, sitestate in g_sites.items():
        for channel in sitestate["channels"]:
            if channel["socket"].fileno() not in readable:
                continue
            handler = receive_response if channel["state"] == "busy" else receive_heartbeat
            handler(sitename, channel)



def receive_response(sitename, channel):
    """Read the next chunk of a query response from a busy channel.

    The chunks are collected in channel["response"] until the response is
    as long as its fixed16 header announces. Then the channel is "ready"
    again and the response is handed over to the client, if there still is
    one, and is stored in the cache if the client asked for that.

    If reading fails the channel is "closed". A malformed header or a too
    long response sets the channel to "error". In all three cases the
    client is queued for another channel.
    """
    client = channel.get("client") # None -> client timed out before response!
    if not client:
        log("Response from timed-out client arrived lately on channel %s/%d" %
            (sitename, channel["socket"].fileno()))

    def give_up(channel_state):
        # Channel is unusable. The client has to try again on another one.
        if client:
            client["channel"] = None
            client["state"] = "wait_for_channel"
        channel["state"] = channel_state

    # We always assume fixed16 as response header!
    received_so_far = channel.get("response", "")
    try:
        chunk = channel["socket"].recv(65536)
        if not chunk:
            raise Exception("Connection closed by foreign host")
        response = received_so_far + chunk
        channel["response"] = response
    except Exception as e:
        log("Cannot read response from %s/%d: %s" %
            (sitename, channel["socket"].fileno(), e))
        give_up("closed")
        return

    if len(response) < 16: # header not yet complete
        return

    # Header: three characters status code, then the length of the body
    try:
        bodylength = int(response[3:15])
    except Exception:
        log("Malformed response header from cannel %s/%d: [%s]" %
            (sitename, channel["socket"].fileno(), response[:16]))
        give_up("error")
        return

    expected_length = bodylength + 16
    excess = len(response) - expected_length
    if excess > 0:
        log("Too large response on channel %s/%d (%d exceeding bytes: [%s])" % (
            sitename, channel["socket"].fileno(), excess, response[expected_length:]))
        give_up("error")
        return
    if excess < 0:
        return # body not yet complete

    # Response complete
    sitestatus = g_sites[sitename]
    channel["state"] = "ready"
    sitestatus["state"] = "ready" # at least one channel free
    channel["response"] = ""

    if not client:
        return

    del channel["client"]
    del client["channel"]
    client["response"] = response
    client["response_offset"] = 0
    client["state"] = "response"
    if client["add_to_cache"]:
        cache_key = client["add_to_cache"]
        sitestatus["cache"][cache_key] = response
        del client["add_to_cache"]

def send_responses(writable):
    """Send the next part of their response to all clients ready for it.

    writable is the list of filedescriptors reported as writable by
    do_select(). Not more than 8192 bytes are sent per client and call.
    When the response is complete the client is "idle" again. If sending
    fails the client is set to "error" and its response is dropped.
    """
    for sitename, sitestatus in g_sites.items():
        for client in sitestatus["clients"]:
            if client["state"] != "response":
                continue
            sock = client["socket"]
            if sock.fileno() not in writable:
                continue

            try:
                # ATTENTION: When sending to the client we may block if the
                # receiver slows us down. But we must not block. We need a
                # queue, have to wait with select(), etc.
                offset = client["response_offset"]
                chunk = client["response"][offset:offset + 8192]
                sock.setblocking(0) # experimental: never block while sending
                bytes_sent = sock.send(chunk)
                sock.setblocking(1)
                if bytes_sent <= 0:
                    raise Exception("Could not send any bytes of response to client")

                if offset + bytes_sent == len(client["response"]):
                    # Everything has been sent
                    client["state"] = "idle"
                    del client["response"]
                    del client["response_offset"]
                else:
                    client["response_offset"] += bytes_sent

            except Exception as e:
                if opt_debug:
                    raise
                log("Cannot forward next %d bytes of response to client %s/%d: %s" %
                    (len(chunk), sitename, client["socket"].fileno(), e))
                client["state"] = "error"
                del client["response"]
                del client["response_offset"]

def receive_heartbeat(sitename, channel):
    """Read and evaluate the response to a heartbeat from a channel.

    If the site closed the connection or sent an invalid response, the
    site is disconnected. A valid response makes site and channel "ready",
    updates the heartbeat state, drops the cache if the reload time of the
    site changed and checks for new inventory data. Exceptions are logged,
    or re-raised if opt_debug is set.
    """
    sitestate = g_sites[sitename]
    try:
        chunk = channel["socket"].recv(4096)
        if not chunk:
            log("Channel %s/%d closed by foreign host while reading heartbeat" %
                (sitename, channel["socket"].fileno()))
            disconnect_from_site(sitename)
            return

        # We expect two timestamps (integer): the time of the last
        # configuration change and the time of the last HW/SW-Inventory
        # update. Example responses:
        # 1414747788;0          -> no inventory has ever been done
        # 1414747788;1414747545 -> last inventory was at 1414747545
        malformed = len(chunk) < 12 \
                    or not chunk[0].isdigit() \
                    or chunk[-1] != "\n" \
                    or chunk.count(";") != 1
        if malformed:
            log("Channel %s/%d: invalid response \"%s\" to heartbeat" % (
                sitename, channel["socket"].fileno(), chunk))
            disconnect_from_site(sitename)
            return

        reload_field, inventory_field = chunk.split(";")
        last_reload = int(reload_field)
        mk_inventory_last = int(inventory_field)
        if mk_inventory_last: # Remote site has inventory data
            check_inventory_update(mk_inventory_last, sitename)

        sitestate["state"] = "ready" # at least one connection is up
        channel["state"] = "ready"

        heartbeat = sitestate["heartbeat"]
        heartbeat["since"] = time.time()
        heartbeat["channel"] = None
        heartbeat["count"] += 1

        if sitestate["last_reload"] != last_reload:
            if sitestate["cache"]:
                log("Site %s might have new configuration. Dropping cache" % sitename)
                sitestate["cache"] = {}
            sitestate["last_reload"] = last_reload

    except Exception as e:
        if opt_debug:
            raise
        log("Exception while reading heartbeat from channel %s/%d: %s" %
            (sitename, channel["socket"].fileno(), e))


def handle_client_timeouts():
    """Send an error response to clients that have been waiting too long.

    A client waiting for a free channel times out after "channel_timeout"
    seconds, a client waiting for its response after more than
    "query_timeout" seconds. Both values are taken from the configuration
    of the site.
    """
    now = time.time()
    for sitename, sitestatus in g_sites.items():
        siteconf = sites[sitename]

        for client in sitestatus["clients"]:
            state = client["state"]
            if state == "idle":
                continue

            age = now - client["since"]
            if state == "wait_for_channel":
                if age >= siteconf["channel_timeout"]:
                    respond_client_with_error(client, "Timeout while waiting for free Livestatus channel to site %s." %
                        sitename)
            elif state == "wait_for_response":
                if age > siteconf["query_timeout"]:
                    respond_client_with_error(client, "Timeout while waiting for response from site %s." % sitename)





def open_client_sockets():
    """Create the listening unix socket of each configured site.

    The socket directory is created first if it does not exist.
    """
    socketdir_exists = os.path.exists(opt_socketdir)
    if not socketdir_exists:
        os.makedirs(opt_socketdir)

    for sitename in sites.keys():
        create_unix_socket(sitename)

def create_unix_socket(sitename):
    """Create the unix socket where clients connect to a site.

    The socket is named after the site, lives in the socket directory and
    is stored in the state of the site. A left-over socket file is
    removed first. If the socket cannot be created, bail_out() is called,
    or the exception is re-raised if opt_debug is set.
    """
    path = "/".join((opt_socketdir, sitename))

    if os.path.exists(path):
        log("Removing left-over unix socket %s" % path)
        os.remove(path)

    try:
        s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
        s.bind(path)
        s.listen(10)
    except Exception as e:
        if opt_debug:
            raise
        bail_out("Cannot create unix socket %s: %s" % (path, e))

    sitestate = g_sites[sitename]
    sitestate["socket"] = s

def _drop_dead_endpoints(endpoints):
    # Close the sockets of all entries in state "error" or "closed" and
    # return the list of the remaining entries in their original order.
    alive = []
    for endpoint in endpoints:
        if endpoint["state"] in ( "error", "closed" ):
            try:
                endpoint["socket"].close()
            except Exception:
                pass # best effort, the connection is dead anyway
        else:
            alive.append(endpoint)
    return alive


def garbage_collect_sockets():
    """Remove all channels and clients in state "error" or "closed".

    The sockets of the removed entries are closed explicitly, so that
    freeing the filedescriptors does not depend on the garbage collector.
    A site without any remaining channel is set to state "starting".
    """
    for sitename, sitestate in g_sites.items():
        sitestate["channels"] = _drop_dead_endpoints(sitestate["channels"])
        if not sitestate["channels"]:
            sitestate["state"] = "starting"
        sitestate["clients"] = _drop_dead_endpoints(sitestate["clients"])

def format_time(t):
    """Render a Unix timestamp as local date and time plus its current age.

    Example result: "2013-06-05 10:21:37 ( 12 secs ago)"
    """
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(t))
    age = time.time() - t
    return "%s (%3d secs ago)" % (timestamp, age)


def dump_state():
    """Write a human readable dump of the state of all sites to a file.

    The dump is written to a temporary file with the suffix ".new". That
    file is closed - and thus completely flushed - before it is renamed to
    opt_statedumppath, so readers never see a half-written dump.
    """
    def fd_of_peer(peer):
        # Filedescriptor of the linked client/channel or "none" if there
        # is no link. Note that 0 is a valid filedescriptor.
        if peer:
            return peer["socket"].fileno()
        return "none"

    new_path = opt_statedumppath + ".new"
    with open(new_path, "w") as sf:
        sf.write("----------------------------------------------\n")
        sf.write("Current state:\n")
        now = time.time()
        for sitename, sitestate in g_sites.items():
            sf.write("[%s]\n" % sitename)
            sf.write("  State:                   %s\n" % sitestate["state"])
            sf.write("  Last Reset:              %s\n" % format_time(sitestate["last_reset"]))
            sf.write("  Site's last reload:      %s\n" % format_time(sitestate["last_reload"]))
            sf.write("  Last failed connect:     %s\n" % format_time(sitestate["last_failed_connect"]))
            sf.write("  Cached responses:        %d\n" % len(sitestate["cache"]))
            sf.write("  Last inventory update:   %s\n" % format_time(sitestate["last_inventory_update"]))
            sf.write("  PID of inventory update: %s\n" % sitestate["inventory_pid"])

            sf.write("  Channels:\n")
            for channel in sitestate["channels"]:
                sf.write("    %3d - %-18s-  client: %4s - since: %s\n" %
                    (channel["socket"].fileno(), channel["state"],
                     fd_of_peer(channel.get("client")),
                     format_time(channel["since"])))

            sf.write("  Clients:\n")
            for client in sitestate["clients"]:
                sf.write("    %3d - %-18s- channel: %4s - since: %s\n" %
                    (client["socket"].fileno(), client["state"],
                     fd_of_peer(client.get("channel")),
                     format_time(client["since"])))

            sf.write("  Heartbeat:\n")
            hb = sitestate["heartbeat"]
            sf.write("    heartbeats received: %d\n" % hb["count"])
            if hb["channel"]:
                sf.write("    out since %.1fs\n" % (now - hb["since"]))
            else:
                # First element of the heartbeat configuration is the rate
                sf.write("    next in %.1fs\n" % (sites[sitename]["heartbeat"][0] - (now - hb["since"])))

    # Replace the previous dump in one step
    os.rename(new_path, opt_statedumppath)


#.
#   .--HW/SW-Inventory-----------------------------------------------------.
#   |            ___                      _                                |
#   |           |_ _|_ ____   _____ _ __ | |_ ___  _ __ _   _              |
#   |            | || '_ \ \ / / _ \ '_ \| __/ _ \| '__| | | |             |
#   |            | || | | \ V /  __/ | | | || (_) | |  | |_| |             |
#   |           |___|_| |_|\_/ \___|_| |_|\__\___/|_|   \__, |             |
#   |                                                   |___/              |
#   +----------------------------------------------------------------------+
#   | Synchronization of distributed HW/SW-Inventory                       |
#   '----------------------------------------------------------------------'

def check_inventory_update(mk_inventory_last, sitename):
    """Start a helper process if a site has new HW/SW-Inventory data.

    mk_inventory_last is the time of the last inventory update as reported
    by the site. Nothing happens if opt_inventory_dir is not set, if a
    helper for this site is already running, if we are up-to-date or if
    there currently is no free channel.

    The helper is a forked child that does the update via one of the
    channels and then exits with a status code. It is collected by
    collect_inventory_updates(). The channel is marked as "inventory"
    only after the fork succeeded, so a failing fork() does not leave a
    channel behind that nobody will ever set back to "ready".
    """
    if not opt_inventory_dir:
        return # only works if this is set

    if not os.path.exists(opt_inventory_dir):
        os.makedirs(opt_inventory_dir)

    sitestatus = g_sites[sitename]
    if sitestatus["inventory_pid"]:
        return # Sync already running

    if mk_inventory_last <= sitestatus["last_inventory_update"]:
        return # we are up-to-date

    # Inventory update goes in two steps:
    # 1. Getting the list of all hosts that have new inventory
    # 2. For each of those retrieve the gzipped inventory data and store it
    #    locally
    # We cannot do this here since we must not block our main process!
    # Instead we spawn a helper process that inherits one of the channels,
    # does the update and returns a status code.

    # Find a free channel
    for cha in sitestatus["channels"]:
        if cha["state"] == "ready":
            channel = cha
            break
    else:
        log("Currently no free channel, postponing until next heartbeat")
        return

    # Now create helper process that inherits the filedescriptor of this
    # channel.
    pid = os.fork()
    if pid:
        # Parent: reserve the channel until the helper has finished
        channel["state"] = "inventory"
        sitestatus["inventory_pid"] = pid
        sitestatus["inventory_new_last"] = mk_inventory_last

    else:
        # Child: the exit code tells the parent about success or failure
        try:
            do_inventory_update(sitename, channel, sitestatus["last_inventory_update"])
            sys.exit(0)
        except Exception as e:
            log("Inventory update of %s failed: %s" % (sitename, e))
            sys.exit(1)


def collect_inventory_updates():
    """Collect inventory helper processes that have finished.

    Does not wait for helpers that are still running. When a helper has
    exited, the channels in state "inventory" are "ready" again. If its
    exit status is zero, the new inventory time of the site is taken over.
    """
    for sitename, sitestatus in g_sites.items():
        helper_pid = sitestatus["inventory_pid"]
        if not helper_pid:
            continue

        pid, status = os.waitpid(helper_pid, os.WNOHANG)
        if not pid:
            continue # still running

        sitestatus["inventory_pid"] = None

        # Give the channel of the helper back
        for channel in sitestatus["channels"]:
            if channel["state"] == "inventory":
                channel["state"] = "ready"

        if not status: # success
            sitestatus["last_inventory_update"] = sitestatus.pop("inventory_new_last")


def do_inventory_update(sitename, channel, since):
    # Close all filedescriptors that we do not need.
    needed_fds = [
        channel["socket"].fileno(),
        g_logfile.fileno(),
    ]

    for fd in range(4096):
        if fd not in needed_fds:
            try:
                os.close(fd)
            except:
                pass

    # Get list of all hosts that have more recent inventory data
    query = "GET hosts\n" \
            "Columns: name mk_inventory_last\n" \
            "Filter: mk_inventory_last > %d\n" \
            "KeepAlive: on\n" \
            "ResponseHeader: fixed16\n\n" % \
        since

    channel["socket"].send(query)
    response = get_livestatus_response(channel["socket"])

    entries = [ l.split(";") for l in response.split() ]

    for host_name, timestamp in entries:
        do_inventory_update_of(channel["socket"], sitename, host_name, int(timestamp))


def get_livestatus_response(socket):
    header = socket.recv(16)
    if len(header) != 16:
        raise Exception("Invalid resonse '%s' from remote site" % header)

    status = header[:3]
    length = int(header[3:].strip())
    response = ""
    while(len(response) != length):
        response += socket.recv(4096)

    if status != "200": raise Exception("Livestatus query failed: %s" % response)

    return response


# Update the inventory data of a host *if* its timestamp differs from
# the file that we already have
def do_inventory_update_of(socket, sitename, host_name, timestamp):
    path = opt_inventory_dir + "/" + host_name
    try:
        last_mtime = int(os.stat(path).st_mtime)
        if last_mtime == timestamp:
            log("Inventory of %s is uptodate" % host_name)
            return
    except:
        pass

    query = "GET hosts\n" \
            "Columns: mk_inventory_gz\n" \
            "Filter: name = %s\n" \
            "KeepAlive: on\n" \
            "ResponseHeader: fixed16\n\n" % host_name

    socket.send(query)
    response = get_livestatus_response(socket)
    gzipped_invdata = response[:-1] # drop final linefeed
    file(path + ".gz", "w").write(gzipped_invdata)
    os.utime(path + ".gz", (timestamp, timestamp))
    unzipped_invdata = gzip.GzipFile(fileobj=StringIO.StringIO(gzipped_invdata)).read()
    file(path, "w").write(unzipped_invdata)
    os.utime(path, (timestamp, timestamp))
    log("Site %s: new HW/SW-Inventory data for %s (%d bytes)" % (
            sitename, host_name, len(gzipped_invdata)))


#.
#   .--Daemon/main---------------------------------------------------------.
#   | ____                                      __               _         |
#   ||  _ \  __ _  ___ _ __ ___   ___  _ __    / / __ ___   __ _(_)_ __    |
#   || | | |/ _` |/ _ \ '_ ` _ \ / _ \| '_ \  / / '_ ` _ \ / _` | | '_ \   |
#   || |_| | (_| |  __/ | | | | | (_) | | | |/ /| | | | | | (_| | | | | |  |
#   ||____/ \__,_|\___|_| |_| |_|\___/|_| |_/_/ |_| |_| |_|\__,_|_|_| |_|  |
#   |                                                                      |
#   +----------------------------------------------------------------------+
#   | Global code, daemonize, getopt and usage                             |
#   '----------------------------------------------------------------------'

# Open logfile, fall back to stdout if this is not successfull
def open_log():
    global g_logfile
    try:
        g_logfile = file(opt_logfilepath, "a")
        g_logfile.flush()
    except Exception, e:
        if opt_debug:
            raise
        g_logfile = sys.stderr
        log("Cannot open logfile %s: %s" % (opt_logfilepath, e))

# Send message to logfile. In foreground mode we omit the timestamp
def log(text):
    if type(text) == unicode:
        text = text.encode("utf-8")
    try:
        if not opt_foreground:
            t = time.time()
            g_logfile.write('%s.%06d '  % (time.strftime("%F %T", time.localtime(t)), int(t * 1000000) % 1000000))
        g_logfile.write("%s\n" % text)
        g_logfile.flush()
    except:
        sys.stderr.write("%s\n" % text)

# Log a message, but only in verbose mode
def verbose(text):
    if opt_verbose:
        log(text)


def bail_out(reason):
    log("FATAL ERROR: %s" % reason)
    sys.exit(1)


def usage():
    sys.stdout.write("""Usage: liveproxyd [OPTIONS]

   -v, --verbose        Enable verbose output
   -g, --foreground     Do not daemonize, run in foreground
   -c, --conifg CONFIG  Read configuration from file CONFIG
   --debug              Enable debug mode (let exceptions through)
   -h, --help           Show this help and exit
   -V, --version        Print version and exit

""")


def daemonize(user=0, group=0):
    # do the UNIX double-fork magic, see Stevens' "Advanced
    # Programming in the UNIX Environment" for details (ISBN 0201563177)
    try:
        pid = os.fork()
        if pid > 0:
            # exit first parent
            sys.exit(0)
    except OSError, e:
        sys.stderr.write("Fork failed (#1): %d (%s)\n" % (e.errno, e.strerror))
        sys.exit(1)

    # decouple from parent environment
    # chdir -> don't prevent unmounting...
    os.chdir("/")

    # Create new process group with the process as leader
    os.setsid()

    # Set user/group depending on params
    if group:
        os.setregid(getgrnam(group)[2], getgrnam(group)[2])
    if user:
        os.setreuid(getpwnam(user)[2], getpwnam(user)[2])

    # do second fork
    try:
        pid = os.fork()
        if pid > 0:
            sys.exit(0)
    except OSError, e:
        sys.stderr.write("Fork failed (#2): %d (%s)\n" % (e.errno, e.strerror))
        sys.exit(1)

    sys.stdout.flush()
    sys.stderr.flush()

    si = os.open("/dev/null", os.O_RDONLY)
    so = os.open("/dev/null", os.O_WRONLY)
    os.dup2(si, 0)
    os.dup2(so, 1)
    os.dup2(so, 2)
    os.close(si)
    os.close(so)

    open_log()
    log("Successfully daemonized with PID %d." % os.getpid())

def signal_handler(signum, stack_frame):
    global g_need_restart, g_need_reload
    if signum in [ 2, 3, 15 ]:
        log("Got signal %d. Good bye" % signum)
        sys.exit(0)
    elif signum == 10:
        g_need_restart = True
    elif signum == 12:
        dump_state()
    elif signum == 1:
        g_need_reload = True

def read_configuration():
    if os.path.exists(opt_configfile):
        try:
            execfile(opt_configfile, globals())
        except Exception, e:
            if opt_debug:
                raise
            bail_out("Cannot read configuration file %s: %s" % (
                opt_configfile, e))

    # Create state object for all sites
    now = time.time()
    for sitename, siteconf in sites.items():
        siteconf.setdefault("channels", 5)
        siteconf.setdefault("heartbeat", (5, 2.0))
        siteconf.setdefault("channel_timeout", 3.0)
        siteconf.setdefault("query_timeout", 120.0)
        siteconf.setdefault("connect_retry", 4.0)
        g_sites[sitename] = {
            "state": "starting",
            "channels"    : [],
            "clients"     : [],
            "heartbeat"   : {
                "since"     : time.time(),
                "channel"   : None,
                "count"     : 0,
            },
            "last_reset"            : now,
            "last_reload"           : int(now),
            "last_failed_connect"   : 0,
            "last_inventory_update" : 0,
            "inventory_pid"         : None,
            "cache"                 : {},
        }
        if siteconf["channels"] <= 1:
            raise bail_out("Invalid configuration for site %s: you need at least two channels" % sitename)

def do_restart():
    log("Restarting myself")
    for fd in range(3, 8192):
        try:
            os.close(fd)
        except:
            pass
    os.execvp("liveproxyd", sys.argv)

def do_reload():
    log("Reload not implemented.")

os.unsetenv("LANG")

opt_verbose       = 0
opt_debug         = False
opt_foreground    = False
g_logfile         = sys.stderr

# Please adapt this piece of code
omd_root = os.getenv("OMD_ROOT")
if omd_root:
    opt_logfilepath   = omd_root + "/var/log/liveproxyd.log"
    opt_statedumppath = omd_root + "/var/log/liveproxyd.state"
    opt_pid_file      = omd_root + "/tmp/run/liveproxyd.pid"
    opt_configfile    = omd_root + "/etc/check_mk/liveproxyd.mk"
    opt_socketdir     = omd_root + "/tmp/run/liveproxy"
    opt_inventory_dir = omd_root + "/var/check_mk/inventory"
else:
    curdir = os.path.abspath('.')
    opt_logfilepath   = curdir + "/liveproxyd.log"
    opt_statedumppath = curdir + "/liveproxyd.state"
    opt_pid_file      = curdir + "/liveproxyd.pid"
    opt_configfile    = curdir + "/liveproxyd.mk"
    opt_socketdir     = curdir + "/liveproxy"
    opt_inventory_dir = None # Not supported for non-OMD installations

short_options = "hvVgc"
long_options = [ "help", "version", "verbose", "debug", "foreground", "config=" ]

try:
    opts, args = getopt.getopt(sys.argv[1:], short_options, long_options)

    # first parse modifers
    for o, a in opts:
        if o in [ '-v', '--verbose' ]:
            opt_verbose += 1
        elif o in [ '-d', '--debug' ]:
            opt_debug = True
        elif o in [ '-g', '--foreground' ]:
            opt_foreground = True
        elif o in [ '-c', '--conifg' ]:
            opt_config = a

    # now handle action options
    for o, a in opts:
        if o in [ '-h', '--help' ]:
            usage()
            sys.exit(0)
        elif o in [ '-V', '--version' ]:
            sys.stdout.write("liveproxyd version %s\n" % VERSION)
            sys.exit(0)

    if not opt_foreground:
        daemonize()

    log("----------------------------------------------------------")
    log("Livestatus Proxy-Daemon starting...")

    read_configuration()
    log("Configured %d sites" % len(sites))

    # Create PID file
    file(opt_pid_file, "w").write("%d\n" % os.getpid())

    # Install signal hander
    signal.signal(1,  signal_handler)  # HUP (--> reload)
    signal.signal(2,  signal_handler)  # INT
    signal.signal(3,  signal_handler)  # QUIT
    signal.signal(15, signal_handler)  # TERM
    signal.signal(10, signal_handler)  # USR1
    signal.signal(12, signal_handler)  # USR2
    signal.signal(13, signal.SIG_IGN)  # PIPE

    # Now let's go...
    liveproxyd_run()

    # We reach this point, if the server has been killed by
    # a signal or hitting Ctrl-C (in foreground mode)

    log("Successfully shut down.")
    os.remove(opt_pid_file)
    sys.exit(0)

except Exception, e:
    if opt_debug:
        raise
    bail_out(e)

