#!/usr/bin/env python

# THIS FILE IS PART OF THE CYLC SUITE ENGINE.
# Copyright (C) 2008-2017 NIWA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""cylc [util] report-timings [OPTIONS] REG

Retrieve suite timing information for wait and run time performance analysis.
Raw output and summary output (in text or HTML format) are available.  Output
is sent to standard output, unless an output filename is supplied.

Summary Output (the default):
Data stratified by host and batch system that provides a statistical
summary of
    1. Queue wait time (duration between task submission and start times)
    2. Task run time (duration between start and succeed times)
    3. Total run time (duration between task submission and succeed times)
Summary tables can be output in plain text format, or HTML with embedded SVG
boxplots.  Both summary options require the Pandas library, and the HTML
summary option requires the Matplotlib library.

Raw Output:
A flat list of tabular data that provides (for each task and cycle) the
    1. Time of successful submission
    2. Time of task start
    3. Time of task successful completion
as well as information about the batch system and remote host to permit
stratification/grouping if desired by downstream processors.

Timings are shown only for succeeded tasks.

For long-running and/or large suites (i.e. for suites with many task events),
the database query to obtain the timing information may take some time.

"""

import sys
from cylc.remote import remrun
if remrun().execute():
    sys.exit(0)

import cStringIO as StringIO
import contextlib
import os

import cylc.flags
from cylc.cfgspec.globalcfg import GLOBAL_CFG
from cylc.option_parsers import CylcOptionParser as COP
from cylc.rundb import CylcSuiteDAO


@contextlib.contextmanager
def smart_open(filename=None):
    """
    Allow context management for output to either a file or
    to standard output transparently.

    See https://stackoverflow.com/a/17603000

    """
    if filename and filename != '-':
        fh = open(filename, 'w')
    else:
        fh = sys.stdout
    try:
        yield fh
    finally:
        if fh is not sys.stdout:
            fh.close()


def main():
    parser = COP(
        __doc__,
        argdoc=[('REG', 'Suite name')]
    )
    parser.add_option(
        "-r", "--raw",
        help="Show raw timing output suitable for custom diagnostics.",
        action="store_true", default=False, dest="show_raw"
    )
    parser.add_option(
        "-s", "--summary",
        help="Show textual summary timing output for tasks.",
        action="store_true", default=False, dest="show_summary"
    )
    parser.add_option(
        "-w", "--web-summary",
        help="Show HTML summary timing output for tasks.",
        action="store_true", default=False, dest="html_summary"
    )
    parser.add_option(
        "-O", "--output-file",
        help="Output to a specific file",
        action="store", default=None, dest="output_filename")
    options, args = parser.parse_args()

    output_options = [
        options.show_raw, options.show_summary, options.html_summary
    ]
    if output_options.count(True) > 1:
        parser.error('Cannot combine output formats (choose one)')
    if not any(output_options):
        # No output specified - choose summary by default
        options.show_summary = True

    suite = args.pop(0)
    run_db = _get_dao(suite)
    row_buf = format_rows(*run_db.select_task_times())

    with smart_open(options.output_filename) as output:
        if options.show_raw:
            output.write(row_buf.getvalue())
        else:
            if options.show_summary:
                summary = TextTimingSummary(row_buf)
            elif options.html_summary:
                summary = HTMLTimingSummary(row_buf)
            summary.write_summary(output)


def format_rows(header, rows):
    """Write the rows in tabular format to a string buffer.

    Ensure that each column is wide enough to contain the widest data
    value and the widest header value.

    """
    sio = StringIO.StringIO()
    max_lengths = [
        max(data_len, head_len)
        for data_len, head_len in zip(
            [max([len(r[i]) for r in rows]) for i in range(len(header))],
            [len(h) for h in header]
        )
    ]
    formatter = ' '.join(['%%-%ds' % l for l in max_lengths]) + '\n'
    sio.write(formatter % header)
    for r in rows:
        sio.write(formatter % r)
    sio.seek(0)
    return sio


def _get_dao(suite):
    """Return the DAO (public) for suite."""
    suite_log_dir = GLOBAL_CFG.get_derived_host_item(
        suite, 'suite log directory'
    )
    pub_db_path = os.path.join(
        os.path.dirname(suite_log_dir),
        CylcSuiteDAO.DB_FILE_BASE_NAME
    )
    return CylcSuiteDAO(pub_db_path, is_public=True)


class TimingSummary(object):
    """Base class for summarizing timing output from cylc run database."""

    def __init__(self, filepath_or_buffer=None):
        """Set up internal dataframe storage for time durations."""

        self._check_imports()
        if filepath_or_buffer is not None:
            self.read_timings(filepath_or_buffer)
        else:
            self.df = None
            self.by_host_and_batch = None

    def read_timings(self, filepath_or_buffer):
        """
        Set up time duration dataframe storage based on start/stop/succeed
        times from flat database output.

        """
        import pandas as pd
        df = pd.read_csv(
            filepath_or_buffer, delim_whitespace=True, index_col=[0, 1, 2, 3],
            parse_dates=[4, 5, 6]
        )
        self.df = pd.DataFrame({
            'queue_time': (
                df['start_time'] - df['submit_time']).apply(self._dt_to_s),
            'run_time': (
                df['succeed_time'] - df['start_time']).apply(self._dt_to_s),
            'total_time': (
                df['succeed_time'] - df['submit_time']).apply(self._dt_to_s),
        })
        self.by_host_and_batch = self.df.groupby(
            level=['host', 'batch_system']
        )

    def write_summary(self, buf=None):
        """Using the stored timings dataframe, output the data summary."""

        if buf is None:
            buf = sys.stdout
        self.write_summary_header(buf)
        for group, df in self.by_host_and_batch:
            self.write_group_header(buf, group)
            df_reshape = df.unstack('name').stack(level=0)
            df_describe = df.groupby(level='name').describe()
            if df_describe.index.nlevels > 1:
                df_describe = df_describe.unstack()  # for pandas < 0.20.0
            df_describe.index.rename(None, inplace=True)
            for timing_category in self.df.columns:
                self.write_category(
                    buf, timing_category, df_reshape, df_describe
                )
        self.write_summary_footer(buf)

    def write_summary_header(self, buf):
        pass

    def write_summary_footer(self, buf):
        pass

    def write_group_header(self, buf, group):
        pass

    def write_category(self, buf, category, df_reshape, df_describe):
        pass

    def _check_imports(self):
        try:
            import pandas as pd
        except ImportError:
            raise Exception('Cannot import pandas - summary unavailable.')

    @staticmethod
    def _dt_to_s(dt):
        import pandas as pd
        try:
            return dt.total_seconds()
        except AttributeError:
            # Older versions of pandas have the timedelta as a numpy
            # timedelta64 type, which didn't support total_seconds
            return pd.Timedelta(dt).total_seconds()


class TextTimingSummary(TimingSummary):
    """Timing summary in text form."""

    line_width = 80

    def write_group_header(self, buf, group):
        title = 'Host: %s\tBatch System: %s' % group
        buf.write('=' * self.line_width + '\n')
        buf.write(title.center(self.line_width - 1) + '\n')
        buf.write('=' * self.line_width + '\n')

    def write_category(self, buf, category, df_reshape, df_describe):
        buf.write(category.center(self.line_width) + '\n')
        buf.write(('-' * len(category)).center(self.line_width) + '\n')
        buf.write(df_describe[category].to_string())
        buf.write('\n\n')


class HTMLTimingSummary(TimingSummary):
    """Timing summary in HTML form."""

    def write_summary_header(self, buf):
        css = """
            body {
                font-family: Sans-Serif;
                text-align: center;
            }
            h1 {
                background-color: grey;
            }
            h2 {
                width: 85%;
                background-color: #f0f0f0;
                margin: auto;
            }
            table {
                width: 75%;
                margin: auto;
                border-collapse: collapse;
            }
            tr:nth-child(even) {
                background-color: #f2f2f2;
            }
            td {
                text-align: right;
            }
            th, td {
                border-bottom: 1px solid grey;
            }
            svg {
                width: 65%;
                height: auto;
                margin: auto;
                display: block;
            }
            .timing {
                padding-bottom: 40px;
            }
        """

        buf.write('<html><head><style>%s</style></head><body>' % css)

    def write_summary_footer(self, buf):
        buf.write('</body></html>')

    def write_group_header(self, buf, group):
        buf.write('<h1>Timings for host %s using batch system %s</h1>' % group)

    def write_category(self, buf, category, df_reshape, df_describe):
        import matplotlib.pyplot as plt
        buf.write('<div class="timing" id=%s>' % category)
        buf.write('<h2>%s</h2>\n' % (category.replace('_', ' ').title()))
        ax = df_reshape.xs(category, level=3).plot(kind='box', vert=False)
        ax.invert_yaxis()
        ax.set_xlabel('Seconds')
        plt.tight_layout()
        plt.gcf().savefig(buf, format='svg')
        try:
            table = df_describe[category].to_html(
                classes="summary", index_names=False, border=0
            )
        except TypeError:
            # older pandas don't support the "border" argument
            # so explicitly remove it
            table = df_describe[category].to_html(
                classes="summary", index_names=False
            )
            table = table.replace('border="1"', '')
        buf.write(table)
        buf.write('</div>')
        pass

    def _check_imports(self):
        try:
            import matplotlib
            matplotlib.use('Agg')
        except ImportError:
            raise Exception(
                'Cannot import matplotlib - HTML summary unavailable.'
            )
        super(HTMLTimingSummary, self)._check_imports()


if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        if cylc.flags.debug:
            raise
        sys.exit(str(exc))
