#!/usr/bin/python3
# generate debian/copyright from debian/copyright.template and node_modules
# Author: Martin Pitt <mpitt@debian.org>
#
# package.json license: format uses https://spdx.org/licenses/ identifiers, which
# are mostly compatible with
# https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/#license-specification

import os
import sys
import json
import re
import subprocess
import argparse
from glob import glob

# Packaging directory and copyright template, both located relative to this script
debian_dir = os.path.join(os.path.dirname(__file__), 'debian')
template_file = os.path.join(os.path.dirname(__file__), 'debian', 'copyright.template')


def parse_args():
    '''Parse the command line and return the resulting namespace

    No options are declared here, so only what argparse provides by itself
    (such as --help) is accepted.
    '''
    parser = argparse.ArgumentParser(
        description='Generate debian/copyright file from template and node_modules')
    namespace = parser.parse_args()
    return namespace


def template_licenses(template):
    '''Return set of existing License: short names

    Scans every line of the template text that starts with "License:" and
    collects whatever follows the colon, stripped of surrounding whitespace
    and lower-cased. Lines with nothing after the colon are ignored instead
    of raising IndexError, and "License:MIT" (no space) is accepted.
    '''
    prefix = 'License:'
    ids = set()
    for line in template.splitlines():
        if not line.startswith(prefix):
            continue
        # strip so that trailing whitespace cannot defeat the later lookup
        name = line[len(prefix):].strip().lower()
        if name:
            ids.add(name)
    return ids


def module_license(moddir):
    '''Return License: short name for given module

    Lookup order:
     1. the "license" field of moddir/package.json; a string value is
        returned as-is, except that anything starting with "(MIT OR" is
        reduced to "MIT". The legacy object form {"type": ...} is
        accepted as well.
     2. presence of moddir/MIT-LICENSE.txt
     3. the first regular file matching LICENSE* (sorted by name) or
        COPYING, if it contains the characteristic MIT license phrases

    Raises SystemError if none of these yields a license.
    '''
    # First check if package.json has a "license" field
    try:
        with open(os.path.join(moddir, 'package.json'), encoding='UTF-8') as f:
            declared = json.load(f)['license']
    except (IOError, KeyError):
        declared = None

    # old packages use an object like {"type": "MIT", "url": "..."}
    if isinstance(declared, dict):
        declared = declared.get('type')
    if isinstance(declared, str):
        if declared.startswith('(MIT OR'):
            return 'MIT'
        return declared

    # *LICENSE*/COPYING
    if os.path.exists(os.path.join(moddir, 'MIT-LICENSE.txt')):
        return 'MIT'

    # sort the glob result so the choice of file is deterministic, and skip
    # directories (e.g. LICENSES/) which cannot be read as text
    candidates = sorted(glob(os.path.join(moddir, 'LICENSE*'))) + glob(os.path.join(moddir, 'COPYING'))
    license_file = next((path for path in candidates if os.path.isfile(path)), None)
    if license_file is not None:
        # decode independently of the locale; stray bytes must not abort the run
        with open(license_file, encoding='UTF-8', errors='replace') as f:
            text = f.read()
        if 'Permission is hereby granted,' in text and 'THE SOFTWARE IS PROVIDED "AS IS"' in text:
            return 'MIT'

    raise SystemError('Could not determine license from %s' % moddir)


# Substrings which mark a grep hit as noise rather than a copyright statement
_REJECT_SUBSTR = ('Binary file',
                  '{"version":',
                  'function(',
                  'eval("',
                  '*//*!',
                  '"!="',
                  'Code released under the',
                  'Add copyright statements',
                  'grunt.template.today',
                  'copyright-mark',
                  '@font-face',
                  '<name of author>',
                  'glyph-name=',
                  'copyright holder',
                  'WIPO copyright treaty')


def _clean_copyright_line(mod, line):
    '''Normalize one grep hit; return None if it is noise or nothing is left'''

    # weed out some noise
    if any(substr in line for substr in _REJECT_SUBSTR):
        return None

    line = line.replace('<copyright>', '').replace('</copyright>', '')
    line = line.replace('&copy;', '(c)').replace('copyright = ', '')
    line = re.sub('@license.*Copyright', '', line)
    # drop comment markers and similar decoration around the statement
    line = line.strip(' \t*/\'|,#')
    if not line.endswith('Inc.'):
        # avoids some duplicated lines which only differ in trailing dot
        line = line.strip('.')
    if line.startswith("u'") or line.startswith('u"'):  # Python unicode prefix
        line = line[2:]
    if 'Fixes #' in line:
        return None  # this is from a changelog
    # some noise fine-tuning for specific packages
    if mod == 'bootstrap' and ('https://github.com' in line or line == 'Copyright 2015'):
        return None
    if mod == 'patternfly':
        line = re.sub(' and licensed.*', '', line)

    # an empty remainder would become a blank copyright line
    return line or None


def module_copyright(moddir):
    '''Return Copyrights for given module

    Recursively greps moddir (case-insensitively) for "copyright" followed
    by a number or "(c)", cleans up each hit and collects the distinct
    results. If that finds nothing, the "author" of moddir/package.json is
    used; it may be a plain string or an object with a "name" key. Modules
    below node_modules/@patternfly/ without any result inherit the
    copyrights of node_modules/patternfly.

    Returns a set of strings, each prefixed with a single space.
    Raises SystemError if no copyright could be determined.
    '''
    mod = os.path.basename(moddir)
    copyrights = set()
    try:
        out = subprocess.check_output(['env', '-u', 'LANGUAGE', 'LC_ALL=C.UTF-8', 'grep', '-hri', '--binary-files=without-match', r'copyright.*\([1-9][0-9]\+\|(c)\)'],
                                      cwd=moddir).decode('UTF-8')
    except subprocess.CalledProcessError:
        # grep exits non-zero when there is no match at all
        out = ''

    for line in out.splitlines():
        cleaned = _clean_copyright_line(mod, line)
        if cleaned is not None:
            copyrights.add(' ' + cleaned)  # space prefix for debian/copyright RFC822 format

    if not copyrights:
        # fall back to package.json's author
        try:
            with open(os.path.join(moddir, 'package.json'), encoding='UTF-8') as f:
                author = json.load(f)['author']

            # `author` can be either a string or a dict
            # https://flaviocopes.com/package-json/#author
            if isinstance(author, dict):
                author = author['name']
        except (IOError, KeyError):
            author = None

        # anything but a non-empty string is unusable; treat it like a
        # missing author so that the error below names the module
        if isinstance(author, str) and author.strip():
            copyrights.add(' ' + author)

    if not copyrights and moddir.startswith('node_modules/@patternfly/'):
        return module_copyright('node_modules/patternfly')

    if not copyrights:
        raise SystemError('Did not find any copyrights in %s' % moddir)
    return copyrights

#
# main
#


args = parse_args()

with open(template_file, encoding='UTF-8') as f:
    template = f.read()

license_ids = template_licenses(template)

# node module directory -> set of "dist/<pkg>/*" patterns which bundle it
module_users = {}

for included_modules in sorted(glob("dist/*/included-modules")):
    pkg = os.path.basename(os.path.dirname(included_modules))
    with open(included_modules, encoding='UTF-8') as f:
        for module in f:
            module = module.strip()
            if not module:
                # a blank line would otherwise be taken for the module '',
                # i.e. the current directory with its own package.json
                continue
            # collect all packages pointing to one node module
            module_users.setdefault(module, set()).add(f'dist/{pkg}/*')

# "Copyright:...\nLicense: ..." text -> set of file patterns it applies to
paragraphs = {}
for moddir, users in module_users.items():
    mod_license = module_license(moddir)
    # every license used must already be spelled out in the template
    if mod_license.lower() not in license_ids:
        raise KeyError('%s has license %s which is not in %s' % (moddir, mod_license, template_file))
    copyrights = '\n'.join(sorted(module_copyright(moddir)))
    paragraph = f'Copyright:{copyrights}\nLicense: {mod_license}'
    # collect all modules with identical licence/copyright
    paragraphs.setdefault(paragraph, set()).update(users)

output = []
for paragraph in sorted(paragraphs):
    files = '\n '.join(sorted(paragraphs[paragraph]))
    output.append(f'Files: {files}\n{paragraph}')
npm_section = '\n\n'.join(output)

# force UTF-8 output, even when running in C locale
# every template line containing #NPM is replaced by the generated paragraphs
for line in template.splitlines():
    text = npm_section if '#NPM' in line else line
    sys.stdout.buffer.write(text.encode('UTF-8') + b'\n')
