#!/usr/bin/env python

# This script counts the lines of code in various types of source files.
# It has options to print summaries at various levels.

# todo:
# - possibly specify/override/add files/dirs to be ignored on command line

import re
import os
import sys
import stat
import time
import argparse

# Define the various file types and their comment markers (single line and block).
# Each entry defines the following fields:
#  - The file type
#  - A list of matching file name extensions
#  - None or a regex defining a matching file name extension pattern
#  - A list of matching file names
#  - The delimiter defining a comment in a single line (a plain string)
#  - The delimiters defining the start and end of a block comment
#    (two separate fields, each given as a regex pattern)
#  - An indication if the file contains code or other info (1=code, 0=other)
# Similar to cloc there are a few theoretical problems:
# 1. If a quoted string contains comment delimiters, they are recognized
#    as comment delimiters. In principle some regexes could be defined to replace
#    such strings, possibly after first replacing escaped backslashes and quotes.
#    This is quite some work for cases that do not occur in practice.
# 2. A regex like '""".*"""' is greedy, thus a line like """some1"""some2"""
#    is fully matched. In practice such lines are not used.
# See cnt.py-new for an attempt solving these issues.
types = [
    # Compiled languages using '//' line comments and /* ... */ block comments.
    ('C++',       ['cc', 'tcc', 'hcc', 'cpp', 'cxx'], None, [], '//', r'/\*', r'\*/', 1),
    ('C++Hdr',    ['h', 'hpp', 'hxx'],                None, [], '//', r'/\*', r'\*/', 1),
    ('C',         ['c'],                              None, [], '//', r'/\*', r'\*/', 1),
    ('Cuda',      ['cu'],                             None, [], '//', r'/\*', r'\*/', 1),
    ('OpenCL',    ['cl'],                             None, [], '//', r'/\*', r'\*/', 1),
    # Other languages counted as code.
    ('Fortran',   ['f', 'for'],                       None, [], '*',  '', '', 1),
    ('Assembly',  ['m', 'S'],                         None, [], '',   '', '', 1),
    ('Lisp',      ['lisp'],                           None, [], '',   '', '', 1),
    ('SQL',       ['sql'],                            None, [], '--', '', '', 1),
    ('Flex',      ['l', 'll'],                        None, [], '//', r'/\*', r'\*/', 1),
    ('Bison',     ['y', 'yy'],                        None, [], '//', r'/\*', r'\*/', 1),
    ('Python',    ['py', 'python'],                   None, [], '#',  '"""', '"""', 1),
    ('Perl',      ['pl', 'perl'],                     None, [], '#',  '', '', 1),
    # Test files.
    ('test-run',  ['run'],                            None, [], '', '', '', 0),
    ('test-in',   ['in'],               re.compile('in_.*'), [], '', '', '', 0),
    ('test-out',  ['out', 'stdout'],                  None, [], '', '', '', 0),
    # Shell scripts.
    ('sh',        ['sh'],                             None, [], '#', '', '', 0),
    ('bash',      ['bash'],                           None, [], '#', '', '', 0),
    ('csh',       ['csh'],                            None, [], '#', '', '', 0),
    ('tcsh',      ['tcsh'],                           None, [], '#', '', '', 0),
    # Build and configuration files.
    ('CMake',     ['cmake'],                          None, ['CMakeLists.txt'], '#', '', '', 0),
    ('Config',    ['conf', 'cfg', 'dat'],             None, [], '#', '', '', 0),
    ('Component', ['comp'],                           None, [], '#', '', '', 0),
    ('parset',    ['parset'],       re.compile('parset.*'), [], '#', '', '', 0),
    ('log_prop',  ['log_prop'],                       None, [], '#', '', '', 0),
    # Documentation and markup.
    ('rst',       ['rst'],                            None, [], '', '', '', 0),
    ('doxygen',   ['dox'],                            None, ['doxygen.cfg'], '', '', '', 0),
    ('xml',       ['xml', 'xsl', 'xsd'],              None, [], '', '', '', 0),
    ('html',      ['html', 'htm'],                    None, [], '', '', '', 0),
    # Binary files (no extensions; the type is assigned after inspecting the content).
    ('binary',    [],                                 None, [], '', '', '', 0),
    # Files to be ignored.
    ('ignore',    ['log', 'shar', 'tmp', 'ps', 'fig'],
                  re.compile('.*(~|-sav|-new)'),
                  ['templates', 'makefile', 'changelog'], '', '', '', 0),
    # Everything else.
    ('other',     [],                                 None, [], '', '', '', 0),
]


def showTypes (verbose):
    """Print the recognized file types on stdout.

    For each entry in the types table the type name, its code indicator and
    its file name extensions are printed.
    If verbose is true, the extension pattern, the file names and the comment
    markers are printed as well (only those that are defined).
    """
    def emit (label, value):
        # Label and value are separated by a single blank.
        sys.stdout.write ('%s %s\n' % (label, value))

    for (name,exts,extre,filenms,comm,scomm,ecomm,ctyp) in types:
        sys.stdout.write ('%-24s ctyp=%d\n' % (name,ctyp))
        emit ('  file name extensions: ', exts)
        if not verbose:
            continue
        if extre is not None:
            emit ('  extension pattern:    ', extre.pattern)
        if filenms:
            emit ('  file names:           ', filenms)
        if comm:
            emit ('  comment marker:       ', comm)
        if scomm:
            emit ('  start comment block:  ', scomm)
            emit ('  end   comment block:  ', ecomm)


# Regex matching a single word character (letter, digit or underscore).
reAlphaNum  = re.compile(r'\w')

def hasAlphaNum (line):
    """Tell if the line contains at least one word character.

    Returns False for an empty or whitespace-only line; otherwise the result
    of the regex search (a match object, or None if nothing is found).
    Callers only use the truth value.
    """
    if not line.strip():
        return False
    return reAlphaNum.search(line)


# From http://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python
# Might treat UTF-16 files also as binary.
def is_textfile(filename):
    """Tell if a file looks like a text file.

    The file is read in binary mode in chunks of 4096 bytes. It is regarded
    as binary (False is returned) as soon as a NUL byte is found; otherwise
    True is returned (also for an empty file).
    """
    CHUNKSIZE = 4096
    with open(filename, 'rb') as fin:
        while True:
            chunk = fin.read(CHUNKSIZE)
            # Use a bytes literal: the data are read in binary mode, so a
            # text literal cannot be searched for in them on Python 3.
            if b'\0' in chunk:
                return False
            # A short (or empty) chunk means the end of the file is reached.
            if len(chunk) < CHUNKSIZE:
                return True

    
# Return tuple with nr of files, nr of lines, nr of code lines,
# nr of comment lines, nr of blank lines, and nr of header lines.
#  linecomm: comment marker for a single line
#  scomm:    start block comment marker (empty is no block comments)
#  ecomm:    end block comment marker
#  basic:    False = only count lines with >= 1 alphanumeric char and
#                    count header separately
def countcodecomm (filename, linecomm, scomm='', ecomm='', basic=False):
    """Count the code, comment, blank and header lines in a source file.

    filename: name of the file to count
    linecomm: marker starting a single line comment (plain string, empty is none)
    scomm:    start block comment marker as regex pattern (empty is no block comments)
    ecomm:    end block comment marker as regex pattern
    basic:    True  = count each non-blank line as code or comment line
              False = only count lines with >= 1 alphanumeric char and
                      count the header (leading line comments) separately

    Returns the tuple (1, nline, ncode, ncomm, nblank, nhdr), where the first
    value is the nr of files (always 1) and nline is the total nr of lines.
    """
    nhdr   = 0
    nblank = 0
    ncomm  = 0
    ncode  = 0
    nline  = 0
    skipHeader = not basic
    blockComm = False
    if len(scomm) > 0:
        reComm2a = re.compile(r'\s*' + scomm + r'\s*' + ecomm + r'\s*')
        reComm2b = re.compile(r'\s*' + scomm + r'(.*)' + ecomm + r'\s*')
        reSComm = re.compile(scomm)
        reEComm = re.compile(ecomm)
        reTillSComm = re.compile(r'.*' + scomm + r'\s*')
        reTillEComm = re.compile(r'.*' + ecomm + r'\s*')
        reFromSComm = re.compile(r'\s*' + scomm + r'.*')
        reFromEComm = re.compile(r'\s*' + ecomm + r'.*')
    # Use a context manager, so the file is always closed (this function
    # is called for each file in a possibly large directory tree).
    with open(filename) as f:
        # Loop over all lines in the file.
        for line in f:
            nline += 1
            # Skip file header till first non-comment line.
            # This header is usually the licensing info.
            if skipHeader:
                if len(linecomm) > 0  and  line.startswith(linecomm):
                    nhdr += 1
                    continue
                skipHeader = False
            # Remove leading and trailing whitespace (including newline)
            line = line.strip()
            if len(line) == 0:
                nblank += 1
                continue
            # Handle lines in a block comment.
            if blockComm:
                if reEComm.search (line):
                    # End of block comment
                    blockComm = False
                    # Remove the part until the comment marker.
                    # If nothing left, it can be a comment line.
                    l1 = reTillEComm.sub ('', line)
                    if len(l1) == 0  or  (not basic and not hasAlphaNum(l1)):
                        if basic  or  hasAlphaNum(reFromEComm.sub ('', line)):
                            ncomm += 1
                        continue
                    # Something is left after the end marker; handle it below.
                    line = l1
                else:
                    # A line inside a block comment
                    # Only count as comment if an alphanumeric in it
                    if basic  or  hasAlphaNum(line):
                        ncomm += 1
                    continue
            # Test for start of block comment.
            hasBlCom = False
            if len(scomm) > 0:
                # Remove empty block comments on a single line.
                l1 = reComm2a.sub ('', line)
                if len(l1) == 0:
                    # If nothing left, the line contains 'scomm ecomm' only.
                    if basic:
                        ncomm += 1
                    continue
                # Remove non-empty block comments on a single line.
                l1 = reComm2b.sub ('', l1)
                if l1 != line:
                    # Tell if the removed comment text has an alphanumeric.
                    hasBlCom = hasAlphaNum(reComm2b.sub (r'{\1}', line))
                    if len(l1) == 0:
                        # Nothing left; count if appropriate.
                        if basic  or  hasBlCom:
                            ncomm += 1
                        continue
                line = l1
                # Check for the start of a block comment
                if reSComm.search (line):
                    blockComm = True
                    # Remove the part past the comment marker.
                    # If nothing left, it can be a comment line.
                    l1 = reFromSComm.sub ('', line)
                    if len(l1) == 0:
                        if basic  or  hasBlCom  or  hasAlphaNum(reTillSComm.sub ('', line)):
                            ncomm += 1
                        continue
                    line = l1
            # A code or a single comment line
            # Count if it contains an alphanumeric character.
            if basic  or  hasAlphaNum(line):
                if len(linecomm) > 0  and  line.startswith(linecomm):
                    ncomm += 1
                else:
                    ncode += 1
            elif hasBlCom:
                # Also count as comment if there was a comment block.
                ncomm += 1
    return (1, nline, ncode, ncomm, nblank, nhdr)

def printHeader():
    """Write the line with the column titles of the count table on stdout."""
    titles = ('Type','Files','Lines','Code','Comment','Blank','Header','Other')
    sys.stdout.write ('%9s%7s%9s%9s%16s%16s%9s%9s\n' % titles)
    
def printCount(file, type, cnt, ccperc):
    """Write a line with the counts of a file type.

    file:   object with a write function to write the line to
    type:   name of the file type (first column)
    cnt:    sequence with nr of files, lines, code lines, comment lines,
            blank lines and header lines
    ccperc: True  = percentages are relative to the sum of code and comment lines
            False = percentages are relative to the total nr of lines

    If the reference total is zero, only the type and the nr of files are
    written, and nothing at all if the nr of files is zero as well.
    """
    if ccperc:
        total = cnt[2] + cnt[3]    # code + comment
    else:
        total = cnt[1]
    if total > 0:
        codeperc = 100. * cnt[2] / total
        commperc = 100. * cnt[3] / total
        # Lines not counted as code, comment, blank or header.
        nother = cnt[1]-cnt[2]-cnt[3]-cnt[4]-cnt[5]
        fields = (type, cnt[0], cnt[1], cnt[2], codeperc, cnt[3], commperc, cnt[4], cnt[5], nother)
        file.write ('%9s %6d %8d %8d %5.1f%% %8d %5.1f%% %8d %8d %8d\n' % fields)
    elif cnt[0] > 0:
        file.write ('%9s %6d\n' % (type, cnt[0]))

# Count another file.
# If present, use the shebang to derive the file type.
# Otherwise count it as other.
def countother(filename, basic, usecode):
    """Count a file whose type could not be derived from its name.

    If the first line is a shebang naming a known script type (an extension
    in the types table), the file is counted as that type. Otherwise it is
    counted as type 'other', for which only the total nr of lines and the
    nr of blank lines are determined.

    filename: name of the file to count
    basic:    passed on to countcodecomm
    usecode:  True = do not count lines of files not containing code

    Returns the tuple (type, ctyp, counts), where counts is a tuple with the
    nr of files, lines, code lines, comment lines, blank lines and header lines.
    """
    # Compile the regexes once instead of for each line.
    recomm = re.compile(r'#.*')
    rescr = re.compile(r'.*/')
    reenv = re.compile(r'env\s\s*')
    nline  = 0
    nblank = 0
    found  = None
    with open(filename) as f:
        for line in f:
            # A shebang is only meaningful on the first line of a file.
            if nline == 0  and  line.startswith('#!'):
                # Remove shebang, whitespace and comment.
                interp = recomm.sub('', line[2:].strip()).strip()
                # Remove till last slash and optionally env.
                interp = rescr.sub('', interp)
                ext = reenv.sub('', interp).lower()
                # Look for a known file type
                for entry in types:
                    if ext in entry[1]:
                        found = entry
                        break
                if found is not None:
                    break
            if len(line.strip()) == 0:
                nblank += 1
            nline += 1
    # The file is closed here, before it is possibly read again.
    if found is not None:
        (type,exts,extre,filenms,comm,scomm,ecomm,ctyp) = found
        if usecode and ctyp==0:
            return (type, ctyp, (1,0,0,0,0,0))
        return (type, ctyp, countcodecomm (filename,comm,scomm,ecomm,basic))
    # Unknown file type, nothing to be counted.
    if usecode:
        return ('other', 0, (1,0,0,0,0,0))
    return ('other', 0, (1,nline,0,0,nblank,0))

def countfiles(dirname, test, basic, ccperc, verbose, printlevel, level, usecode, dosum):
    """Count the files in a directory and recursively in its subdirectories.

    dirname:    name of the directory to handle
    test:       True = count files directly in a directory named 'test' separately
    basic:      passed on to the counting functions
    ccperc:     passed on to printCount (defines how percentages are calculated)
    verbose:    True = write the counts of each known file on stderr
    printlevel: the counts of a directory are written on stdout if its
                level does not exceed this value
    level:      nesting level of this directory (0 for the top directory)
    usecode:    True = only count files containing code
    dosum:      True = write the sum over all file types instead of a line per type

    Returns a list of two dicts (normal code and test code), each mapping
    a file type name to the list of its six accumulated counts.
    Entries named .svn, .cvs, CVS and doc are skipped, as are symlinks.
    """
    skipped = ['.svn', '.cvs', 'CVS', 'doc']
    # Index 0 holds the normal counts, index 1 the test code counts.
    sums = [{}, {}]
    for entry in types:
        for part in sums:
            part[entry[0]] = [0,0,0,0,0,0]
    inx = 1 if (test  and  os.path.basename(dirname) == 'test') else 0
    for name in os.listdir(dirname):
        if name in skipped:
            continue
        ffile = os.path.join(dirname,name)
        try:
            mode = os.lstat(ffile).st_mode
        except OSError:
            sys.stderr.write ('No such file: %s\n' % ffile)
            continue
        if stat.S_ISLNK(mode):
            # skip symlinks because casacore contains symlink to itself
            continue
        if stat.S_ISDIR(mode):
            # Add the counts of the subdirectory to the counts of this one.
            cnts = countfiles (ffile, test, basic, ccperc, verbose, printlevel, level+1, usecode, dosum)
            for j in (0,1):
                for entry in types:
                    total = sums[j][entry[0]]
                    sub = cnts[j][entry[0]]
                    for i in range(len(total)):
                        total[i] += sub[i]
            continue
        if not stat.S_ISREG(mode):
            continue
        # A regular file; determine its type and count it.
        if not is_textfile(ffile):
            ftype = 'binary'
            ctyp = 0
            cnt = (1,0,0,0,0,0)
        else:
            ext = os.path.splitext(ffile)[1]
            if ext:
                ext = ext[1:]   # remove .
            for (ftype,exts,extre,filenms,comm,scomm,ecomm,ctyp) in types:
                if name in filenms  or  ext in exts  or  (extre is not None and extre.match(ext)):
                    if ftype == 'ignore'  or  (usecode and ctyp==0):
                        cnt = (1,0,0,0,0,0)
                    else:
                        cnt = countcodecomm (ffile,comm,scomm,ecomm,basic)
                    break
            else:
                # No type matches the file name; try the shebang.
                (ftype,ctyp,cnt) = countother (ffile, basic, usecode)
        if usecode  and  ctyp == 0:
            continue
        total = sums[inx][ftype]
        for i in range(len(cnt)):
            total[i] += cnt[i]
        if ftype == 'other':
            sys.stderr.write ('Unknown type: %s\n' % ffile)
        elif verbose:
            sys.stderr.write ('** %s\n' % ffile)
            printCount (sys.stderr, ftype, cnt, ccperc)
    if level <= printlevel:
        bl = level*2*' '
        for j in (0,1):
            first = True
            sumall = [0,0,0,0,0,0]
            for entry in types:
                c = sums[j][entry[0]]
                if c[0] <= 0:
                    continue
                if first:
                    # Write the directory name before its first count.
                    tc = ' testcode' if j==1 else ''
                    sys.stdout.write ('%s%s%s\n' % (bl,dirname,tc))
                    first = False
                if dosum:
                    for i in range(len(c)):
                        sumall[i] += c[i]
                else:
                    printCount (sys.stdout, entry[0], c, ccperc)
            if dosum:
                printCount (sys.stdout, '', sumall, ccperc)
    return sums

def testit():
    """Print the counts of two hard-coded test files, each counted with
    the default and with an explicitly given basic=False argument."""
    cases = [ ('/Users/diepen/testcnt1', '#'),
              ('/Users/diepen/testcnt2', '#', '"""', '"""'),
              ('/Users/diepen/testcnt1', '#', '', '', False),
              ('/Users/diepen/testcnt2', '#', '"""', '"""', False) ]
    for args in cases:
        sys.stdout.write ('%s\n' % (countcodecomm (*args),))
    

if __name__ == '__main__':
    # Define the options.
    parser = argparse.ArgumentParser(prog='PROG')
    parser.add_argument('-b', '--basic', help='count copyright header and lines without an alphanumeric character as code/comment lines', action='store_true')
    parser.add_argument('-c', '--code', help='only use source files containing code (e.g. no .parset)', action='store_true')
    parser.add_argument('-s', '--sum', help='only calculate and print the sum of all file types', action='store_true')
    parser.add_argument('-l', '--limitperc', help='limit to the nr of code and comment lines to determine percentages', action='store_true')
    parser.add_argument('-p', '--printlevel', type=int, default=0, help='first directory level to print (default 0 (=top))')
    parser.add_argument('-d', '--displaytypes', help='display the currently recognized file types (full info with -v)', action='store_true')
    parser.add_argument('-t', '--testinclude', help='do not count test directories separately', action='store_true')
    parser.add_argument('-v', '--verbose', help='print count for each source file', action='store_true')
    parser.add_argument('directory', nargs='?', default='.', help='name of top directory to count source files (default is .)')
    if len(sys.argv) == 1:
        # If nothing given, explain the script and show the options.
        explanation = (
            '',
            'countcode counts per known source file type the number of source lines in the',
            ' files in the given directory and recursively in its subdirectories.',
            'It supports many file types. The type is recognized from the file name extension',
            # The supported types are shown by -d (-s prints the sum only).
            ' or the shebang script type. Use -d to see all supported types.',
            'The following line types are counted:',
            '  code:     pure code lines',
            '  comment:  pure comment lines',
            '  blank:    empty lines or lines containing whitespace only',
            '  header:   the copyright header (leading comment lines)',
            '  other:    all other lines',
            'Unless -b is given, a pure code or comment line has to contain an alphanumeric',
            ' character; e.g., a single } does not count as code line.',
            'It calculates the percentage of code and comment lines in the total number of',
            ' lines or (if -l is given) in the sum of code and comment lines.',
            'Unless -t is given, files in test directories are counted separately.',
            'Normal output is written on stdout; verbose on stderr.',
            'Files with an unknown type are reported on stderr.',
            'Note that -bt should give about the same results as a tool like cloc.',
            '',
        )
        sys.stdout.write ('\n'.join(explanation) + '\n')
        # Let argparse print the help of the options (it exits thereafter).
        parser.parse_args(['-h'])
    else:
        values = parser.parse_args(sys.argv[1:])
        if values.displaytypes:
            showTypes (values.verbose)
        else:
            dirname = values.directory
            test    = not values.testinclude
            # Remove possible trailing slash
            if len(dirname) > 1  and  dirname.endswith('/'):
                dirname = dirname[:-1]
            # The same banner (with the current time) is written before and after the counts.
            banner = '%s   Count %s  test=%d basic=%d limitperc=%d code=%d\n'
            settings = (dirname,test,values.basic,values.limitperc,values.code)
            sys.stdout.write (banner % ((time.ctime(),) + settings))
            printHeader()
            countfiles (dirname, test, values.basic, values.limitperc, values.verbose, values.printlevel, 0, values.code, values.sum)
            printHeader()
            sys.stdout.write (banner % ((time.ctime(),) + settings))
