#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2020-07-09 21:12:32 +0300 (Thu, 09 Jul 2020) $ 
#$Revision: 8170 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.0.1/scripts/cif_hkl_COD_number $
#------------------------------------------------------------------------------
#*
#* Parse a CIF file and print out the essential information in the COD
#* CIF format
#*
#* USAGE:
#*    $0 --options input1.cif input*.cif
#**

use strict;
use warnings;
use File::Basename qw( basename );
use COD::CIF::Data::CODFlags qw( has_hkl );
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Tags::DictTags;
use COD::CIF::Tags::COD;
use COD::CIF::Tags::Excluded;
use COD::CIF::Tags::Manage qw( exclude_tag exclude_empty_tags set_tag );
use COD::CIF::Tags::Print qw( print_cif fold );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_names );
use COD::SOptions qw( getOptions );
use COD::SUsage qw( usage options );
use COD::ErrorHandler qw( process_warnings
                          process_errors
                          process_parser_messages
                          report_message );
use COD::ToolsVersion;

my $die_on_error_level = {
    ERROR   => 1,
    WARNING => 0,
    NOTE    => 0
};

my $use_parser = 'c';

my $exclude_empty_tags = 0;
my $preserve_loop_order = 0;
my $record_original_filename = 0;
my $exclude_misspelled_tags = 0;
my $fold_long_fields = 0;
my $fold_title = 0;
my $folding_width = 76;

my $extra_tag_file;
my $original_filename;

my $cif_comment_header; # A header with comments to printed at the
                        # beginning of the output CIF file.

my $cif_header_file; # The name of an external file that holds a CIF
                     # header.

my $data_block_format = "%07d";
my $data_block_nr; # If defined, specifies that data blocks should be
                   # numbered in a COD-like fasion.

#* OPTIONS:
#*   -h, --add-cif-header header_file.cif
#*                     Prepend each of the output files with the comments
#*                     from the beginning of the specified file.
#*
#*   --exclude-empty-tags
#*                     Remove tags that contain only empty values. A value is
#*                     considered empty if it equal to a single question mark ('?')
#*                     or a single period ('.').
#*   --dont-exclude-empty-tags, --no-exclude-empty-tags
#*                     Disable the '--exclude-empty-tags' option (default).
#*
#*   -x, --extra-tag-list tag-list.lst
#*                     Add additional tags to the list of recognised CIF tags.
#*                     These extra tags are presented in a separate file, one
#*                     tag per line.
#*   --exclude-misspelled-tags
#*                     Remove tags that were not present in the recognised
#*                     tag list.
#*   --dont-exclude-misspelled-tags,
#*   --no-exclude-misspelled-tags
#*                     Disable the '--exclude-misspelled-tags' option.
#*
#*   --preserve-loop-order
#*                     Print loops in the same order they appeared in the
#*                     original file.
#*   --use-internal-loop-order
#*                     Disregard original loop order while printing the tags
#*                     (default).
#*
#*   --original-filename data_source.cif
#*                     Use the provided string as the name of the original file.
#*                     (see --record-original-filename).
#*   --clear-original-filename
#*                     Do not use any previously provided strings as the name
#*                     of the original file.
#*   --record-original-filename
#*                     Record the original filename and the original data
#*                     block name for each data block as the '_cod_data_source_*'
#*                     tag values.
#*   --dont-record-original-filename
#*                     Do not record the original filename and the original
#*                     data block name (default).
#*
#*   -S, --start-data-block-number 1234567
#*                     Use the provided number as the start number when
#*                     renaming data blocks (default: '7000001'). Setting
#*                     this option enables the '-R' option.
#*   -d, --data-block-format '%07d'
#*                     Use the provided format to determine new data block
#*                     names from the provided data block numbers
#*                     (default: '%07d').
#*   -R, --renumber-data-blocks
#*                     Rename all data blocks. The new names are constructed
#*                     by taking a start number (specified by the '-S' option),
#*                     applying the string format (specified by the '-d' option)
#*                     and then incrementing the start number for each
#*                     sequential data block.
#*   -R-, --dont-renumber-data-blocks
#*                     Do not rename data blocks (default). Enabling this
#*                     option sets the '-S' option to default value.
#*
#*   --folding-width 76
#*                     Specify the length of the longest unfolded line.
#*
#*   --fold-title
#*   --dont-fold-title
#*                     Folds the title, if longer than folding width.
#*                     Default: off.
#*
#*   --fold-long-fields
#*   --dont-fold-long-fields
#*                     Fold fields, longer than folding width. Default: off.
#*
#*   --use-perl-parser
#*                     Use Perl parser to parse CIF files.
#*   --use-c-parser
#*                     Use C parser to parse CIF files (default).
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
@ARGV = getOptions(
    "-h,--add-cif-header" => \$cif_header_file,

    "-x,--extra-tag-list" => \$extra_tag_file,

    "--exclude-empty-tags"      => sub { $exclude_empty_tags = 1; },
    "--dont-exclude-empty-tags" => sub { $exclude_empty_tags = 0; },
    "--no-exclude-empty-tags"   => sub { $exclude_empty_tags = 0; },

    "--exclude-misspelled-tags"      => sub { $exclude_misspelled_tags = 1; },
    "--dont-exclude-misspelled-tags" => sub { $exclude_misspelled_tags = 0; },
    "--no-exclude-misspelled-tags"   => sub { $exclude_misspelled_tags = 0; },

    "--preserve-loop-order"     => sub { $preserve_loop_order = 1; },
    "--use-internal-loop-order" => sub { $preserve_loop_order = 0; },

    "--original-filename" => sub { $original_filename = get_value();
                                   $record_original_filename = 1 },
    "--clear-original-filename" => sub { undef $original_filename },

    "--record-original-filename"      => sub { $record_original_filename = 1; },
    "--dont-record-original-filename" => sub { $record_original_filename = 0; },

    "-d,--data-block-format"          => \$data_block_format,
    "-S,--start-data-block-number"    => \$data_block_nr,
    "-R,--renumber-data-blocks"       => sub { $data_block_nr = 7000001 },
    "-R-,--dont-renumber-data-blocks" => sub { undef $data_block_nr },

    "--folding-width"         => \$folding_width,
    "--fold-title"            => sub{ $fold_title = 1 },
    "--dont-fold-title"       => sub{ $fold_title = 0 },
    "--fold-long-fields"      => sub{ $fold_long_fields = 1 },
    "--dont-fold-long-fields" => sub{ $fold_long_fields = 0 },

    "--use-perl-parser" => sub { $use_parser = "perl" },
    "--use-c-parser"    => sub { $use_parser = "c" },
    "--options"         => sub { options; exit },
    "--help,--usage"    => sub { usage; exit },
    '--version'         => sub { print 'cod-tools version ',
                                 $COD::ToolsVersion::Version, "\n";
                                 exit }
);

my @extra_tags = ();

eval {
    if( $extra_tag_file ) {
        open( my $extra, '<', "$extra_tag_file" ) or die 'ERROR, '
            . 'could not open extra tag list file for reading -- '
            . lcfirst($!) . "\n";

        @extra_tags = map { s/\s//g; $_ }
                        map {split}
                          grep { /^\s*_/ } <$extra>;

        close( $extra ) or die 'ERROR, '
            . 'error while closing extra tag file after reading -- '
            . lcfirst($!) . "\n";
        ## local $, = "\n"; local $\ = "\n";
         print @extra_tags;
    }
};
if ($@) {
    process_errors( {
      'message'  => $@,
      'program'  => $0,
      'filename' => $extra_tag_file
    }, $die_on_error_level->{'ERROR'} );
};

my %excluded_tags  = map { ($_,$_) } @COD::CIF::Tags::Excluded::tag_list;

my @dictionary_tags = ( @COD::CIF::Tags::DictTags::tag_list,
                        @COD::CIF::Tags::COD::tag_list,
                        @extra_tags );

my %dictionary_tags = map { $_, $_ } @dictionary_tags;

my %cif_tags_lc = map {(lc($_),$_)} @dictionary_tags;

# Reading the header file
eval {
    if( defined $cif_header_file ) {
        open( my $header, '<', "$cif_header_file" ) or die 'ERROR, '
          . 'could not open CIF header file for reading -- '
          . lcfirst($!) . "\n";

        local $/; # enable "slurp" mode: read the whole header file.
        $cif_comment_header = <$header>;

        close( $header ) or die 'ERROR, '
         . 'error while closing CIF header file after reading -- '
         . lcfirst($!) . "\n";

        # The header must not contain CIF 2.0 magic code. For CIF 2.0
        # files the magic code is printed explicitly before the header.
        $cif_comment_header =~ s/^#\\#CIF_2\.0[ \t]*\n//;
    }
};
if ($@) {
    process_errors( {
      'message'  => $@,
      'program'  => $0,
      'filename' => $cif_header_file
    }, $die_on_error_level->{'ERROR'} );
};

binmode( STDOUT, ":encoding(UTF-8)" );
binmode( STDERR, ":encoding(UTF-8)" );

my $filename = shift(@ARGV);

my $options = { parser => $use_parser, no_print   => 1 };
my ($data, $err_count, $messages) = parse_cif($filename, $options);
process_parser_messages( $messages, $die_on_error_level );

if( !@{$data} ) {
    report_message( {
       'program'   => $0,
       'filename'  => $filename,
       'err_level' => 'WARNING',
       'message'   => 'file seems to be empty'
    }, $die_on_error_level->{'WARNING'} );
}

#------------------------------------------------------------------------------

for my $dataset (@$data) {

    my $datablok = $dataset->{values};
    my $dataname = 'data_' . $dataset->{'name'};

    local $SIG{__WARN__} = sub {
        process_warnings( {
            'message'  => @_,
            'program'  => $0,
            'add_pos'  => $dataname,
            'filename' => $filename
        }, $die_on_error_level )
    };

    canonicalize_names( $dataset );

    next if !has_hkl( $dataset );

    eval {
        # Remove empty tags, if requested:
        if( $exclude_empty_tags ) {
            exclude_empty_tags( $dataset );
        }

        # Fold title if requested:
        if( $fold_title ) {
            my $cif_title = join( " ", @{$datablok->{_publ_section_title}} );
            $cif_title =~ s/\n/ /g;
            $datablok->{_publ_section_title} = [
                "\n" . join( "\n", map { " " . $_ }
                              fold( $folding_width - 2,
                                    " +", " ", $cif_title ))
            ];
        }

        # Exclude potentially copyrighted and irrelevant tags
        # unconditionally:
        do {
            my @tag_list = @{$dataset->{tags}};
            for my $tag (@tag_list) {
                if( exists $excluded_tags{$tag} || $tag =~ /^_vrf_/ ) {
                    exclude_tag( $dataset, $tag );
                }
            }
        };

        # Check for misspelled tags:
        for my $tag (@{$dataset->{tags}}) {
            unless( exists $dictionary_tags{$tag} ) {
                warn "WARNING, tag '$tag' is not recognised\n";
            }
        }

        # Exclude the misspelled tags if requested:
        if( $exclude_misspelled_tags ) {
            my @tag_list = @{$dataset->{tags}};
            for my $tag (@tag_list) {
                unless( exists $dictionary_tags{$tag} ) {
                    exclude_tag( $dataset, $tag );
                }
            }
        }

        # Add the data source file name, if requested:
        if( $record_original_filename ) {
            my $basename;
            if( defined $original_filename ) {
                $basename = basename( $original_filename );
            } elsif( defined $filename && $filename ne "-" ) {
                $basename = basename( $filename );
            } else {
                $basename = "?";
            }
            set_tag( $dataset, "_cod_data_source_file", $basename );
            set_tag( $dataset, "_cod_data_source_block", $dataset->{name} );
        }

        # Clean up the resulting CIF data structure:
        for my $excluded_tag (qw( _publ_author_address
                                  _publ_author.address
                                  _publ_author_email
                                  _publ_author.email
                                  _publ_author_footnote
                                  _publ_author.footnote
                                  _publ_author_id_iucr
                                  _publ_author.id_iucr )) {
            if( exists $datablok->{$excluded_tag} ) {
                exclude_tag( $dataset, $excluded_tag );
            }
        }

        # Print out the CIF header if requested:
        if( defined $cif_comment_header ) {
            # Ensure that for CIF v2.0 the magic code comes
            # before the CIF comment header:
            if( exists $dataset->{cifversion} &&
                       $dataset->{cifversion}{major} == 2 ) {
                printf( "#\\#CIF_%d.%d\n",
                        $dataset->{cifversion}{major},
                        $dataset->{cifversion}{minor} );
            }
            print $cif_comment_header;
        }

        # Calculate the data block name:
        if( defined $data_block_nr ) {
            $dataset->{name} = sprintf $data_block_format, $data_block_nr;
            $data_block_nr ++;
        }

        # Print out requested tags:
        print_cif( $dataset, { 
            exclude_misspelled_tags => $exclude_misspelled_tags,
            preserve_loop_order => $preserve_loop_order,
            fold_long_fields => $fold_long_fields,
            folding_width => $folding_width,
            dictionary_tags => \%dictionary_tags,
            dictionary_tag_list => \@dictionary_tags,
        } );
    };
    if ($@) {
        process_errors( {
            'message'       => $@,
            'program'       => $0,
            'filename'      => $filename,
            'add_pos'       => $dataname
        }, $die_on_error_level->{'ERROR'} );
    };
}
