#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2021-04-28 19:35:53 +0300 (Wed, 28 Apr 2021) $
#$Revision: 8738 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.3.0/scripts/cif_fix_values $
#------------------------------------------------------------------------------
#*
#* Correct various CIF file values and output the made changes into the
#* standard I/O streams. The script is capable of making these corrections:
#*   - Converting temperature from degrees Celsius to kelvins, removing
#*     temperature units of measurement, processing other undefined
#*     temperature values;
#*   - Fixing misspelled values by using the provided replacement value file;
#*   - Fixing enumeration values in CIF file against CIF dictionaries;
#*   - Correcting values of '_exptl_crystal_density_meas' data item.
#*
#* All described corrections are enabled by default, but can be disabled by
#* prefixing the corresponding options by '--do-not', '--dont' or '--no'
#* (for example, '--no-fix-temperature').
#*
#* USAGE:
#*    $0 --options input1.cif input*.cif
#**

use strict;
use warnings;
use List::MoreUtils qw( any );
use COD::CIF::ChangeLog qw( summarise_messages
                            append_changelog_to_single_item );
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_all_names );
use COD::CIF::Tags::DictTags;
use COD::CIF::Tags::COD;
use COD::CIF::Tags::TCOD;
use COD::CIF::Tags::DFT;
use COD::CIF::Tags::Print qw( print_cif pack_precision );
use COD::CIF::Tags::Manage qw( set_tag rename_tag );
use COD::SOptions qw( getOptions get_value );
use COD::SUsage qw( usage options );
use COD::ErrorHandler qw( process_warnings process_errors
                          process_parser_messages report_message );
use COD::Precision qw( unpack_precision );
use COD::ToolsVersion qw( get_version_string );

# Subversion keyword string identifying the revision of this script.
my $Id = '$Id: cif_fix_values 8738 2021-04-28 16:35:53Z antanas $';
my $keep_tag_order = 0;

# Switches of the individual corrections. Every correction starts out
# enabled (1); the command line options and fix_none() toggle the entries.
my %fix = map { $_ => 1 } qw(
    misspelt_values
    temperature
    exptl_crystal_density_meas
    refine_ls_weighting_scheme
    atom_sites_solution
    dois
    enums
    treat_as_set
);

##
# Switches off every correction registered in the %fix table. Called by
# the '--fix-only-*' options before they re-enable a single correction.
##
sub fix_none
{
    foreach my $fix_name ( keys %fix ) {
        $fix{$fix_name} = 0;
    }
    return;
}

# Name of the replacement value file; set by the '--fix-misspelled' and
# '--fix-only-misspelled' options, undefined otherwise.
my $replacement_file;

# Per message level flags that are collected into $die_on_error_level.
my ( $die_on_errors, $die_on_warnings, $die_on_notes ) = ( 1, 0, 0 );

# CIF parser selection: 'c' (default) or 'perl'.
my $use_parser = 'c';

# CIF dictionary files given via the '-d', '-D' options.
my @dictionaries;

# Built-in enumeration table: maps a CIF data name to the list of values
# that the data item is permitted to take. According to the description of
# the '--fix-enums' option this table is derived from the CIF Core
# dictionary and is used when no dictionaries are given on the command line.
# NOTE(review): the code consuming the table is not located next to this
# definition -- confirm there how the value lists are matched.
my %default_enums = (
    '_atom_site_adp_type'                    => [ 'Uani', 'Uiso', 'Uovl',
                                                  'Umpe', 'Bani', 'Biso', 'Bovl' ],
    '_atom_site_calc_flag'                   => [ 'd', 'calc', 'c', 'dum' ],
    '_atom_site_refinement_flags_adp'        => [ '.', 'T', 'U', 'TU' ],
    '_atom_site_refinement_flags_occupancy'  => [ '.', 'P' ],
    '_atom_site_refinement_flags_posn'       => [ '.', 'D', 'G', 'R', 'S',
                                                  'DG', 'DR', 'DS', 'GR',
                                                  'GS', 'RS', 'DGR', 'DGS',
                                                  'DRS', 'GRS', 'DGRS' ],
    '_atom_site_refinement_flags'            => [ '.', 'S', 'G', 'R',
                                                  'D', 'T', 'U', 'P' ],
    '_atom_sites_solution_hydrogens'         => [ 'difmap', 'vecmap', 'heavy',
                                                  'direct', 'geom', 'disper',
                                                  'isomor', 'mixed', 'notdet',
                                                  'dual', 'iterative', 'other' ],
    '_atom_sites_solution_primary'           => [ 'difmap', 'vecmap', 'heavy',
                                                  'direct', 'geom', 'disper',
                                                  'isomor', 'notdet', 'dual',
                                                  'iterative', 'other' ],
    '_atom_sites_solution_secondary'         => [ 'difmap', 'vecmap', 'heavy',
                                                  'direct', 'geom', 'disper',
                                                  'isomor', 'notdet', 'dual',
                                                  'iterative', 'other' ],
    '_atom_site_thermal_displace_type'       => [ 'Uani', 'Uiso', 'Uovl', 'Umpe',
                                                  'Bani', 'Biso', 'Bovl' ],
    '_chemical_absolute_configuration'       => [ 'rm', 'ad', 'rmad',
                                                  'syn', 'unk', '.' ],
    '_chemical_conn_bond_type'               => [ 'sing', 'doub', 'trip', 'quad',
                                                  'arom', 'poly', 'delo', 'pi' ],
    '_chemical_enantioexcess_bulk_technique' => [ 'OA', 'CD', 'EC', 'other' ],
    '_chemical_enantioexcess_crystal_technique' => [ 'CD', 'EC', 'other' ],
    '_citation_coordinate_linkage'           => [ 'no', 'n', 'yes', 'y' ],
    '_diffrn_radiation_probe'                => [ 'x-ray', 'neutron',
                                                  'electron', 'gamma' ],
    '_diffrn_radiation_wavelength_determination' => [ 'fundamental',
                                                      'estimated', 'refined' ],
    '_diffrn_radiation_xray_symbol'          => [ 'K-L~3~', 'K-L~2~',
                                                  'K-M~3~', 'K-L~2,3~' ],
    '_diffrn_refln_scan_mode_backgd'         => [ 'st', 'mo' ],
    '_diffrn_refln_scan_mode'                => [ 'om', 'ot', 'q' ],
    # Chemical element symbols from H to Lr.
    '_diffrn_source_target'                  => [ 'H', 'He', 'Li', 'Be', 'B',
                                                  'C', 'N', 'O', 'F', 'Ne',
                                                  'Na', 'Mg', 'Al', 'Si', 'P',
                                                  'S', 'Cl', 'Ar', 'K', 'Ca',
                                                  'Sc', 'Ti', 'V', 'Cr', 'Mn',
                                                  'Fe', 'Co', 'Ni', 'Cu', 'Zn',
                                                  'Ga', 'Ge', 'As', 'Se', 'Br',
                                                  'Kr', 'Rb', 'Sr', 'Y', 'Zr',
                                                  'Nb', 'Mo', 'Tc', 'Ru', 'Rh',
                                                  'Pd', 'Ag', 'Cd', 'In', 'Sn',
                                                  'Sb', 'Te', 'I', 'Xe', 'Cs',
                                                  'Ba', 'La', 'Ce', 'Pr', 'Nd',
                                                  'Pm', 'Sm', 'Eu', 'Gd', 'Tb',
                                                  'Dy', 'Ho', 'Er', 'Tm', 'Yb',
                                                  'Lu', 'Hf', 'Ta', 'W', 'Re',
                                                  'Os', 'Ir', 'Pt', 'Au', 'Hg',
                                                  'Tl', 'Pb', 'Bi', 'Po', 'At',
                                                  'Rn', 'Fr', 'Ra', 'Ac', 'Th',
                                                  'Pa', 'U', 'Np', 'Pu', 'Am',
                                                  'Cm', 'Bk', 'Cf', 'Es', 'Fm',
                                                  'Md', 'No', 'Lr' ],
    '_exptl_absorpt_correction_type'         => [
        'analytical', 'cylinder', 'empirical', 'gaussian', 'integration',
        'multi-scan', 'none', 'numerical', 'psi-scan', 'refdelf', 'sphere'
    ],
    '_exptl_crystal_colour_lustre'           => [ 'metallic', 'dull', 'clear' ],
    '_exptl_crystal_colour_modifier'         => [ 'light', 'dark', 'whitish',
                                                  'blackish', 'grayish', 'brownish',
                                                  'reddish', 'pinkish', 'orangish',
                                                  'yellowish', 'greenish', 'bluish' ],
    '_exptl_crystal_colour_primary'          => [ 'colourless', 'white', 'black',
                                                  'gray', 'brown', 'red', 'pink',
                                                  'orange', 'yellow', 'green',
                                                  'blue', 'violet' ],
    '_geom_angle_publ_flag'                  => [ 'no', 'n', 'yes', 'y' ],
    '_geom_bond_publ_flag'                   => [ 'no', 'n', 'yes', 'y' ],
    '_geom_contact_publ_flag'                => [ 'no', 'n', 'yes', 'y' ],
    '_geom_hbond_publ_flag'                  => [ 'no', 'n', 'yes', 'y' ],
    '_geom_torsion_publ_flag'                => [ 'no', 'n', 'yes', 'y' ],
    '_publ_body_element'                     => [ 'section', 'subsection',
                                                  'subsubsection', 'appendix',
                                                  'footnote' ],
    '_publ_body_format'                      => [ 'ascii', 'cif', 'latex', 'rtf',
                                                  'sgml', 'tex', 'troff' ],
    '_publ_manuscript_incl_extra_defn'       => [ 'no', 'n', 'yes', 'y' ],
    '_publ_requested_category'               => [
        'AD', 'CI', 'CM', 'CO', 'EI', 'EM', 'EO', 'FA', 'FI', 'FM', 'FO',
        'GI', 'GM', 'GO', 'HI', 'HM', 'HO', 'QI', 'QM', 'QO', 'SC'
    ],
    '_refine_ls_hydrogen_treatment'          => [
        'refall', 'refxyz', 'refU', 'noref', 'constr', 'hetero', 'heteroxyz',
        'heteroU', 'heteronoref', 'hetero-mixed', 'heteroxyz-mixed',
        'heteroU-mixed', 'heteronoref-mixed', 'mixed', 'undef'
    ],
    '_refine_ls_matrix_type'                 => [
        'full', 'fullcycle', 'atomblock', 'userblock', 'diagonal', 'sparse'
    ],
    '_refine_ls_structure_factor_coef'       => [ 'F', 'Fsqd', 'Inet' ],
    '_refine_ls_weighting_scheme'            => [ 'sigma', 'unit', 'calc' ],
    '_refln_include_status'                  => [ 'o', '<', '-', 'x', 'h', 'l' ],
    '_refln_observed_status'                 => [ 'o', '<', '-', 'x', 'h', 'l' ],
    '_refln_refinement_status'               => [ 'incl', 'excl', 'extn' ],
    '_space_group_crystal_system'            => [
        'triclinic', 'monoclinic', 'orthorhombic',
        'tetragonal', 'trigonal', 'hexagonal', 'cubic'
    ],
    '_symmetry_cell_setting'                 => [
        'triclinic', 'monoclinic', 'orthorhombic', 'tetragonal',
        'rhombohedral', 'trigonal', 'hexagonal', 'cubic'
    ],
);
# Reference to the built-in table, for code that expects a hash reference.
my $default_enums = \%default_enums;

# Data items whose enumeration values are treated as a set by default
# (see the '--treat-as-set' option). Presumably a value of such an item may
# combine several of the listed flags -- confirm against the code that
# uses this list.
my @default_set_tags = ( '_atom_site_refinement_flags' );

#* OPTIONS:
#*   --fix-temperature
#*                     Corrects temperature values which have units specified
#*                     or converts between degrees Celsius and kelvins.
#*                     Changes 'room/ambiante temperature' to the appropriate
#*                     numeric value. Changes other undefined values
#*                     ('none', 'not given') to '?' symbol.
#*
#*   --fix-misspelled replacement_value.lst
#*                     Corrects misspelled values in the CIF file. Provide a file
#*                     for the check.
#*
#*   --fix-dois
#*                     Removes URL or DOI: prefixes from values of DOI data
#*                     items.
#*
#*   --fix-enums
#*                     Corrects enumeration values in the CIF against CIF
#*                     dictionaries. Provide dictionaries using option
#*                     --dictionaries (or -d), otherwise script uses a
#*                     built-in table derived from the CIF Core dictionary.
#*
#*   --treat-as-set
#*                     Treats certain enumeration values as set data type in
#*                     the CIF against CIF dictionaries (default). Provide
#*                     dictionaries using --dictionaries (or -d) option, otherwise
#*                     script uses a built-in table derived from the CIF Core
#*                     dictionary.
#*
#*   --fix-density-meas
#*                     Corrects value of data item '_exptl_crystal_density_meas'.
#*                     Values which are perceived as 'not measured' are changed
#*                     to '?'. The correct units are deleted. The new data item
#*                     is created if there is information about temperature.
#*
#*   --fix-weighting-scheme
#*                     Changes the value of the data item '_refine_ls_weighting_scheme'
#*                     to the value 'calc' if it contains details which seem to
#*                     have a formula, and creates new data item
#*                     '_refine_ls_weighting_details' to store that information.
#*                     If value is equal to number one, then it is changed to
#*                     the enumeration value 'unit'.
#*
#*   --fix-atom-sites-solution
#*                     Corrects the most frequent mistakes in the values of
#*                     data items '_atom_sites_solution_primary',
#*                     '_atom_sites_solution_secondary' and
#*                     '_atom_sites_solution_hydrogens'.
#*
#*   -d, --dictionaries 'cif_core.dic,cif_cod.dic'
#*                     A list of CIF dictionary files (according to DDL1)
#*                     to be used in CIF file validation. List elements
#*                     are separated either by ',' or by ' '. To include
#*                     dictionaries with filenames containing these symbols,
#*                     the --add-dictionary option is used.
#*
#*   -D, --add-dictionary 'cif new dictionary.dic'
#*                     Add additional CIF dictionary to the list.
#*
#*   --clear-dictionaries
#*                     Remove all CIF dictionaries from the list.
#*
#*   --use-perl-parser
#*   --use-c-parser
#*                     Specify parser to parse CIF files (default: C parser).
#*
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
# Command line processing. Most corrections 'X' are controlled by a family
# of options:
#   --fix-X          enables the correction;
#   --fix-only-X     calls fix_none(), which switches off every entry of
#                    %fix (including 'treat_as_set'), and then enables X;
#   --no-fix-X, --do-not-fix-X, --dont-fix-X
#                    disable the correction.
# Alternative spellings of one option are passed to getOptions() as a single
# comma-separated key. The list returned by getOptions() replaces @ARGV --
# presumably these are the remaining non-option arguments (the input files);
# confirm against COD::SOptions.
@ARGV = getOptions(
    # The misspelled value fix needs a replacement file, taken from
    # get_value() (presumably the argument that follows the option).
    '--fix-misspelled'
        => sub{ $fix{'misspelt_values'} = 1;
                $replacement_file = get_value() },
    '--fix-only-misspelled'
        => sub{ fix_none();
                $fix{'misspelt_values'} = 1;
                $replacement_file = get_value() },
    # Disabling the fix also forgets a previously given replacement file.
    '--no-fix-misspelled,' .
    '--do-not-fix-misspelled,' .
    '--dont-fix-misspelled'
        => sub{ $fix{'misspelt_values'} = 0;
                undef $replacement_file },

    '--fix-temperature'
        => sub { $fix{'temperature'} = 1 },
    '--fix-only-temperature'
        => sub{ fix_none();
                $fix{'temperature'} = 1 },
    '--no-fix-temperature,' .
    '--do-not-fix-temperature,' .
    '--dont-fix-temperature'
        => sub{ $fix{'temperature'} = 0 },

    '--fix-dois'
        => sub{ $fix{'dois'} = 1 },
    '--fix-only-dois'
        => sub{ fix_none();
                $fix{'dois'} = 1 },
    '--no-fix-dois,' .
    '--do-not-fix-dois,' .
    '--dont-fix-dois'
        => sub{ $fix{'dois'} = 0 },

    '--fix-enums'
        => sub{ $fix{'enums'} = 1 },
    '--fix-only-enums'
        => sub{ fix_none();
                $fix{'enums'} = 1 },
    '--no-fix-enums,' .
    '--do-not-fix-enums,' .
    '--dont-fix-enums'
        => sub{ $fix{'enums'} = 0 },

    # 'treat_as_set' is a modifier rather than a correction of its own,
    # hence there is no '--fix-only' variant for it.
    '--treat-as-set'
        => sub{ $fix{'treat_as_set'} = 1 },
    '--no-treat-as-set,' .
    '--do-not-treat-as-set,' .
    '--dont-treat-as-set'
        => sub{ $fix{'treat_as_set'} = 0 },

    '--fix-density-meas'
        => sub{ $fix{'exptl_crystal_density_meas'} = 1 },
    '--fix-only-density-meas'
        => sub{ fix_none();
                $fix{'exptl_crystal_density_meas'} = 1 },
    '--no-fix-density-meas,' .
    '--do-not-fix-density-meas,' .
    '--dont-fix-density-meas'
        => sub{ $fix{'exptl_crystal_density_meas'} = 0 },

    '--fix-weighting-scheme'
        => sub{ $fix{'refine_ls_weighting_scheme'} = 1 },
    '--fix-only-weighting-scheme'
        => sub{ fix_none();
                $fix{'refine_ls_weighting_scheme'} = 1 },
    '--no-fix-weighting-scheme,' .
    '--do-not-fix-weighting-scheme,' .
    '--dont-fix-weighting-scheme'
        => sub{ $fix{'refine_ls_weighting_scheme'} = 0 },

    '--fix-atom-sites-solution'
        => sub{ $fix{'atom_sites_solution'} = 1 },
    '--fix-only-atom-sites-solution'
        => sub{ fix_none();
                $fix{'atom_sites_solution'} = 1 },
    '--no-fix-atom-sites-solution,' .
    '--do-not-fix-atom-sites-solution,' .
    '--dont-fix-atom-sites-solution'
        => sub{ $fix{'atom_sites_solution'} = 0 },

    # '-d' replaces the whole dictionary list with the names obtained by
    # splitting its argument on commas or whitespace; '-D' appends its
    # argument verbatim, so such a file name may contain ',' or ' '.
    '-d,--dictionaries'    => sub{ @dictionaries = split m/,|\s+/, get_value() },
    '-D,--add-dictionary'  => sub{ push @dictionaries, get_value() },
    '--clear-dictionaries' => sub{ @dictionaries = () },

    '--use-perl-parser'       => sub { $use_parser = 'perl' },
    '--use-c-parser'          => sub { $use_parser = 'c' },

    # Informational options: print the requested text and stop.
    '--options'      => sub { options; exit },
    '--help,--usage' => sub { usage; exit },
    '--version'      => sub { print get_version_string(), "\n"; exit }
);

# Message level => flag table assembled from the $die_on_* settings.
my $die_on_error_level = {};
@{$die_on_error_level}{ qw( ERROR WARNING NOTE ) } =
    ( $die_on_errors, $die_on_warnings, $die_on_notes );

# Both standard output streams are written as UTF-8.
for my $stream ( \*STDOUT, \*STDERR ) {
    binmode $stream, ':encoding(UTF-8)';
}

##
# Extracts dictionary provenance details from a DDL1 dictionary.
#
# Only the first data block named 'on_this_dictionary' that contains a
# 'values' entry is consulted. The dictionary structure itself is left
# untouched: a data item that is absent from the block is reported as an
# undefined value without being created in the input structure.
#
# @param $dict
#       Reference to DDL1 dictionary structure as returned by the
#       COD::CIF::Parser.
# @return $dict_provenance
#       Reference to dictionary provenance hash. The hash is empty if no
#       suitable data block was found; otherwise all three keys exist and
#       the values of missing data items are undefined. Example of the
#       returned structure:
#       {
#        'name'    => 'cif_core.dic'
#               name of the dictionary
#        'version' => '2.4.5'
#               dictionary version
#        'update'  => '2014-11-21'
#               date of the last dictionary update
#       }
##
sub get_dictionary_provenance
{
    my ( $dict ) = @_;

    # Provenance field => data name holding the corresponding value.
    my %provenance_tags = (
        'name'    => '_dictionary_name',
        'version' => '_dictionary_version',
        'update'  => '_dictionary_update',
    );

    my %dict_provenance;
    for my $dict_item ( @{$dict} ) {
        my $block_name = $dict_item->{'name'};
        # Guard against unnamed blocks to avoid 'uninitialized' warnings.
        next if !defined $block_name;
        next if $block_name ne 'on_this_dictionary';
        next if !exists $dict_item->{'values'};

        my $values = $dict_item->{'values'};
        for my $field ( keys %provenance_tags ) {
            # Fetch the value list first: indexing a missing hash entry
            # directly ($values->{...}[0]) would autovivify an empty
            # array inside the caller's dictionary structure.
            my $tag_values = $values->{ $provenance_tags{$field} };
            $dict_provenance{$field} =
                defined $tag_values ? $tag_values->[0] : undef;
        }
        last;
    }

    return \%dict_provenance;
}

##
# Renders a dictionary provenance hash as a human-readable phrase.
# @param $dict_provenance
#       Reference to dictionary provenance hash as returned by the
#       get_dictionary_provenance() subroutine. In addition to the
#       fields populated by the subroutine, the 'file' field is also
#       recognised. All fields are optional; undefined fields are left
#       out of the phrase. Example of the accepted structure:
#       {
#        'file'    => '/home/user/dictionaries/cif_core.dic'
#               name of the dictionary file
#        'name'    => 'cif_core.dic'
#               name of the dictionary
#        'version' => '2.4.5'
#               dictionary version
#        'update'  => '2014-11-21'
#               date of the last dictionary update
#       }
# @return $provenance
#       A formatted string containing the dictionary provenance
#       information, e.g. "the /home/user/dictionaries/cif_core.dic
#       dictionary named 'cif_core.dic' version 2.4.5 last updated on
#       2014-11-21". Without the 'file' field the string starts with
#       ' dictionary' (note the leading space).
##
sub sprintf_dictionary_provenance
{
    my ( $dict_provenance ) = @_;

    # TODO: it should be discussed if we really want to output the filename
    # of the dictionary since it might potentially be more confusing than
    # useful
    my $provenance = '';
    if ( defined $dict_provenance->{'file'} ) {
        $provenance .= "the $dict_provenance->{'file'}";
    }
    $provenance .= ' dictionary';

    if ( defined $dict_provenance->{'name'} ) {
        $provenance .= ' named \'' . $dict_provenance->{'name'} . '\'';
    }
    if ( defined $dict_provenance->{'version'} ) {
        $provenance .= ' version ' . $dict_provenance->{'version'};
    }
    if ( defined $dict_provenance->{'update'} ) {
        $provenance .= ' last updated on ' . $dict_provenance->{'update'};
    }

    return $provenance;
}

#
# cif_fix_values specific subroutines:
#

##
# Looks up the enumeration values that a given value could be replaced
# with. Values are compared case-insensitively after removing hyphens,
# underscores and whitespace.
# @param $cif_value
#       Value to be checked.
# @param $dict_value_list
#       Reference to an array of permitted enumeration values.
# @return
#       - an empty list, if the value is identical to one of the
#         permitted values (no replacement is needed);
#       - a single-element list with the matching permitted value, if
#         exactly one permitted value matches after normalisation;
#       - the full list of permitted values otherwise (no match or an
#         ambiguous match).
##
sub replacement_candidates($$)
{
    my( $cif_value, $dict_value_list ) = @_;

    # The normalised form of the checked value does not depend on the
    # dictionary value and is therefore computed only once.
    my $normalised_cif_value = lc $cif_value;
    $normalised_cif_value =~ s/[-_\s]//g;

    my @candidate_list = ();
    foreach my $dict_tag_value ( @{$dict_value_list} ) {
        # An exact match anywhere in the list means the value is valid.
        if( $cif_value eq $dict_tag_value ) {
            return ();
        }

        # A plain case-insensitive comparison is not needed in addition
        # to this one: values that differ only by case are also equal
        # once the separators have been removed.
        my $normalised_dict_value = lc $dict_tag_value;
        $normalised_dict_value =~ s/[-_\s]//g;
        if( $normalised_cif_value eq $normalised_dict_value ) {
            push @candidate_list, $dict_tag_value;
        }
    }

    if( scalar( @candidate_list ) == 1 ) {
        return @candidate_list;
    } else {
        return @{$dict_value_list};
    }
}

##
# Replaces misspelled data item values according to a replacement table.
# The values are modified in place. Values '.' and '?' are never touched.
# @param $dataset
#       Reference to a data block structure; its 'tags' list and 'values'
#       hash are used.
# @param $value_spelling
#       Reference to a hash that maps a data name to an array of
#       [ $regex, $replacement ] pairs. A pair is applied when the regular
#       expression matches the whole value (case-insensitively, in the
#       extended /x syntax); the value is then replaced by $replacement.
# @return
#       List of summarised messages describing the replacements. Each of
#       them is also emitted as a 'NOTE, ...' warning. A value changed by
#       more than one pair produces a WARNING instead of a message.
##
sub fix_misspelled_values($$) {
    my( $dataset, $value_spelling ) = @_;

    my @notes;
    for my $tag ( @{$dataset->{'tags'}} ) {
        next if !exists $value_spelling->{$tag};

        # $value aliases the stored element, so substitutions below
        # update the data block directly.
        for my $value ( @{ $dataset->{'values'}{$tag} } ) {
            next if $value =~ /^[.?]$/;

            my $original_value = $value;
            my $substitutions  = 0;
            for my $rule ( @{ $value_spelling->{$tag} } ) {
                my ( $regex, $replacement ) = ( $rule->[0], $rule->[1] );

                # The substitution itself reports whether it matched.
                if ( $value =~ s/^ $regex $/$replacement/xi ) {
                    $substitutions++;
                }
            }

            if( $substitutions == 1 ) {
                push @notes,
                     "data item '$tag' value '$original_value' " .
                     "was replaced with the value '$value' " .
                     'as specified in the replacement file ' .
                     "'$replacement_file'";
            } elsif( $substitutions > 1 ) {
                warn 'WARNING, more than 1 different substitution '
                   . "was applied on data item '$tag' value '$original_value'\n";
            }
        }
    }

    my $summarised_notes = summarise_messages( \@notes );
    warn "NOTE, $_\n" for @{$summarised_notes};

    return @{$summarised_notes};
}

# Building blocks of the regular expressions used by the value fixing
# subroutines. The string-valued ones are interpolated into larger patterns.

# Number without a minus sign: optional '+', then digits with an optional
# fraction (or a bare fraction such as '.5'), then an optional exponent.
my $number_pos =
    '(?:\+?' .
    '(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)' .
    '(?:[eE][-+]?[0-9]+)?)';
# Same as $number_pos, but with a mandatory leading '-'.
my $number_neg =
    '(?:\-' .
    '(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)' .
    '(?:[eE][-+]?[0-9]+)?)';
# Kelvin unit designator: 'K', 'Kelvi' or 'Kelvin' in any letter case
# (the second alternative repeats the bare 'K' case).
my $temp_K  =
    '(?:(?i:K(?i:elvin?)?)|(?i:K))';
# Celsius unit designators in various spellings, e.g. 'C', 'Celsius',
# 'Celcius', 'deg C', 'degrees', 'oC', '\oC', '/o', '%C', '0C',
# 'centigrade', 'degrees centigrade'.
my $temp_C  =
    '(?:(?i:deg\.?(?:rees?)?)?\s*(?i:C(?i:el[sc]ius)?)|' .
    '(?i:Deg\.?(?:rees?)?\s*[Cc]?)|' .
    '(?:(?:(?i:[\\\/]+o)|(?i:O)|(?:[\\\/]*\%))' .
    '(?:[-_\s]*)(?i:C\.?)?)|' .
    '(?:[ ]*0(?i:C\.?))|' .
    '(?i:(?i:degrees?)?(?:[-_\s]*)centigrades?))';
# Room temperature descriptions: "temp\'erature ambiante" (with the
# backslash-apostrophe accent markup), 'room' or 'ambient'/'ambiant(e)'
# optionally followed by 'temp', 'temp.' or 'temperature', and 'rt'.
my $temp_RT =
    '(?:(?:(?i:temp\\\\\'erature)\s*ambi[ae]nte?)|' .
    '(?:(?:(?i:room)|(?i:amb(?i:i[ae]nte?)))' .
    '\s*(?i:tem[pt](?:\.|erature)?)?)|(?i:rt))';
# Values that carry no temperature at all: 'y'/'yes', 'n'/'no'/'none',
# 'unknown', 'not measured/important/determined/available/relevant/
# recorded' (with spelling variants), 'N/A', 'NA', 'N/D', 'ND', a single
# '-', or nothing but whitespace (including the empty string).
my $temp_undef =
    '(?:(?i:ye?s?)|(?i:no?(?i:ne)?)|(?i:unknown)|' .
    '(?i:not?\s*(?:(?i:meas*ure?d?)|(?i:important)|' .
    '(?i:determine?d?)|(?i:avai?lable?)|(?i:relevant)|' .
    '(?i:recorde?d?)))|(?i:N\/?(?i:[DA]))|\s*|[-])';
# Unsigned decimal number used for the precision part of a value.
my $sigma = '(?:[0-9]+\.[0-9]+|[0-9]+\.|\.[0-9]+|[0-9]+)';
# Decomposition marker: 'd', 'd.', 'dec', 'dec.', 'decomp', 'decompose(d)',
# 'decomposition', ..., optionally followed by 'at'. The space after
# '(?i:' is ignored wherever the pattern is interpolated under /x.
my $temp_dec =
    '(?i: d\.?(?i:ec\.?)?' .
    '(?i:omp\.?)?(?i:os(?i:e[ds]?|ition))?\s*(?i:at)?)';
# 'at' optionally preceded by 'measured'.
my $measured_at = qr/
    (?: (?:measured)? \s* at )
    /ix;
# Spellings (including frequent typos and a few non-English forms) of
# "not measured" and similar placeholders. Judging by the name and the
# '--fix-density-meas' option description these mark values perceived as
# 'not measured' -- confirm against the code that uses the pattern.
my $value_not_measured = qr/
    (?:
        not[ _]measured|none|na|n\.a\.|n\/[ad]|
        -+|no|nm|n|no[ntr]|
            (?:[mn]oi?[ntr]|nit|npt|no|pas|nicht)
            [- _']+
            (?:[mn]ea?s?a?urt?e*[ds]|meas|measurement|
            mes(?:ua|ou|asu)red|meas(?:e|y|ua|ou|us|ru|hu)red|
            meausred|measused|measure|measrued|measued|mesur\\'ee|
            performed|determined|applied|available|done|gemessen)
        |
        notmeasured|not[ ]being[ ]measured|
        unknown|\?none|mone|nnoe|nonne|noref|nonr|
        not[ ]measured'\?|\?|\?"|0\?|\?0|\?\/|'[ ]'|\/
    )
    /ix;
# Unit notations listed as equivalent to Mg m^-3: 'Mg m-3', 'Mg.m^-3',
# 'Mg/m3', 'g cm-3', 'g/cm^3' and the like. The leading '(?!)' never
# matches; it only allows every real alternative to start with '|'.
my $units_equiv_Mg_per_m3 = qr/
    (?: (?!)
    |   Mg [ .] m  \^? -3 \^?
    |   Mg \/   m  \^?  3 \^?
    |   g  [ .] cm \^? -3 \^?
    |   g  \/   cm \^?  3 \^?
    )
    /x;
# 'unit weight' or 'unit weights', optionally followed by 'applied'.
my $unit_weights = '(?i:unit\s+weights?\s*(?:applied)?)';

##
# Normalises the values of single-valued temperature data items of a data
# block so that they become plain numbers expressed in kelvins.
#
# Each value is tried against a sequence of patterns. The first pattern
# that applies rewrites the value in place and, when the value carries a
# decomposition marker, renames the data item to
# '_chemical_temperature_decomposition'. Values that could not be
# interpreted are reported with a WARNING and left unchanged. Data items
# that have more or less than one value are skipped.
#
# @param $dataset
#       Reference to a data block structure. Its 'values' hash is read
#       and modified; the structure is also handed over to rename_tag()
#       and fix_temperature_lt_gt().
# @return
#       List of messages describing the applied changes. Each of them is
#       also emitted as a 'NOTE, ...' warning.
##
sub fix_temperature($)
{
    my( $dataset ) = @_;

    my @notes;
    my $values = $dataset->{values};
    my @temp_tags = qw ( _cell_measurement_temperature
                         _chemical_temperature_decomposition
                         _chemical_temperature_sublimation
                         _diffrn_ambient_temperature
                         _exptl_crystal_density_meas_temp
                         _chemical_melting_point );
    for my $tag( @temp_tags ) {
        next if !exists $values->{$tag};
        # NOTE: even though some of the temperature data items are allowed
        # to appear in a looped context processing of such instances is
        # quite complex since data item renaming can potentially take
        # place. Overall, looped temperature data item values are quite
        # rare and are better handled on the case by case basis
        next if @{$values->{$tag}} != 1;

        # Only the single value is processed, so the index is always 0.
        my $i = 0;
        my $temperature = $values->{$tag}[$i];
        # Trimmed copy used for matching; the original text is kept for
        # the messages.
        my $temperature_modif = $temperature;
        $temperature_modif =~ s/^\s+|^\n+|\n+$|\s+$//g;

        # Placeholder values and numbers without a minus sign (optionally
        # followed by parenthesised digits) need no fixing.
        next if $temperature_modif =~ /^[.?]$/;
        next if $temperature_modif =~ /^($number_pos|$number_pos\([0-9]+\))$/;

        # Number with an optional kelvin unit and a decomposition marker
        # before ($1) or after ($4) it: keep the number, rename the item.
        # Without a marker the value is left to the checks below.
        if( $temperature_modif =~
            /^ \(?($temp_dec)?\)?(?:[-_,\s]*)
                    ($number_pos)\(?($sigma)?\)?
                    (?:[-_\s]*)$temp_K?(?:[-_,\s]*)
                    \(?($temp_dec)?\)?$
                    /x )  {
            if( $1 || $4 ) {
                my $old_tag = $tag;
                my $new_tag = '_chemical_temperature_decomposition';
                $values->{$tag}[$i] = pack_precision( $2 , $3 );
                my $new_val = $values->{$tag}[$i];
                rename_tag( $dataset, $old_tag, $new_tag );
                push @notes,
                     "data name '$old_tag' was changed to '$new_tag' " .
                     "since the value had been '$temperature'. " .
                     "The value '$temperature' was changed to " .
                     "'$new_val'";
                next;
            }
        }

        # Values starting with '>' or '<' are delegated; when the helper
        # reports no change the remaining checks are still attempted.
        if ( $temperature_modif =~ /^[><]/ ) {
            my $lt_gt_notes = fix_temperature_lt_gt( $dataset, $tag, $i );
            if ( @{$lt_gt_notes} ) {
                push @notes, @{$lt_gt_notes};
                next;
            }
        }

        # Number followed by a precision that may be separated from it by
        # whitespace, an opening parenthesis and a '+', '+-' or '+/-' sign.
        if( $temperature_modif =~
            /^
                ($number_pos)\s*(?:\()?
                [\s]*(?:\+|\+\/?\-)?
                [\s]*($sigma)(?:\))?$
                /x ) {
            my $check_value = $temperature_modif;
            $values->{$tag}[$i] =
                pack_precision( $1, $2 );
            my $new_val = $values->{$tag}[$i];
            # If only whitespace differs between the old and the new value
            # the change is reported as a mere reformatting.
            $check_value =~ s/\s+//g;
            if( $check_value eq  $new_val ) {
                push @notes,
                     "data item '$tag' value '$temperature' was changed to " .
                    "'$new_val' -- the value was reformatted";
                next;
            } else {
                push @notes,
                     "data item '$tag' value '$temperature' was changed to " .
                     "'$new_val' -- precision was estimated";
                next;
            }
        }

        # Number (with an optional precision) followed by a kelvin unit
        # designator: the unit is dropped.
        if( $temperature_modif =~
            /^
                ($number_pos)(?:\()?($sigma)?
                (?:\))?(?:[-_\s]*)\(?$temp_K\)?$
                /x ) {
            $values->{$tag}[$i] =
                pack_precision( $1, $2 );
            my $new_val = $values->{$tag}[$i];
            push @notes,
                 "data item '$tag' value '$temperature' was changed to " .
                 "'$new_val' -- the value should be numeric " .
                 'and without a unit designator';
            next;
        }

        # Negative number: the Celsius unit is optional in the pattern, so
        # any negative value is converted from degrees Celsius.
        if( $temperature_modif =~
            /^
                    ($number_neg)(?:\()?($sigma)?
                    (?:\))?(?:[-_\s]*)$temp_C?$
                    /x ) {
            my $number = $1;
            my $sig = $2;
            # NOTE(review): unpack_precision() presumably turns the
            # precision digits into an absolute value matching $number --
            # confirm against COD::Precision.
            if( $sig ) {
                $sig = unpack_precision( $number, $sig );
            }
            $number = celsius_to_kelvin( $number );
            $values->{$tag}[$i] = pack_precision( $number, $sig );
            my $new_val = $values->{$tag}[$i];
            push @notes,
                 "data item '$tag' value '$temperature' was changed to " .
                 "'$new_val' -- it was converted from degrees " .
                 'Celsius (C) to kelvins (K)';
            next;
        }

        # Textual room/ambient temperature description.
        if( $temperature_modif =~
            /^
                ($temp_RT)(?:[-_\s]*)$
                /x ) {
            $values->{$tag}[$i] = '295(2)';
            push @notes,
                 "data item '$tag' value '$temperature' was changed to " .
                 '\'295(2)\' -- the room/ambient temperature ' .
                 'average [293;298] in kelvins (K) was taken';
            next;
        }

        # Value that states no temperature ('none', 'not measured', ...);
        # $temp_undef also matches an empty value.
        if( $temperature_modif =~
            /^
                ($temp_undef)(?:[-_\s]*)$
                /x ) {
            $values->{$tag}[$i] = '?';
            push @notes,
                 "data item '$tag' value '$temperature' ".
                 'was changed to \'?\' -- the ' .
                 'value is undefined or not given';
            next;
        }

        # A temperature range is provided
        if( $temperature_modif =~
            /^
                ($number_pos)\s*[\-\/\:]+\s*($number_pos)
                (?:[-_\s]*)\(?((?:$temp_C)|(?:$temp_K))?\)?
                (?:[-_,\s]*)\(?($temp_dec)?\)?$
                /x )   {
            # $temp_gt is the first (lower) and $temp_lt the second
            # (upper) bound of the range as written in the value.
            my $temp_gt = $1;
            my $temp_lt = $2;
            my $temp_u  = $3;
            my $temp_d  = $4;

            # Only ranges given in ascending order are handled; other
            # values are left to the checks below.
            if( $temp_lt > $temp_gt ) {
                my $is_in_Celsius = (defined $temp_u && $temp_u =~ $temp_C);
                # The midpoint becomes the value and half of the range
                # width becomes its precision.
                my $number = ($temp_gt + $temp_lt)/2;
                if ($is_in_Celsius) {
                    $number = celsius_to_kelvin($number);
                }
                my $sig = ($temp_lt - $temp_gt)/2;

                my $new_val = pack_precision( $number, $sig );
                $values->{$tag}[$i] = $new_val;
                my $report_msg;

                if( $temp_d ) {
                    my $new_tag = '_chemical_temperature_decomposition';
                    rename_tag( $dataset, $tag, $new_tag );
                    $report_msg =
                        "data name '$tag' was changed to '$new_tag' "
                      . 'since the value had been given as '
                      . "'$temperature'. The value ";
                } else {
                    $report_msg = "data item '$tag' value '$temperature' ";
                }

                $report_msg .= "was changed to '$new_val' -- ";
                if ( $is_in_Celsius ) {
                    $report_msg .= 'it was converted from degrees '
                                 . 'Celsius (C) to kelvins (K), '
                }
                $report_msg .= 'the average value was taken and '
                             . 'precision was estimated';

                push @notes, $report_msg;
                next;
            }
        }

        # Number followed by a mandatory Celsius unit designator or by a
        # '+ 273...' suffix, optionally with decomposition markers.
        if( $temperature_modif =~
            /^ ($temp_dec)?(?:[-_,\s]*)
                    \(?($number_pos)\(?($sigma)?\)?
                    (?:[-_\s]*)(?:$temp_C|(?:\+\s*273(?:[\.\,][0-9]+)?\)?))
                    (?:[-_,\s]*)\(?($temp_dec)?\)?$
                    /x )  {
            my $decomp_prefix = $1;
            my $number = $2;
            my $sig = $3;
            my $decomp_postfix = $4;
            if( $sig ) {
                $sig = unpack_precision( $number, $sig );
            }
            $number = celsius_to_kelvin($number);
            $values->{$tag}[$i] = pack_precision( $number, $sig );
            my $new_val = $values->{$tag}[$i];

            if( defined $decomp_prefix || defined $decomp_postfix ) {
                my $old_tag = $tag;
                my $new_tag = '_chemical_temperature_decomposition';
                rename_tag( $dataset, $old_tag, $new_tag );
                push @notes,
                     "data name '$old_tag' was changed to '$new_tag' " .
                     'since the value had been given as ' .
                     "'$temperature'. " .
                     "The value '$temperature' was changed to " .
                     "'$new_val' -- " .
                     'it was converted from degrees Celsius (C) ' .
                     'to kelvins (K)';
                next;
            } else {
                push @notes,
                     "data item '$tag' value '$temperature' was changed to " .
                     "'$new_val' -- it was converted from degrees " .
                     'Celsius (C) to kelvins (K)';
                next;
            }
        }

        # Nothing applied: report the value, shortened to 40 characters
        # in the message if it is longer. The stored value is not changed.
        if( $temperature_modif !~
            /^$number_pos(?:[(]$sigma[)])?$/x ) {
            if( length($temperature) > 40 ) {
                    $temperature = substr $temperature, 0, 40;
                    $temperature .= '...';
                }
            warn "WARNING, data item '$tag' value is '$temperature', but it "
               . 'should be numeric, i.e. \'FLOAT\' or \'INT\', '
               . 'permitted range is [0.0;+inf], the value should be '
               . 'in kelvins (K) without a unit designator' . "\n";
        }
    }

    for ( @notes ) {
        warn "NOTE, $_\n";
    }

    return @notes;
}

# Processes a temperature value that is given as an inequality, i.e. one
# that starts with a '>' or '<' sign followed by a number. The data item
# is renamed by appending the '_gt' or '_lt' suffix and the stored value
# is replaced by the bare number. The number is converted from degrees
# Celsius to kelvins when a Celsius unit designator was recognised or
# when the number is negative.
#
# Nothing is changed when the value does not have the expected form,
# when a negative number is explicitly given in kelvins, or when the
# resulting data name would be derived from
# '_cell_measurement_temperature'.
#
# @param $data_block
#       Data block structure; its 'values' hash is modified in place.
# @param $tag
#       Name of the data item that holds the temperature.
# @param $value_index
#       Index of the value to process.
# @return
#       Reference to an array of change descriptions (empty if the value
#       was left untouched).
sub fix_temperature_lt_gt
{
    my ($data_block, $tag, $value_index) = @_;

    my $item_values = $data_block->{'values'};
    my $raw_value   = $item_values->{$tag}[$value_index];
    ( my $trimmed = $raw_value ) =~ s/^\s+|^\n+|\n+$|\s+$//g;

    return [] unless $trimmed =~
        /^
            ([><])[_\s]*($number_pos|$number_neg)
            \(?($sigma)?\)?(?:[-_\'\s]*)
            (?: ($temp_C)|($temp_K) )?(?:[-_,\s]*)
            \(?($temp_dec)?\)?$
            /x;

    my ( $relation, $magnitude, $precision,
         $celsius_unit, $kelvin_unit, $decomposition ) =
       ( $1, $2, $3, $4, $5, $6 );

    # temperature in negative kelvins
    return [] if defined $kelvin_unit && $magnitude < 0;

    my $base_tag = $decomposition
                 ? '_chemical_temperature_decomposition'
                 : $tag;

    return [] if $base_tag =~ /_cell_measurement_temperature/;

    $precision = unpack_precision( $magnitude, $precision ) if $precision;

    # The decision is taken before the conversion: a negative number is
    # assumed to be in degrees Celsius even without a unit designator.
    my $is_in_Celsius = defined $celsius_unit || $magnitude < 0;
    $magnitude = celsius_to_kelvin($magnitude) if $is_in_Celsius;

    my $new_val = pack_precision( $magnitude, $precision );
    $item_values->{$tag}[$value_index] = $new_val;

    my $value_change =
        ". The value '$raw_value' was changed to '$new_val' -- ";
    my $unit_note =
          $is_in_Celsius
              ? $value_change .
                'it was converted from degrees Celsius (C) to kelvins (K)'
        : defined $kelvin_unit
              ? $value_change .
                'it should be numeric and without a unit designator'
        :       '';

    my %suffix_for = ( '>' => '_gt', '<' => '_lt' );
    my $new_tag = $base_tag . $suffix_for{$relation};
    rename_tag( $data_block, $tag, $new_tag );

    my $rename_note = $relation eq '>'
        ? "data name '$tag' was changed to '$new_tag' since the value was " .
          'specified as \'more than\' (\'>\') a certain temperature'
        : "data item '$tag' was changed to '$new_tag' since the value was " .
          'specified as \'less than\' (\'<\') a certain temperature';

    return [ $rename_note . $unit_note ];
}

# Cleans up the value of the '_exptl_crystal_density_meas' data item of
# the given data block. Only non-looped (single value) items are
# processed. The following cases are recognised:
#   - '.', '?' and plain numbers (optionally followed by an integer in
#     parentheses) are left untouched;
#   - values matching $value_not_measured are replaced by '?';
#   - a number followed by optional units and an optional measurement
#     temperature is reduced to the bare number; when a temperature is
#     present it is stored in a new '_exptl_crystal_density_meas_temp'
#     data item.
#
# Every change is reported with a 'NOTE' warning.
#
# @param $dataset
#       Data block structure; it is modified in place.
# @return
#       List of change descriptions.
sub fix_value_of_exptl_crystal_density_meas($)
{
    my( $dataset ) = @_;

    my @notes;
    my $values = $dataset->{values};
    my @value_tags = (
        '_exptl_crystal_density_meas',
        );
    for my $tag ( @value_tags ) {
        next if !exists $values->{$tag};
        # skipping the processing of looped values
        next if @{$values->{$tag}} != 1;

        my $i = 0;
        my $value = $values->{$tag}[$i];
        my $value_modif = $value;
        $value_modif =~ s/^\s+|\s+$//g;

        # Special CIF values and already well-formed numbers need no fixing
        next if $value_modif =~ /^[.?]$/;
        next if $value_modif =~ /^($number_pos|$number_pos\([0-9]+\))$/;

        # The value (optionally enclosed in single quotes) states that the
        # density was not measured
        if( $value_modif =~
            /^
                (?: ' \s*)?
                $value_not_measured
                (?: \s* ')?
            $/x ) {
            $values->{$tag}[$i] = '?';
            push @notes,
                 "data item '$tag' value '$value' ".
                 'was changed to \'?\' -- the ' .
                 'value is perceived as not measured';
            next;
        }

        # Capture groups: 1 -- density, 2 -- density s.u., 3 -- units,
        # 4 -- room temperature designator, 5 -- temperature,
        # 6 -- temperature s.u., 7 -- Celsius designator,
        # 8 -- kelvin designator
        if( $value_modif =~
            /^
                (?: ' \s*)?
                (?> ($number_pos) )
                # atomic group is used for not to match 150K
                # otherwise it backtracks and matches: [15,0,K]
                \s* (?: \( \s* ($sigma) \s* \) )?
                \s* \(? \s* ($units_equiv_Mg_per_m3)? \s* \)?
                \s* \(? \s* $measured_at ? \s*
                    (?: ($temp_RT) |
                    ($number_pos | $number_neg)
                    \s* (?: \( \s* ($sigma) \s* \) )?
                    \s* (?: ($temp_C) | ($temp_K) )?
                    )?
                \s* \)?
                (?: \s* ')?
            $/x ) {

            my $meas_number = $1;
            my $meas_sigma = $2;
            my $units = $3;
            my $room_temp = $4;
            my $temp_number = $5;
            my $temp_sigma = $6;
            my $temp_Celsius = $7;
            my $temp_kelvins = $8;

            # A negative temperature is accepted only when it is
            # explicitly given in degrees Celsius
            if( defined $temp_number && $temp_number < 0
                && ! defined $temp_Celsius ){
                next;
            }

            # Neither units nor a temperature were recognised, so there
            # is nothing to strip from the value
            if( ! defined $units && ! defined $room_temp
                && ! defined $temp_number ){
                next;
            }

            if( defined $meas_sigma ){
                $meas_sigma = unpack_precision( $meas_number, $meas_sigma );
            }

            # Store the bare density value back into the data block
            my $new_meas_value =
                $values->{$tag}[$i] =
                pack_precision( $meas_number, $meas_sigma );

            my $report_msg = "data item '$tag' value '$value' was changed " .
                             "to '$new_meas_value'";

            if( defined $units ){
                $report_msg .= ". Units '$units' were correct but " .
                               'unnecessary, so they were removed';
            }

            # Move the measurement temperature into a separate data item
            if( defined $room_temp || defined $temp_number ) {

                my $new_tag = $tag . '_temp';
                my $temp_value;

                # Room temperature is recorded as the fixed value '295(2)'
                if( defined $room_temp ){
                    $temp_value = '295(2)';
                }

                if( defined $temp_number ){
                    if( defined $temp_sigma ){
                        $temp_sigma = unpack_precision( $temp_number, $temp_sigma );
                    }
                    if ( $temp_Celsius ) {
                        $temp_number = celsius_to_kelvin( $temp_number );
                    }
                    $temp_value = pack_precision( $temp_number, $temp_sigma );
                }

                set_tag( $dataset, $new_tag, $temp_value );

                $report_msg .=
                    ". A new data item '$new_tag' was created with the " .
                    "value set to '$temp_value'" .
                        ( defined $temp_Celsius ? ' -- it was converted ' .
                         'from degrees Celsius (C) to kelvins (K)' : '' );
            }

            push @notes, $report_msg;
        }
    }

    # Report all changes through the warning handler
    for ( @notes ) {
        warn "NOTE, $_\n";
    }

    return @notes;
}

# Corrects the value of a non-looped '_refine_ls_weighting_scheme' data
# item by delegating the work to process_ls_weighting_scheme(). A change,
# if any, is reported with a 'NOTE' warning.
#
# @param $dataset
#       Data block structure; it may be modified in place.
# @return
#       List of change descriptions (empty if the data item is absent,
#       looped or was left unchanged).
sub fix_refine_ls_weighting_scheme
{
    my( $dataset ) = @_;

    my $tag = '_refine_ls_weighting_scheme';

    return () if !exists $dataset->{'values'}{$tag};
    # Only single (non-looped) values are processed
    return () if scalar @{$dataset->{'values'}{$tag}} != 1;

    my $change = process_ls_weighting_scheme( $dataset, $tag, 0 );
    my @notes  = defined $change ? ( $change ) : ();

    warn "NOTE, $_\n" for @notes;

    return @notes;
}

# Normalises a single value of the weighting scheme data item:
#   - a value that, after trimming, equals '.', '?' or one of the default
#     enumeration values is stored in its trimmed form;
#   - a value ending with '?' is left alone;
#   - a value matching $value_not_measured is replaced by '?';
#   - '1', '1.0...' or a value matching $unit_weights becomes 'unit';
#   - everything else is passed to process_calc_ls_weighting_scheme().
# A 'WARNING' is issued when none of the rules applies.
#
# @param $data_block
#       Data block structure; its 'values' hash is modified in place.
# @param $tag
#       Name of the processed data item.
# @param $i
#       Index of the processed value.
# @return
#       Description of the change, or nothing if no change was reported.
sub process_ls_weighting_scheme
{
    my ($data_block, $tag, $i) = @_;

    my $item_values = $data_block->{'values'};
    my $original    = $item_values->{$tag}[$i];

    # Silently removes spaces
    ( my $trimmed = $original ) =~ s/^\s+|\s+$//g;

    my $enums = join '|', @{ $default_enums{$tag} };
    if ( $trimmed =~ /^([.?]|$enums)$/ ) {
        $item_values->{$tag}[$i] = $1;
        return;
    }

    return if $trimmed =~ /\?$/;

    if ( $trimmed =~ /^$value_not_measured$/ ) {
        $item_values->{$tag}[$i] = '?';

        return "data item '$tag' value '$original' was changed to '?' -- " .
               'the value is perceived as not measured';
    }

    if ( $trimmed =~ /^\s*
        (?: 1 (?:\.0*)? |
            $unit_weights
        ) \s*
        $/xi ) {
        my $replacement = 'unit';
        $item_values->{$tag}[$i] = $replacement;

        return "data item '$tag' value '$original' was changed to '$replacement'";
    }

    my $calc_report = process_calc_ls_weighting_scheme($data_block, $tag, $i);
    return $calc_report if defined $calc_report;

    # TODO: reporting of mismatched enumerators should be fully delegated
    # to the CIF validator (discussion topic)
    my $shown_value = length($trimmed) > 40
                    ? substr( $trimmed, 0, 40 ) . '...'
                    : $trimmed;
    my $allowed_values = join( ', ', map { "'$_'" } @{$default_enums{$tag}} );

    warn "WARNING, data item '$tag' value '$shown_value' must be one of " .
         'the enumeration values [' . $allowed_values . ']' . "\n";

    return;
}

# Recognises a weighting scheme value that contains a weighting
# expression instead of an enumeration value. Such a value is replaced by
# 'calc' and the expression itself is stored in the
# '_refine_ls_weighting_details' data item (prefixed with 'w = ' if the
# original text had no 'w =' part).
#
# @param $data_block
#       Data block structure; it is modified in place.
# @param $tag
#       Name of the processed data item.
# @param $i
#       Index of the processed value.
# @return
#       Description of the change, or nothing if the value was not
#       recognised as a weighting expression.
sub process_calc_ls_weighting_scheme
{
    my ( $data_block, $tag, $i ) = @_;

    my $original = $data_block->{'values'}{$tag}[$i];
    ( my $trimmed = $original ) =~ s/^\s+|\s+$//g;

    return unless $trimmed =~
        /^
            (?: (calc\b)? \s*
                (w \s* (?:\^-1\^)? \s* ={1,2} \s*)?
                (.*)
            )
        $/xi;

    # Group 1 (the optional 'calc' keyword) is not needed any further
    my ( $w, $weighting_value ) = ( $2, $3 );

    my $is_expression =
        defined $w ||
        $weighting_value =~ m/^ \s*
                (?:  $number_pos \s* \/
                |    4\(?F
                |    [\[({](?!w)
                )
            /xi;
    return unless $is_expression;

    $data_block->{'values'}{$tag}[$i] = 'calc';

    my $new_tag   = '_refine_ls_weighting_details';
    my $new_value = ( defined $w ? $w : 'w = ' ) . $weighting_value;
    set_tag( $data_block, $new_tag, $new_value );

    return "data item '$tag' value '$original' was changed to 'calc'. " .
           "A new data item '$new_tag' was created with " .
           'the value set to ' . "'$new_value'";
}

# Replaces common free-text spellings of the '_atom_sites_solution_*'
# data item values by the short codes 'direct', 'heavy', 'geom', 'difmap'
# and 'mixed'. Every change is reported with a 'NOTE' warning.
#
# @param $dataset
#       Data block structure; its values are modified in place.
# @return
#       List of change descriptions.
sub fix_value_of_atom_sites_solution($) {
    my( $dataset ) = @_;

    # Rewrite rules; they are applied to each value in the given order.
    my @rewrite_rules = (
        [ qr/^\s*direct[ _]methods?\s*$/i                          => 'direct' ],
        [ qr/^\s*heavy[ \-]atom([ \-]method)?\s*$/i                => 'heavy'  ],
        [ qr/^\s*geometric( positions|ally( placed)?)?\s*$/i       => 'geom'   ],
        [ qr/^\s*placed geometrically\s*$/i                        => 'geom'   ],
        [ qr/^diffmap$/i                                           => 'difmap' ],
        [ qr/^\s*diff?(erence)?([ \-]fourier)? maps?\s*$/i         => 'difmap' ],
        [ qr/^\s*diff?(map)?(\s+and\s+|\s*[&\/,+]\s*)geom\s*$/i    => 'mixed'  ],
        [ qr/^\s*geom(\s+and\s+|\s*[&\/,+]\s*)diff?(map)?\s*$/i    => 'mixed'  ],
    );

    my @solution_tags = qw( _atom_sites_solution_primary
                            _atom_sites_solution_secondary
                            _atom_sites_solution_hydrogens );

    my @notes;
    my $values = $dataset->{values};
    for my $tag ( @solution_tags ) {
        next if !exists $values->{$tag};
        for my $i (0..$#{$values->{$tag}}) {
            my $old_value = $values->{$tag}[$i];
            my $new_value = $old_value;

            for my $rule ( @rewrite_rules ) {
                my ( $pattern, $code ) = @{$rule};
                $new_value =~ s/$pattern/$code/;
            }

            next if $old_value eq $new_value;

            $values->{$tag}[$i] = $new_value;
            push @notes,
                 "data item '$tag' value '$old_value' was changed to '$new_value'";
        }
    }

    warn "NOTE, $_\n" for @notes;

    return @notes;
}

# Strips the resolver URL ('http(s)://doi.org/', 'http(s)://dx.doi.org/')
# and the 'doi:' label from the values of DOI-related data items so that
# only the bare DOI name remains. The prefixes are matched
# case-insensitively, since the scheme and host parts of a URL as well as
# the 'doi:' label are not case-sensitive (e.g. 'DOI:10.1000/xyz').
# Every change is reported with a 'NOTE' warning.
#
# @param $dataset
#       Data block structure; its values are modified in place.
# @return
#       List of change descriptions.
sub fix_dois
{
    my( $dataset ) = @_;

    my @notes;
    my $values = $dataset->{values};
    foreach my $tag ( qw( _audit_block_doi
                          _citation_doi
                          _database_dataset_doi
                          _journal_paper_doi ) ) {
        next if !exists $values->{$tag};
        for my $i (0..$#{$values->{$tag}}) {
            my $value = my $old_value = $values->{$tag}[$i];
            # Nothing to strip from a missing value
            next if !defined $value;

            $value =~ s|^https?://(dx\.)?doi\.org/||i;
            $value =~ s/^doi://i;
            next if $value eq $old_value;

            $values->{$tag}[$i] = $value;
            push @notes,
                 "data item '$tag' value '$old_value' was changed to '$value'";
        }
    }

    for ( @notes ) {
        warn "NOTE, $_\n";
    }

    return @notes;
}

# Corrects the values of enumerated data items. For each value of a data
# item listed in $enum_data_items the replacement_candidates() subroutine
# is consulted:
#   - no candidates      -- the value is left as it is;
#   - a single candidate -- the value is replaced and a note is recorded;
#   - several candidates -- a warning listing the candidates is recorded.
# Data items that are handled as sets (when that fix is enabled) as well
# as the '.' and '?' values are skipped. Summarised warnings and notes
# are reported with 'WARNING' and 'NOTE' messages.
#
# @param $dataset
#       Data block structure; its values are modified in place.
# @param $tag2dict
#       Reference to a hash that maps data names to the dictionary
#       provenance passed to sprintf_dictionary_provenance().
# @param $enum_data_items
#       Reference to a hash that maps data names to their enumeration
#       values.
# @return
#       List of summarised change descriptions.
sub fix_enums
{
    my( $dataset, $tag2dict, $enum_data_items ) = @_;

    my ( @notes, @warnings );

    for my $tag ( @{$dataset->{'tags'}} ) {
        next if !exists $dataset->{'values'}{$tag};
        next if !exists $enum_data_items->{$tag};
        next if $fix{'treat_as_set'} && any { $_ eq $tag } @default_set_tags;

        # The loop variable aliases the stored value, so an assignment to
        # it updates the data block in place.
        VALUE:
        for my $current ( @{$dataset->{'values'}{$tag}} ) {
            next VALUE if $current =~ /^[.?]$/;

            my @candidates =
                replacement_candidates( $current,
                                        $enum_data_items->{$tag} );
            my $candidate_count = scalar @candidates;
            next VALUE if $candidate_count == 0;

            if ( $candidate_count != 1 ) {
                # Ambiguous replacement: report a shortened value
                my $dict_values = join ', ', map { "'$_'" } @candidates;
                ( my $val = $current ) =~ s/^\n|\n$//g;
                $val = substr( $val, 0, 30 ) . '...' if length($val) > 30;
                push @warnings,
                     "data item '$tag' value '$val' must be one of " .
                     'the enumeration values [' . $dict_values . '] ' .
                     'according to ' .
                     sprintf_dictionary_provenance($tag2dict->{$tag});
                next VALUE;
            }

            my ( $new_value ) = @candidates;
            my $old_value = $current;
            $current = $new_value;
            push @notes,
                "data item '$tag' value '$old_value' was changed to " .
                "'$new_value' " . 'in accordance with ' .
                sprintf_dictionary_provenance($tag2dict->{$tag});
        }
    }

    for my $warning ( @{ summarise_messages(\@warnings) } ) {
        warn "WARNING, $warning\n";
    }

    my $summarised_notes = summarise_messages( \@notes );
    for my $note ( @{$summarised_notes} ) {
        warn "NOTE, $note\n";
    }

    return @{$summarised_notes};
}

# Processes the values of data items that hold a set of single-character
# flags (the data names listed in @default_set_tags). Each value is
# checked and corrected character by character:
#   - a value that contains characters outside of the default enumeration
#     values of the data item produces a warning;
#   - otherwise each character is replaced by its unambiguous replacement
#     candidate (if there is one);
#   - a resulting value in which a flag occurs more than once produces a
#     warning and is not stored;
#   - a resulting value that differs from the original is stored and a
#     note is recorded.
# Summarised warnings and notes are reported with 'WARNING' and 'NOTE'
# messages.
#
# @param $dataset
#       Data block structure; its values are modified in place.
# @param $tag2dict
#       Reference to a hash that maps data names to the dictionary
#       provenance passed to sprintf_dictionary_provenance().
# @param $enum_data_items
#       Reference to a hash that maps data names to their enumeration
#       values.
# @return
#       List of summarised change descriptions.
sub treat_as_set
{
    my( $dataset, $tag2dict, $enum_data_items ) = @_;

    my @notes;
    my @warnings;
    for my $tag( @{$dataset->{'tags'}} ) {
        next if !any { $_ eq $tag } @default_set_tags;
        next if !exists $dataset->{'values'}{$tag};
        my $set_values = join '', @{$default_enums{$tag}};
        # '.' specifies that none of the flags were set so it should
        # not be in the concatenated string
        $set_values =~ s/\.//;
        # The loop variable aliases the stored value, so an assignment to
        # it updates the data block in place.
        for my $tag_value( @{$dataset->{'values'}{$tag}} ) {
            next if $tag_value =~ /^[.?]$/;

            # The enumeration characters are quoted so that none of them
            # can be taken for character class syntax ('-', '^', ']').
            if ( $tag_value =~ /[^\Q$set_values\E]/i ) {
                my $dict_values = join ', ', @{$default_enums{$tag}};
                my $val = $tag_value;
                $val =~ s/^\n|\n$//g;
                if( length($val) > 30 ) {
                    $val = substr $val, 0, 30;
                    $val .= '...';
                }
                push @warnings,
                     "data item '$tag' value '$val' should " .
                     'only contain a combination of the ' .
                     "enumeration values [$dict_values] " .
                     'according to ' .
                     sprintf_dictionary_provenance($tag2dict->{$tag});
            } else {
                my $new_value = '';
                my $val = $tag_value;
                foreach( split m//, $val ) {
                    my @replacement_list =
                        replacement_candidates( $_, $enum_data_items->{$tag} );

                    if( scalar( @replacement_list ) == 1 ) {
                        $new_value .= shift @replacement_list;
                    } else  {
                        $new_value .= $_;
                    }
                }

                my $old_value = $tag_value;
                # Any character that occurs more than once is a duplicated
                # flag. A real backreference is used here: interpolating
                # the values hash reference into a character class (as was
                # done previously) tested against the characters of the
                # string 'HASH(0x...)' instead of the flags, and '\1'
                # inside a bracketed class is not a backreference.
                if ( $new_value =~ /(.).*\1/s ) {
                    push @warnings,
                         "data item '$tag' value '$new_value' " .
                         "('$old_value' before processing) " .
                         'should not contain duplicate values ' .
                         'according to ' .
                         sprintf_dictionary_provenance($tag2dict->{$tag});
                } elsif ($tag_value ne $new_value) {
                    $tag_value = $new_value;
                    push @notes,
                        "data item '$tag' value '$old_value' was " .
                        "changed to '$new_value' " . 'in accordance with ' .
                        sprintf_dictionary_provenance($tag2dict->{$tag});
                }
            }
        }
    }

    for my $warning ( @{ summarise_messages(\@warnings) } ) {
        warn "WARNING, $warning\n";
    };

    my $summarised_notes = summarise_messages( \@notes );
    for my $note ( @{$summarised_notes} ) {
        warn "NOTE, $note\n";
    };

    return @{$summarised_notes};
}

# Converts a temperature from degrees Celsius to kelvins.
#
# @param $degrees_celsius
#       Temperature in degrees Celsius.
# @return
#       Temperature in kelvins.
sub celsius_to_kelvin
{
    my $degrees_celsius = shift;

    my $kelvin_offset = 273.15;

    return $degrees_celsius + $kelvin_offset;
}

# Load the replacement value file (if one was specified). Each line that
# is neither empty nor a comment must consist of three whitespace
# separated fields; the first field is used as the key of
# %value_spelling and the remaining two are stored as a pair under it.
my %value_spelling = ();
eval {
    local $SIG{__WARN__} = sub {
        process_warnings( {
            'message'  => @_,
            'program'  => $0,
            'filename' => $replacement_file,
        }, $die_on_error_level )
    };

    if ( defined $replacement_file ) {
        open( my $list, '<', $replacement_file )
            or die 'ERROR, could not open replacement list file for input -- '
                 . lcfirst($!) . "\n";

        LINE:
        while ( <$list> ) {
            chomp;
            # Comments and blank lines carry no data
            next LINE if m/^#/;
            next LINE if m/^\s*$/;

            unless ( /^(\S+)\s+(\S+)\s+(\S+)$/ ) {
                warn "WARNING, unrecognized string '$_' in "
                   . "replacement value file\n";
                next LINE;
            }

            my ( $key, @pair ) = ( $1, $2, $3 );
            push @{ $value_spelling{$key} }, [ @pair ];
        }

        close( $list )
            or die 'ERROR, error while closing replacement value file '
                 . 'after reading -- ' . lcfirst($!) . "\n";
    }
};
if ( $@ ) {
    process_errors( {
        'message'  => $@,
        'program'  => $0,
        'filename' => $replacement_file
    }, $die_on_errors );
}

# Collect the enumeration values of data items together with the
# provenance of the dictionary that defines them:
#   %enum_data_items -- data name => reference to the enumeration values;
#   %tagDicts        -- data name => provenance of the defining dictionary.
# The values are read from the dictionaries given by the user; if none
# were given, the built-in table %default_enums is used instead.
my %tagDicts;
my %enum_data_items;
if( @dictionaries ) {
    my $options = { 'parser' => $use_parser, 'no_print' => 1 };
    for my $dict ( @dictionaries ) {
        my ( $data, $err_count, $messages ) = parse_cif( $dict, $options );
        process_parser_messages( $messages, $die_on_error_level );

        canonicalize_all_names( $data );

        my $provenance = get_dictionary_provenance( $data );
        $provenance->{'file'} = $dict;

        for my $dataset( @{$data} ) {
            my $values = $dataset->{values};
            next if !exists $values->{'_name'};
            next if !exists $values->{'_enumeration'};

            for my $data_name ( @{$values->{'_name'}} ) {
                if ( !defined $tagDicts{$data_name} ) {
                    $enum_data_items{$data_name} = $values->{'_enumeration'};
                    $tagDicts{$data_name}        = $provenance;
                } else {
                    # The first definition wins. The warning names the
                    # dictionary that holds that first definition, which
                    # is not necessarily the one being read right now.
                    my $defining_dict = $tagDicts{$data_name}{'file'};
                    report_message( {
                       'program'   => $0,
                       'err_level' => 'WARNING',
                       'message'   => "data item '$data_name' is already defined by "
                                    . "the dictionary '$defining_dict'" },
                       $die_on_warnings );
                }
            }
        }
    }
} else {
    my %default_dict_provenance = (
        'file'    => 'built-in table derived from the CIF Core',
        'name'    => 'cif_core.dic',
        'version' => '2.4.5',
        'update'  => '2014-11-21',
    );
    foreach my $tag( keys %default_enums ) {
        my $enums = $default_enums{$tag};
        $enum_data_items{$tag} = $enums;
        $tagDicts{$tag} = \%default_dict_provenance;
    }
}

# Uncomment to print out all tags and their enum values from the given dictionary
#foreach( keys %enum_data_items ) {
#    print "'$_' => [ ";
#    foreach( @{ $enum_data_items{$_} } ){
#        print '"' . $_ . '"' .', ';
#    }
#    print "\n";
#}
#exit 0;

# Data names known from the tag lists of the imported modules; they are
# passed to print_cif() both as a list and as a lookup hash.
my @dictionary_tags = ( @COD::CIF::Tags::DictTags::tag_list,
                        @COD::CIF::Tags::COD::tag_list,
                        @COD::CIF::Tags::TCOD::tag_list,
                        @COD::CIF::Tags::DFT::tag_list );
my %dictionary_tags = map { $_ => $_ } @dictionary_tags;

# With no file names on the command line, '-' is handed to parse_cif()
# (presumably meaning the standard input -- confirm against the parser).
@ARGV = ('-') unless @ARGV;

# Main loop: parse each input file, apply the enabled fixes to every data
# block and print the (possibly modified) data block.
for my $filename (@ARGV) {
    my $options = { 'parser' => $use_parser, 'no_print' => 1 };
    my ( $data, $err_count, $messages ) = parse_cif( $filename, $options );
    process_parser_messages( $messages, $die_on_error_level );

    canonicalize_all_names( $data );

    for my $dataset( @{$data} ) {
        # Change descriptions collected from all fixes of this data block
        my @insert_reports = ();

        my $dataname = 'data_' . $dataset->{'name'};

        # Route warnings through process_warnings() together with the
        # file name and the data block name
        local $SIG{__WARN__} = sub {
            process_warnings( {
                'message'  => @_,
                'program'  => $0,
                'filename' => $filename,
                'add_pos'  => $dataname
            }, $die_on_error_level )
        };

        eval {
            # The fixes are applied in this fixed order; each of them
            # modifies the data block in place and returns the
            # descriptions of the changes it made.
            if( $fix{'temperature'} ) {
                my @temperature_reports = fix_temperature( $dataset );
                push @insert_reports, @temperature_reports;
            }
            if( $fix{'misspelt_values'} ) {
                my @misspell_reports =
                        fix_misspelled_values( $dataset, \%value_spelling );
                push @insert_reports, @misspell_reports;
            }
            if( $fix{'dois'} ) {
                my @reports_of_dois = fix_dois( $dataset );
                push @insert_reports, @reports_of_dois;
            }
            if( $fix{'enums'} ) {
                my @enums_reports = fix_enums( $dataset, \%tagDicts, \%enum_data_items );
                push @insert_reports, @enums_reports;
            }
            if ( $fix{'exptl_crystal_density_meas'} ) {
                my @reports_of_exptl_crystal_density_meas =
                    fix_value_of_exptl_crystal_density_meas( $dataset );
                push @insert_reports, @reports_of_exptl_crystal_density_meas;
            }
            if( $fix{'refine_ls_weighting_scheme'} ) {
                my @reports_of_refine_ls_weighting_scheme =
                    fix_refine_ls_weighting_scheme( $dataset );
                push @insert_reports, @reports_of_refine_ls_weighting_scheme;
            }
            if( $fix{'atom_sites_solution'} ) {
                my @reports_of_atom_sites_solution =
                    fix_value_of_atom_sites_solution( $dataset );
                push @insert_reports, @reports_of_atom_sites_solution;
            }
            if( $fix{'treat_as_set'} ) {
                my @set_reports = treat_as_set( $dataset, \%tagDicts, \%enum_data_items );
                push @insert_reports, @set_reports;
            }

            # Terminate each description with a full stop before handing
            # the list over to the change log routine
            @insert_reports = map { "$_." } @insert_reports;
            append_changelog_to_single_item( $dataset, \@insert_reports,
                                              {
                                                'signature' => $Id
                                              }
                                           );

            print_cif( $dataset, {
                exclude_misspelled_tags => 0,
                preserve_loop_order => 1,
                fold_long_fields => 0,
                dictionary_tags => \%dictionary_tags,
                dictionary_tag_list => \@dictionary_tags,
                keep_tag_order => $keep_tag_order,
            });
        };
        # Errors raised while fixing or printing the data block are
        # passed to process_errors() with the file and data block names
        if ($@) {
            process_errors( {
              'message'       => $@,
              'program'       => $0,
              'filename'      => $filename,
              'add_pos'       => $dataname },
               $die_on_errors );
        };
    }
}
