#! /bin/sh
#!perl -w # --*- Perl -*--
eval 'exec perl -x $0 ${1+"$@"}'
    if 0;
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2021-04-28 19:35:53 +0300 (Wed, 28 Apr 2021) $
#$Revision: 8738 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.3.0/scripts/cif_eval_numbers $
#------------------------------------------------------------------------------
#*
#* Read a CIF file and convert those number values that are written as
#* simple fractions (e.g. "1/4") into a single floating point number.
#*
#* USAGE:
#*    $0 --options input1.cif input*.cif
#**

use strict;
use warnings;
use COD::CIF::Parser qw( parse_cif );
use COD::CIF::Tags::Print qw( print_cif );
use COD::CIF::Tags::CanonicalNames qw( canonicalize_names );
use COD::SOptions qw( getOptions );
use COD::SUsage qw( usage options );
use COD::ErrorHandler qw( process_parser_messages );
use COD::ToolsVersion qw( get_version_string );

my $use_parser = 'c';
my $fix_errors = 0;
my $debug = 0;
my $tags_to_fix;
my $dictionary_file;

my $die_on_error_level = {
    'ERROR'   => 1,
    'WARNING' => 0,
    'NOTE'    => 0
};

#* OPTIONS:
#*   -d, --dictionary cif_core.dic
#*                     Use the specified DDL1 dictionary for determining which
#*                     values are numeric.
#*   -t, --tags-to-fix _tag_1,_tag_2,_tag_3
#*                     Specify an explicit list of tags that should be
#*                     evaluated as numeric, without relying on a
#*                     dictionary.
#*   --fix-syntax-errors
#*                     Try to fix those syntax errors in input CIFs that can
#*                     be corrected unambiguously.
#*   --no-fix-syntax-errors,
#*   --dont-fix-syntax-errors
#*                     Do not try to fix syntax errors in input CIFs (default).
#*   --use-perl-parser
#*                     Use development CIF parser written in Perl.
#*   --use-c-parser
#*                     Use faster C/Yacc CIF parser (default).
#*   --debug
#*                     Enable debug messages.
#*   --no-debug
#*                     Disable debug messages (default).
#*   --help, --usage
#*                     Output a short usage message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
@ARGV = getOptions(
    '-d,--dictionary' => \$dictionary_file,

    '-t,--tags-to-fix' => \$tags_to_fix,

    '--debug'    => sub{ $debug = 1 },
    '--no-debug' => sub{ $debug = 0 },

    '--fix-syntax-errors'       => sub { $fix_errors = 1 },
    '--dont-fix-syntax-errors'  => sub { $fix_errors = 0 },
    '--no-fix-syntax-errors'    => sub { $fix_errors = 0 },

    '--use-perl-parser' => sub{ $use_parser = 'perl' },
    '--use-c-parser'    => sub{ $use_parser = 'c' },

    '--options'       => sub { options; exit },
    '--help,--usage'  => sub { usage; exit },
    '--version'       => sub { print get_version_string(), "\n"; exit }
);

# Default numeric tag list taken from CIF core dictionary v. 2.4.5
my @cif_core_numeric = (
'_atom_site_aniso_B_11',
'_atom_site_aniso_B_12',
'_atom_site_aniso_B_13',
'_atom_site_aniso_B_22',
'_atom_site_aniso_B_23',
'_atom_site_aniso_B_33',
'_atom_site_aniso_ratio',
'_atom_site_aniso_U_11',
'_atom_site_aniso_U_12',
'_atom_site_aniso_U_13',
'_atom_site_aniso_U_22',
'_atom_site_aniso_U_23',
'_atom_site_aniso_U_33',
'_atom_site_attached_hydrogens',
'_atom_site_B_equiv_geom_mean',
'_atom_site_B_iso_or_equiv',
'_atom_site_Cartn_x',
'_atom_site_Cartn_y',
'_atom_site_Cartn_z',
'_atom_site_chemical_conn_number',
'_atom_site_fract_x',
'_atom_site_fract_y',
'_atom_site_fract_z',
'_atom_site_occupancy',
'_atom_site_site_symmetry_multiplicity',
'_atom_site_site_symmetry_order',
'_atom_site_symmetry_multiplicity',
'_atom_site_U_equiv_geom_mean',
'_atom_site_U_iso_or_equiv',
'_atom_sites_Cartn_tran_matrix_11',
'_atom_sites_Cartn_tran_matrix_12',
'_atom_sites_Cartn_tran_matrix_13',
'_atom_sites_Cartn_tran_matrix_21',
'_atom_sites_Cartn_tran_matrix_22',
'_atom_sites_Cartn_tran_matrix_23',
'_atom_sites_Cartn_tran_matrix_31',
'_atom_sites_Cartn_tran_matrix_32',
'_atom_sites_Cartn_tran_matrix_33',
'_atom_sites_Cartn_tran_vector_1',
'_atom_sites_Cartn_tran_vector_2',
'_atom_sites_Cartn_tran_vector_3',
'_atom_sites_fract_tran_matrix_11',
'_atom_sites_fract_tran_matrix_12',
'_atom_sites_fract_tran_matrix_13',
'_atom_sites_fract_tran_matrix_21',
'_atom_sites_fract_tran_matrix_22',
'_atom_sites_fract_tran_matrix_23',
'_atom_sites_fract_tran_matrix_31',
'_atom_sites_fract_tran_matrix_32',
'_atom_sites_fract_tran_matrix_33',
'_atom_sites_fract_tran_vector_1',
'_atom_sites_fract_tran_vector_2',
'_atom_sites_fract_tran_vector_3',
'_atom_type_analytical_mass_%',
'_atom_type_number_in_cell',
'_atom_type_oxidation_number',
'_atom_type_radius_bond',
'_atom_type_radius_contact',
'_atom_type_scat_Cromer_Mann_a1',
'_atom_type_scat_Cromer_Mann_a2',
'_atom_type_scat_Cromer_Mann_a3',
'_atom_type_scat_Cromer_Mann_a4',
'_atom_type_scat_Cromer_Mann_b1',
'_atom_type_scat_Cromer_Mann_b2',
'_atom_type_scat_Cromer_Mann_b3',
'_atom_type_scat_Cromer_Mann_b4',
'_atom_type_scat_Cromer_Mann_c',
'_atom_type_scat_dispersion_imag',
'_atom_type_scat_dispersion_real',
'_atom_type_scat_length_neutron',
'_cell_angle_alpha',
'_cell_angle_beta',
'_cell_angle_gamma',
'_cell_formula_units_Z',
'_cell_length_a',
'_cell_length_b',
'_cell_length_c',
'_cell_measurement_pressure',
'_cell_measurement_reflns_used',
'_cell_measurement_temperature',
'_cell_measurement_theta_max',
'_cell_measurement_theta_min',
'_cell_measurement_wavelength',
'_cell_reciprocal_angle_alpha',
'_cell_reciprocal_angle_beta',
'_cell_reciprocal_angle_gamma',
'_cell_reciprocal_length_a',
'_cell_reciprocal_length_b',
'_cell_reciprocal_length_c',
'_cell_volume',
'_cell_measurement_refln_index_h',
'_cell_measurement_refln_index_k',
'_cell_measurement_refln_index_l',
'_cell_measurement_refln_theta',
'_chemical_enantioexcess_bulk',
'_chemical_enantioexcess_crystal',
'_chemical_melting_point',
'_chemical_melting_point_gt',
'_chemical_melting_point_lt',
'_chemical_temperature_decomposition',
'_chemical_temperature_decomposition_gt',
'_chemical_temperature_decomposition_lt',
'_chemical_temperature_sublimation',
'_chemical_temperature_sublimation_gt',
'_chemical_temperature_sublimation_lt',
'_chemical_conn_atom_charge',
'_chemical_conn_atom_display_x',
'_chemical_conn_atom_display_y',
'_chemical_conn_atom_NCA',
'_chemical_conn_atom_NH',
'_chemical_conn_atom_number',
'_chemical_conn_bond_atom_1',
'_chemical_conn_bond_atom_2',
'_chemical_formula_weight',
'_chemical_formula_weight_meas',
'_citation_database_id_Medline',
'_citation_year',
'_diffrn_ambient_pressure',
'_diffrn_ambient_pressure_gt',
'_diffrn_ambient_pressure_lt',
'_diffrn_ambient_temperature',
'_diffrn_ambient_temperature_gt',
'_diffrn_ambient_temperature_lt',
'_diffrn_measured_fraction_theta_full',
'_diffrn_measured_fraction_theta_max',
'_diffrn_attenuator_scale',
'_diffrn_detector_area_resol_mean',
'_diffrn_detector_dtime',
'_diffrn_radiation_detector_dtime',
'_diffrn_orient_matrix_UB_11',
'_diffrn_orient_matrix_UB_12',
'_diffrn_orient_matrix_UB_13',
'_diffrn_orient_matrix_UB_21',
'_diffrn_orient_matrix_UB_22',
'_diffrn_orient_matrix_UB_23',
'_diffrn_orient_matrix_UB_31',
'_diffrn_orient_matrix_UB_32',
'_diffrn_orient_matrix_UB_33',
'_diffrn_orient_refln_angle_chi',
'_diffrn_orient_refln_angle_kappa',
'_diffrn_orient_refln_angle_omega',
'_diffrn_orient_refln_angle_phi',
'_diffrn_orient_refln_angle_psi',
'_diffrn_orient_refln_angle_theta',
'_diffrn_orient_refln_index_h',
'_diffrn_orient_refln_index_k',
'_diffrn_orient_refln_index_l',
'_diffrn_radiation_filter_edge',
'_diffrn_radiation_inhomogeneity',
'_diffrn_radiation_polarisn_norm',
'_diffrn_radiation_polarisn_ratio',
'_diffrn_radiation_wavelength',
'_diffrn_radiation_wavelength_wt',
'_diffrn_refln_angle_chi',
'_diffrn_refln_angle_kappa',
'_diffrn_refln_angle_omega',
'_diffrn_refln_angle_phi',
'_diffrn_refln_angle_psi',
'_diffrn_refln_angle_theta',
'_diffrn_refln_counts_bg_1',
'_diffrn_refln_counts_bg_2',
'_diffrn_refln_counts_net',
'_diffrn_refln_counts_peak',
'_diffrn_refln_counts_total',
'_diffrn_refln_detect_slit_horiz',
'_diffrn_refln_detect_slit_vert',
'_diffrn_refln_elapsed_time',
'_diffrn_refln_index_h',
'_diffrn_refln_index_k',
'_diffrn_refln_index_l',
'_diffrn_refln_intensity_net',
'_diffrn_refln_intensity_sigma',
'_diffrn_refln_intensity_u',
'_diffrn_refln_scan_rate',
'_diffrn_refln_scan_time_backgd',
'_diffrn_refln_scan_width',
'_diffrn_refln_sint/lambda',
'_diffrn_refln_wavelength',
'_diffrn_reflns_av_R_equivalents',
'_diffrn_reflns_av_sigmaI/netI',
'_diffrn_reflns_av_unetI/netI',
'_diffrn_reflns_Laue_measured_fraction_full',
'_diffrn_reflns_Laue_measured_fraction_max',
'_diffrn_reflns_limit_h_max',
'_diffrn_reflns_limit_h_min',
'_diffrn_reflns_limit_k_max',
'_diffrn_reflns_limit_k_min',
'_diffrn_reflns_limit_l_max',
'_diffrn_reflns_limit_l_min',
'_diffrn_reflns_number',
'_diffrn_reflns_point_group_measured_fraction_full',
'_diffrn_reflns_point_group_measured_fraction_max',
'_diffrn_reflns_resolution_full',
'_diffrn_reflns_resolution_max',
'_diffrn_reflns_theta_full',
'_diffrn_reflns_theta_max',
'_diffrn_reflns_theta_min',
'_diffrn_reflns_transf_matrix_11',
'_diffrn_reflns_transf_matrix_12',
'_diffrn_reflns_transf_matrix_13',
'_diffrn_reflns_transf_matrix_21',
'_diffrn_reflns_transf_matrix_22',
'_diffrn_reflns_transf_matrix_23',
'_diffrn_reflns_transf_matrix_31',
'_diffrn_reflns_transf_matrix_32',
'_diffrn_reflns_transf_matrix_33',
'_diffrn_reflns_class_av_R_eq',
'_diffrn_reflns_class_av_sgI/I',
'_diffrn_reflns_class_av_uI/I',
'_diffrn_reflns_class_d_res_high',
'_diffrn_reflns_class_d_res_low',
'_diffrn_reflns_class_number',
'_diffrn_scale_group_I_net',
'_diffrn_source_current',
'_diffrn_source_power',
'_diffrn_source_take-off_angle',
'_diffrn_source_voltage',
'_diffrn_standard_refln_index_h',
'_diffrn_standard_refln_index_k',
'_diffrn_standard_refln_index_l',
'_diffrn_standards_decay_%',
'_diffrn_standards_interval_count',
'_diffrn_standards_interval_time',
'_diffrn_standards_number',
'_diffrn_standards_scale_sigma',
'_diffrn_standards_scale_u',
'_exptl_absorpt_coefficient_mu',
'_exptl_absorpt_correction_T_max',
'_exptl_absorpt_correction_T_min',
'_exptl_crystals_number',
'_exptl_transmission_factor_max',
'_exptl_transmission_factor_min',
'_exptl_crystal_density_diffrn',
'_exptl_crystal_density_meas',
'_exptl_crystal_density_meas_gt',
'_exptl_crystal_density_meas_lt',
'_exptl_crystal_density_meas_temp',
'_exptl_crystal_density_meas_temp_gt',
'_exptl_crystal_density_meas_temp_lt',
'_exptl_crystal_F_000',
'_exptl_crystal_size_length',
'_exptl_crystal_size_max',
'_exptl_crystal_size_mid',
'_exptl_crystal_size_min',
'_exptl_crystal_size_rad',
'_exptl_crystal_face_diffr_chi',
'_exptl_crystal_face_diffr_kappa',
'_exptl_crystal_face_diffr_phi',
'_exptl_crystal_face_diffr_psi',
'_exptl_crystal_face_index_h',
'_exptl_crystal_face_index_k',
'_exptl_crystal_face_index_l',
'_exptl_crystal_face_perp_dist',
'_geom_angle',
'_geom_bond_distance',
'_geom_bond_multiplicity',
'_geom_bond_valence',
'_geom_contact_distance',
'_geom_hbond_angle_DHA',
'_geom_hbond_distance_DH',
'_geom_hbond_distance_HA',
'_geom_hbond_distance_DA',
'_geom_torsion',
'_refine_diff_density_max',
'_refine_diff_density_min',
'_refine_diff_density_rms',
'_refine_ls_abs_structure_Flack',
'_refine_ls_abs_structure_Rogers',
'_refine_ls_d_res_high',
'_refine_ls_d_res_low',
'_refine_ls_extinction_coef',
'_refine_ls_F_calc_precision',
'_refine_ls_goodness_of_fit_all',
'_refine_ls_goodness_of_fit_gt',
'_refine_ls_goodness_of_fit_obs',
'_refine_ls_goodness_of_fit_ref',
'_refine_ls_number_constraints',
'_refine_ls_number_parameters',
'_refine_ls_number_reflns',
'_refine_ls_number_restraints',
'_refine_ls_R_factor_all',
'_refine_ls_R_factor_gt',
'_refine_ls_R_factor_obs',
'_refine_ls_R_Fsqd_factor',
'_refine_ls_R_I_factor',
'_refine_ls_restrained_S_all',
'_refine_ls_restrained_S_gt',
'_refine_ls_restrained_S_obs',
'_refine_ls_shift/esd_max',
'_refine_ls_shift/esd_mean',
'_refine_ls_shift/su_max',
'_refine_ls_shift/su_max_lt',
'_refine_ls_shift/su_mean',
'_refine_ls_shift/su_mean_lt',
'_refine_ls_wR_factor_all',
'_refine_ls_wR_factor_gt',
'_refine_ls_wR_factor_obs',
'_refine_ls_wR_factor_ref',
'_refine_ls_class_d_res_high',
'_refine_ls_class_d_res_low',
'_refine_ls_class_R_factor_all',
'_refine_ls_class_R_factor_gt',
'_refine_ls_class_R_Fsqd_factor',
'_refine_ls_class_R_I_factor',
'_refine_ls_class_wR_factor_all',
'_refln_A_calc',
'_refln_A_meas',
'_refln_B_calc',
'_refln_B_meas',
'_refln_d_spacing',
'_refln_F_calc',
'_refln_F_meas',
'_refln_F_sigma',
'_refln_F_squared_calc',
'_refln_F_squared_meas',
'_refln_F_squared_sigma',
'_refln_index_h',
'_refln_index_k',
'_refln_index_l',
'_refln_intensity_calc',
'_refln_intensity_meas',
'_refln_intensity_sigma',
'_refln_mean_path_length_tbar',
'_refln_phase_calc',
'_refln_phase_meas',
'_refln_sint/lambda',
'_refln_symmetry_epsilon',
'_refln_symmetry_multiplicity',
'_refln_wavelength',
'_reflns_d_resolution_high',
'_reflns_d_resolution_low',
'_reflns_Friedel_coverage',
'_reflns_Friedel_fraction_full',
'_reflns_Friedel_fraction_max',
'_reflns_limit_h_max',
'_reflns_limit_h_min',
'_reflns_limit_k_max',
'_reflns_limit_k_min',
'_reflns_limit_l_max',
'_reflns_limit_l_min',
'_reflns_number_gt',
'_reflns_number_observed',
'_reflns_number_total',
'_reflns_class_d_res_high',
'_reflns_class_d_res_low',
'_reflns_class_number_gt',
'_reflns_class_number_total',
'_reflns_class_R_factor_all',
'_reflns_class_R_factor_gt',
'_reflns_class_R_Fsqd_factor',
'_reflns_class_R_I_factor',
'_reflns_class_wR_factor_all',
'_reflns_scale_meas_F',
'_reflns_scale_meas_F_squared',
'_reflns_scale_meas_intensity',
'_reflns_shell_d_res_high',
'_reflns_shell_d_res_low',
'_reflns_shell_meanI_over_sigI_all',
'_reflns_shell_meanI_over_sigI_gt',
'_reflns_shell_meanI_over_sigI_obs',
'_reflns_shell_meanI_over_uI_all',
'_reflns_shell_meanI_over_uI_gt',
'_reflns_shell_number_measured_all',
'_reflns_shell_number_measured_gt',
'_reflns_shell_number_measured_obs',
'_reflns_shell_number_possible',
'_reflns_shell_number_unique_all',
'_reflns_shell_number_unique_gt',
'_reflns_shell_number_unique_obs',
'_reflns_shell_percent_possible_all',
'_reflns_shell_percent_possible_gt',
'_reflns_shell_percent_possible_obs',
'_reflns_shell_Rmerge_F_all',
'_reflns_shell_Rmerge_F_gt',
'_reflns_shell_Rmerge_F_obs',
'_reflns_shell_Rmerge_I_all',
'_reflns_shell_Rmerge_I_gt',
'_reflns_shell_Rmerge_I_obs',
'_space_group_IT_number',
'_space_group_symop_sg_id',
'_symmetry_Int_Tables_number',
'_symmetry_equiv_pos_site_id',
'_valence_param_atom_1_valence',
'_valence_param_atom_2_valence',
'_valence_param_B',
'_valence_param_Ro',
);

binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

my %tags_to_fix;

if( $tags_to_fix ) {
    %tags_to_fix = map { (lc($_), $_) } split m/,|\s+/, $tags_to_fix;
}

if( defined $dictionary_file ) {
    my ( $dict ) = parse_cif( $dictionary_file, { 'parser' => $use_parser } );
    for my $tag ( @{ get_numeric_data_names( $dict ) } ) {
        $tags_to_fix{lc $tag } = $tag
    }
}

if ( !defined $dictionary_file && !$tags_to_fix ) {
    %tags_to_fix = map { (lc($_), $_) } @cif_core_numeric;
}

if( $debug ) {
    use COD::Serialise qw( serialiseRef );
    serialiseRef( \%tags_to_fix );
}

my $return_status = 0;

my $parser_options = {
    'parser' => $use_parser,
    'no_print' => 1,
    'fix_errors' => $fix_errors

};

@ARGV = ( '-' ) unless @ARGV;

for my $filename (@ARGV) {
    my ( $data, $err_count, $messages ) = parse_cif( $filename, $parser_options );
    process_parser_messages( $messages, $die_on_error_level );
    next if $err_count > 0;

    if( !@{$data} ) {
        warn "$0: $filename: WARNING, file seems to be empty\n";
        next;
    }

    for my $dataset (@{$data}) {
        canonicalize_names( $dataset );
        # Fix numeric tags:
        for my $tag ( map { lc } @{$dataset->{tags}}) {
            next if !exists $tags_to_fix{$tag};
            for my $value (@{$dataset->{values}{$tag}}) {
                my $numeric_value = eval_cif_number( $value );

                if( !defined $numeric_value ) {
                    warn "$0: $filename: WARNING, error evaluating "
                       . "numeric value '$value'\n";
                    $return_status = 2 if $return_status < 2;
                } else {
                    $value = $numeric_value;
                    $return_status = 1 if $return_status < 1;
                }
            }
        }

        # Print out requested tags:
        print_cif( $dataset, {
            'preserve_loop_order' => 1,
            'keep_tag_order' => 1,
        });
    }
}

exit $return_status;

sub get_numeric_data_names
{
    my ( $dict ) = @_;

    my @numeric_tags;
    for my $data_item (@{$dict}) {
        next if !exists $data_item->{'values'}{'_type'};
        next if $data_item->{'values'}{'_type'}[0] ne 'numb';
        push @numeric_tags, @{$data_item->{'values'}{'_name'}};
    }

    return \@numeric_tags;
}

sub eval_cif_number
{
    my ( $value ) = @_;

    # NOTE: '[0-9]' and '\d' are not equivalent in Unicode context
    # TODO: regular expressions used for number recognition are used in
    # multiple scripts and should be refactored into a single module
    my $integer = '[0-9]+';
    my $number  = '[-+]?(?:[0-9]+(?:\.[0-9]*)?|\.(?:[0-9]+))(?:[eE][-+]?[0-9]+)?';

    return $value if $value eq '.' || $value eq '?';
    return $value if $value =~ /^$number(?:[(]$integer[)])?$/;

    my $numeric_value;
    if ( $value =~ /^($number)\/($number)$/ ) {
        $numeric_value = $1/$2
    }

    return $numeric_value;
}
