#! /bin/sh
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2022-05-12 17:31:05 +0300 (Thu, 12 May 2022) $
#$Revision: 9312 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.6.0/scripts/sdf_add_cod_data $
#------------------------------------------------------------------------------
#*
#* Append COD-specific meta-information to an SDF file in a format
#* suitable for PubChem.
#*
#* USAGE:
#*   $0 --options --cod-cif cod-input.cif file1.sdf
#*   $0 --options --cod-cif cod-input.cif file1.sdf > output.sdf
#*   $0 --options --cod-cif cod-input.cif < file1.sdf > output.sdf
#**

set -ue

# Print the single argument wrapped in single quotes so that it passes
# through one round of 'eval' unchanged. Every embedded single quote is
# rewritten as '\'' (close the quote, add an escaped quote, reopen the
# quote); without this a file name containing a single quote would be
# split, mangled or even executed as shell code by the 'eval' below.
# Scratch variables carry a 'qfe_' prefix because POSIX sh has no
# function-local variables.
quote_for_eval() {
    qfe_rest=$1
    qfe_done=""
    while :
    do
        case ${qfe_rest} in
            *\'*)
                # Move everything up to the first quote into the result,
                # followed by the escaped form of that quote.
                qfe_head=${qfe_rest%%\'*}
                qfe_rest=${qfe_rest#*\'}
                qfe_done="${qfe_done}${qfe_head}'\\''"
                ;;
            *)
                break
                ;;
        esac
    done
    printf "'%s'" "${qfe_done}${qfe_rest}"
}

# FILES accumulates the non-option arguments as a string of shell-quoted
# words; COD_CIF holds the argument of the '--cod-cif' option.
FILES=""
COD_CIF=""

#* OPTIONS:
#*   -C, --cod-cif 1000000.cif
#*                     Provide the original COD CIF to extract structure metadata.
#*
#*   --help, --usage
#*                     Output a short help message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**
while [ $# -gt 0 ]
do
  case $1 in
      -C|--cod-cif|--cod-ci|--cod-c|--cod|--co|--c)
          # Under 'set -u' a missing option argument would otherwise abort
          # the script with an obscure "unbound variable" message.
          if [ $# -lt 2 ]
          then
              echo "$(basename "$0"):: ERROR, option '$1' requires an argument." >&2
              exit 1
          fi
          COD_CIF="$2"
          shift
          ;;
      --options|--option|--optio|--opti|--opt|--op|--o)
          echo "$(basename "$0"):: The '--options' option is a placeholder."
          echo "$(basename "$0"):: It should be replaced by one of the following options:"
          # Print the option description embedded in this very file, with
          # the comment markers stripped and the script name filled in.
          awk '/#\* OPTIONS:/,/#\*\*/ {
                  sub("OPTIONS:", "");
                  sub("^ *#[*]?[*]?", "");
                  gsub("\\$0","'"$0"'");
                  print $0
              }' "$0"
          exit
          ;;
      --help|--hel|--he|--h|--usage)
          # Print every help comment range embedded in this file, with the
          # comment markers stripped and the script name filled in.
          awk '/#\*/,/#\*\*/ {
                  sub("^ *#[*]?[*]?", "");
                  gsub("\\$0","'"$0"'");
                  print $0
              }' "$0"
          exit
          ;;
      --version)
          # The version reporter is looked up next to this script.
          "$(dirname "$0")"/cod-tools-version
          exit
          ;;
      -*) echo "$(basename "$0"):: ERROR, unknown option '$1'." >&2 ; exit 1 ;;
      *)  FILES="$FILES $(quote_for_eval "$1")" ;;
    esac
    shift
done

# Replace the positional parameters with the collected file names; each
# word in FILES was quoted by quote_for_eval(), so names with spaces or
# quotes survive intact.
eval set -- "${FILES}"

# Shared 'cif_values' command line; the data item names to look up and the
# CIF file name are appended to it by cod_query().
cif_values='cif_values --no-header --no-dataname --no-filename --dont-replace-spaces --tags'

# Run the shared 'cif_values' command line with the given arguments
# appended. The expansion is left unquoted on purpose so that the string
# is split into the command name and its options.
cod_query() {
    ${cif_values} "$@"
}

# Filter: strip leading and trailing whitespace (line terminator included)
# from every input line; a line that ends up empty is replaced by '?'.
cod_trim_or_unknown() {
    perl -pe 's/^\s+|\s+$//g; s/^\s*$/?/;'
}

# Print a warning that names the script, the CIF file and the data block
# on standard error; the arguments form the text after "WARNING, ".
cod_warn() {
    echo "$0: ${COD_CIF} ${DATA_BLOCK_CODE}: WARNING, $*" 1>&2
}

# Print the raw systematic and common chemical names, in that order.
cod_synonym_lines() {
    cod_query _chemical_name_systematic "${COD_CIF}"
    cod_query _chemical_name_common "${COD_CIF}"
}

# Print the pieces of the bibliographic reference, one per line: authors,
# "(year)", title, journal, volume, issue, page range and "DOI:...".
# Year, pages and DOI are skipped when their value is '?'.
cod_citation_lines() {
    cod_query _publ_author_name --value-separator "; " "${COD_CIF}" \
        | perl -pe 's/^\s+//g; s/^\s*(;\s*)*$//'

    year="$(cod_query _journal_year "${COD_CIF}" | cod_trim_or_unknown)"
    [ "${year}" = "?" ] || printf "(%s)\n" "${year}"

    for tag in _publ_section_title _journal_name_full \
               _journal_volume _journal_issue
    do
        cod_query "${tag}" "${COD_CIF}"
    done

    first_page="$(cod_query _journal_page_first "${COD_CIF}" \
                  | cod_trim_or_unknown)"
    if [ "${first_page}" != "?" ]
    then
        last_page="$(cod_query _journal_page_last "${COD_CIF}" \
                     | cod_trim_or_unknown)"
        case ${last_page} in
            "?") printf "%s\n" "${first_page}" ;;
            *)   printf "%s\n" "${first_page}-${last_page}" ;;
        esac
    fi

    paper_doi="$(cod_query _journal_paper_doi "${COD_CIF}" \
                 | cod_trim_or_unknown)"
    [ "${paper_doi}" = "?" ] || printf "DOI:%s\n" "${paper_doi}"
}

# Print all PubChem data fields for the structure followed by the '$$$$'
# record terminator. Meant to be run as a pipeline stage, so the variables
# set here stay confined to that stage's subshell.
cod_metadata_fields() {
    DATABASE_ID=$(cod_query _cod_database_code "${COD_CIF}")
    if [ "${DATABASE_ID}" = "?" ]
    then
        # No COD number in the file: fall back to the source block name.
        cod_warn "data item '_cod_database_code' was not found --" \
                 "database identifier will be determined from the" \
                 "'_cod_data_source_block' data item."
        DATABASE_ID=$(cod_query _cod_data_source_block "${COD_CIF}")
        if [ "${DATABASE_ID}" = "?" ]
        then
            cod_warn "data item '_cod_data_source_block' was not found --" \
                     "database identifier will set to '?'."
        fi
    fi
    DATABASE_ID=$(echo "${DATABASE_ID}" | perl -pe 's/^\s+|\s+$//g')

    printf '%s\n' '> <PUBCHEM_EXT_DATASOURCE_REGID>'
    echo "${DATABASE_ID}"

    printf '%s\n' '' '> <PUBCHEM_SUBSTANCE_SYNONYM>'
    # Drop names whose value starts with '?', squeeze blanks, trim every
    # line and keep only the first occurrence of each name. The '|| true'
    # keeps 'set -e' from aborting when grep selects no line at all.
    cod_synonym_lines \
        | ( grep -E -v '^\s*\?' || true ) \
        | perl -0777 -pe 's/[ \t]+/ /g; s/^\s*|\s*$//msg;' \
        | awk '!a[$0]++'

    printf '%s\n' '' '> <PUBCHEM_SUBSTANCE_COMMENT>'
    # Drop empty and '?' lines, then join what is left into a single
    # comma-separated line and tidy the spacing around ',' and ';'.
    cod_citation_lines \
        | ( grep -E -v -e '^\s*[?]?\s*$' || true ) \
        | perl -0777 -p \
            -e 's/\n(.)/, $1/g;' \
            -e 's/[ \t]+/ /g;' \
            -e 's/\s+([,;])/$1/g;'

    printf '%s\n' '' \
        '> <PUBCHEM_EXT_DATASOURCE_URL>' \
        'https://www.crystallography.net/cod/' \
        '' \
        '> <PUBCHEM_EXT_SUBSTANCE_URL>'
    echo "https://www.crystallography.net/cod/${DATABASE_ID}.html"
    printf '%s\n' '' '$$$$'
}

# Copy the SDF input (named files, or standard input when there are none)
# without its '$$$$' record terminator lines; the terminator is written
# again after the data fields appended below.
grep -v '^\$\$\$\$' ${1+"$@"}

# Name of the first data block of the CIF; used in warning messages only.
DATA_BLOCK_CODE="data_$(cif_values --no-header --dataname "${COD_CIF}" | head -n 1)"

# Post-processing of the generated fields:
#   1. remove field headers that are immediately followed by a blank
#      line, i.e. fields that ended up without a value;
#   2. run 'cif-to-utf8' (presumably turns CIF character markup into
#      UTF-8 -- confirm against that tool's documentation);
#   3. remove diacritics from Unicode characters by decomposing them
#      (NFD) and deleting the combining marks
#      (from http://ahinea.com/en/tech/accented-translate.html, 2011.12.10).
cod_metadata_fields \
    | perl -0777 -pe 's/^> <.*?>\n\s*\n//mg' \
    | cif-to-utf8 \
    | perl -CS -MUnicode::Normalize -pe '$_ = NFD($_); s/\pM//g;'
