#! /bin/sh
#------------------------------------------------------------------------------
#$Author: antanas $
#$Date: 2021-04-28 19:35:53 +0300 (Wed, 28 Apr 2021) $
#$Revision: 8738 $
#$URL: svn+ssh://www.crystallography.net/home/coder/svn-repositories/cod-tools/tags/v3.2.0/scripts/sdf_add_cod_data $
#------------------------------------------------------------------------------
#*
#* Append COD-specific meta-information to an SDF file in a format
#* suitable for PubChem.
#*
#* USAGE:
#*   $0 --options --cod-cif cod-input.cif file1.sdf
#*   $0 --options --cod-cif cod-input.cif file1.sdf > output.sdf
#*   $0 --options --cod-cif cod-input.cif < file1.sdf > output.sdf
#**

set -ue

COD_CIF=""

#* OPTIONS:
#*   -C, --cod-cif 1000000.cif
#*                     Provide the original COD CIF to extract structure metadata.
#*
#*   --help, --usage
#*                     Output a short help message (this message) and exit.
#*   --version
#*                     Output version information and exit.
#**

# Parse the command line.
#
# On exit from the loop COD_CIF holds the argument of -C/--cod-cif (or is
# empty when the option was not given) and the positional parameters hold
# the non-option arguments (the SDF file names), in their original order.
#
# The positional parameters are rotated in place: every argument is taken
# from the front, and non-option arguments are appended to the back again.
# ARGS_LEFT counts the original arguments that have not been examined yet,
# so the loop stops before it reaches the re-appended file names. This
# keeps each file name as a single, unmodified word without any 'eval', so
# names containing spaces or quote characters are passed through intact.
ARGS_LEFT=$#
while [ "${ARGS_LEFT}" -gt 0 ]
do
    ARG=$1
    shift
    ARGS_LEFT=$((ARGS_LEFT - 1))
    case ${ARG} in
        -C|--cod-cif|--cod-ci|--cod-c|--cod|--co|--c)
            # The option consumes the next argument; report a proper
            # error when there is none instead of tripping over 'set -u'.
            if [ "${ARGS_LEFT}" -eq 0 ]
            then
                echo "$(basename "$0"):: ERROR, option '${ARG}' requires an argument." >&2
                exit 1
            fi
            COD_CIF=$1
            shift
            ARGS_LEFT=$((ARGS_LEFT - 1))
            ;;
        --options|--option|--optio|--opti|--opt|--op|--o)
            # '--options' is the placeholder used in the usage text; list
            # the real options by printing the option description comment
            # block of this very script.
            echo "$(basename "$0"):: The '--options' option is a placeholder."
            echo "$(basename "$0"):: It should be replaced by one of the following options:"
            awk '/#\* OPTIONS:/,/#\*\*/ {
                  sub("OPTIONS:", "");
                  sub("^ *#[*]?[*]?", "");
                  gsub("\\$0","'"$0"'");
                  print $0
              }' "$0"
            exit
            ;;
        --help|--hel|--he|--h|--usage)
            # The help text is the set of specially marked comment blocks
            # of this script, with the comment markers stripped and the
            # script name substituted for the "$0" placeholder.
            awk '/#\*/,/#\*\*/ {
                  sub("^ *#[*]?[*]?", "");
                  gsub("\\$0","'"$0"'");
                  print $0
              }' "$0"
            exit
            ;;
        --version)
            "$(dirname "$0")"/cod-tools-version
            exit
            ;;
        -*) echo "$(basename "$0"):: ERROR, unknown option '${ARG}'." >&2 ; exit 1 ;;
        *)  set -- ${1+"$@"} "${ARG}" ;;
    esac
done

# Command line prefix used for every metadata lookup below. It is stored
# as a single string and expanded unquoted, so it must stay free of words
# that need quoting.
cif_values='cif_values --no-header --no-dataname --no-filename --dont-replace-spaces --tags'

# Copy the SDF input (the named files, or standard input when no file was
# given) to standard output without the '$$$$' record terminator lines.
#
# grep exits with status 1 when it selects no lines at all (empty input,
# or input consisting only of terminator lines). Under 'set -e' that
# would silently abort the script, so status 1 is tolerated here; a
# status above 1 is a real grep error (e.g. an unreadable file) and
# still terminates the script with grep's own exit status.
SDF_COPY_STATUS=0
grep -v '^\$\$\$\$' ${1+"$@"} || SDF_COPY_STATUS=$?
if [ "${SDF_COPY_STATUS}" -gt 1 ]
then
    exit "${SDF_COPY_STATUS}"
fi

# Look up one tag in the COD CIF. "$@" is the tag name, optionally
# followed by extra cif_values options; the CIF file name is appended
# last. ${cif_values} is expanded unquoted on purpose so that it splits
# into the command name and its options.
cod_value() {
    ${cif_values} "$@" "${COD_CIF}"
}

# Write the PubChem data items followed by the '$$$$' record terminator.
# The data items are generated in a subshell and then post-processed as
# a whole by the filters attached at the end of the subshell.
# NOTE(review): a value of "?" is presumably what cif_values prints for a
# tag with no known value -- confirm against cif_values.
(
    # Registry ID: the COD database code, or the source data block name
    # when the database code comes back as "?".
    DATABASE_ID=$(cod_value _cod_database_code)
    if [ "${DATABASE_ID}" = "?" ]
    then
        DATABASE_ID=$(cod_value _cod_data_source_block)
    fi

    printf '%s\n' '> <PUBCHEM_EXT_DATASOURCE_REGID>'
    echo "${DATABASE_ID}"

    # Synonyms: systematic and common names. Lines starting with "?" are
    # dropped, runs of blanks are squeezed to one space and whitespace at
    # the very start and end of the whole list is trimmed.
    printf '%s\n' '' '> <PUBCHEM_SUBSTANCE_SYNONYM>'
    (
        cod_value _chemical_name_systematic
        cod_value _chemical_name_common
    ) |
        ( grep -E -v '^\s*\?' || true ) |
        perl -0777 -pe 's/[ \t]+/ /g; s/^\s*|\s*$//g'

    # Comment: a bibliographic reference, produced one item per line.
    # The filters tidy up "?" placeholders (for example in the page
    # range and the year), drop lines that start with "?" as well as a
    # DOI line whose value is "?", and finally join the remaining lines
    # with ", ".
    printf '%s\n' '' '' '> <PUBCHEM_SUBSTANCE_COMMENT>'
    (
        cod_value _publ_author_name --value-separator "; "
        echo "($(cod_value _journal_year))"
        cod_value _publ_section_title
        cod_value _journal_name_full
        cod_value _journal_volume
        cod_value _journal_issue
        printf "%s-%s\n" \
            "$(cod_value _journal_page_first)" \
            "$(cod_value _journal_page_last)"
        echo DOI:"$(cod_value _journal_paper_doi)"
    ) |
        perl -pe 's/-\?//; s/\(\?\)/?/; s/\?-/?/; s/\?+/?/g' |
        ( grep -Ev '^\?|DOI:\?' || true ) |
        perl -0777 -pe 's/\n(.)/, $1/g; s/[ \t]+/ /g'

    # Fixed data source URL and the per-entry URL built from the ID.
    printf '%s\n' \
        '' \
        '> <PUBCHEM_EXT_DATASOURCE_URL>' \
        'https://www.crystallography.net/' \
        '' \
        '> <PUBCHEM_EXT_SUBSTANCE_URL>'
    echo "https://www.crystallography.net/cod/${DATABASE_ID}.html"

    printf '%s\n' '' '$$$$'
) |
    perl -0777 -pe 's/^> <.*?>\n\s*\n//mg' |
    cif-to-utf8 |
    perl -CS -MUnicode::Normalize -pe \
    "# from http://ahinea.com/en/tech/accented-translate.html:
     # 2011.12.10
     \$_ = NFD(\$_); s/\\pM//g;"
# The three filters above, in order: remove every data item header that
# is followed only by blank lines (an item without a value); run the
# text through cif-to-utf8; decompose the characters (NFD) and delete
# all mark characters, which strips diacritics from letters.
