#! /bin/sh

# Generate the Big5 character mapping (cjk-b5.h) from Unihan database
# information (kBigFive and kHKSCS entries), plus additional entries
# from the obsolete unicode.org BIG5.TXT mapping; in ambiguous cases,
# the Unihan entry takes precedence.

# Make sure both input data files are available by invoking make on each
# of them in turn; stop with exit status 1 as soon as one target fails.
for data_file in Unihan.txt BIG5.TXT
do
	if ! make "$data_file"
	then
		# Diagnostics go to stderr so they stay visible when stdout
		# is redirected or piped.
		echo "Could not acquire Unicode data file $data_file" >&2
		exit 1
	fi
done

# Step 1: from Unihan.txt keep only the kBigFive and kHKSCS records and
# rewrite each "U+<code> <tag> <value>" line as "{0x<value>, 0x<code>},"
# (tab-indented).  -n together with the p flag prints exactly the lines
# on which a substitution succeeded; everything else is dropped.
echo extracting mappings from Unihan data
sed -n \
	-e 's/^U+\([^	]*\)	kBigFive	\([^	]*\)$/	{0x\2, 0x\1},/p' \
	-e 's/^U+\([^	]*\)	kHKSCS	\([^	]*\)$/	{0x\2, 0x\1},/p' \
	Unihan.txt > cjk-b5.h1

# Step 2: from BIG5.TXT take the first two tab-separated 0x-prefixed
# columns, in file order, and emit them in the same "{0x.., 0x..}," form;
# lines that do not start with such a pair are dropped.
echo extracting further mappings from old BIG5 data
sed -n \
	-e 's/^0x\([^	]*\)	0x\([^	]*\).*$/	{0x\1, 0x\2},/p' \
	BIG5.TXT > cjk-b5.h2

# All sort/uniq calls below run with LC_ALL=C so that ordering and
# duplicate detection are byte-wise and do not depend on the caller's
# locale; this keeps the generated cjk-b5.h reproducible.

echo determining unique entries in further BIG5 data
# cjk-b5.h1 is listed twice on purpose: every Unihan-derived line then
# occurs at least twice, so "uniq -u" keeps only lines that occur exactly
# once in cjk-b5.h2 and not at all in cjk-b5.h1.
LC_ALL=C sort cjk-b5.h2 cjk-b5.h1 cjk-b5.h1 | LC_ALL=C uniq -u > cjk-b5.h3

echo determining ambiguous entries
# Reduce each entry to its first code plus the trailing comma ("XXXX,")
# and collect the codes that occur more than once across both lists.
sed	-e 's/	{0x\([^,]*\),.*/\1,/' cjk-b5.h1 cjk-b5.h3 |
LC_ALL=C sort | LC_ALL=C uniq -d > cjk-b5.h4

echo filtering out ambiguous entries
# Drop every additional entry whose code is in the ambiguous list, so the
# Unihan-derived entry in cjk-b5.h1 is the one that survives.
# grep -F replaces the obsolescent fgrep; the patterns are fixed strings.
grep -F -v -f cjk-b5.h4 cjk-b5.h3 > cjk-b5.h5

echo merging mappings
LC_ALL=C sort cjk-b5.h1 cjk-b5.h5 > cjk-b5.h

# Delete the five intermediate work files; the final cjk-b5.h is not
# touched.  -f keeps rm quiet about files that do not exist.
echo removing auxiliary files
rm -f cjk-b5.h1 cjk-b5.h2 cjk-b5.h3 cjk-b5.h4 cjk-b5.h5
