#! /bin/sh

(
cat <<\/eoc
#include <stdio.h>

/* Write the code point unichar to stdout as a UTF-8 byte sequence.
   Values below 0x80000000 are encoded in 1 to 6 bytes (the original
   31-bit UTF-8 scheme); larger values produce no output.

   The definition is prototyped so that callers passing plain int
   constants get their argument converted to unsigned long, and the
   bytes are written with putchar because passing an unsigned long
   to a printf %c conversion does not match the expected int.
 */
void
printutf8 (unsigned long unichar)
{
	/* this is literal output, not C string preparation;
	   so in contrast to the mk* scripts, do not apply \ escaping
	 */

	if (unichar < 0x80) {
		putchar ((int) unichar);
	} else if (unichar < 0x800) {
		putchar ((int) (0xC0 | (unichar >> 6)));
		putchar ((int) (0x80 | (unichar & 0x3F)));
	} else if (unichar < 0x10000) {
		putchar ((int) (0xE0 | (unichar >> 12)));
		putchar ((int) (0x80 | ((unichar >> 6) & 0x3F)));
		putchar ((int) (0x80 | (unichar & 0x3F)));
	} else if (unichar < 0x200000) {
		putchar ((int) (0xF0 | (unichar >> 18)));
		putchar ((int) (0x80 | ((unichar >> 12) & 0x3F)));
		putchar ((int) (0x80 | ((unichar >> 6) & 0x3F)));
		putchar ((int) (0x80 | (unichar & 0x3F)));
	} else if (unichar < 0x4000000) {
		putchar ((int) (0xF8 | (unichar >> 24)));
		putchar ((int) (0x80 | ((unichar >> 18) & 0x3F)));
		putchar ((int) (0x80 | ((unichar >> 12) & 0x3F)));
		putchar ((int) (0x80 | ((unichar >> 6) & 0x3F)));
		putchar ((int) (0x80 | (unichar & 0x3F)));
	} else if (unichar < 0x80000000) {
		putchar ((int) (0xFC | (unichar >> 30)));
		putchar ((int) (0x80 | ((unichar >> 24) & 0x3F)));
		putchar ((int) (0x80 | ((unichar >> 18) & 0x3F)));
		putchar ((int) (0x80 | ((unichar >> 12) & 0x3F)));
		putchar ((int) (0x80 | ((unichar >> 6) & 0x3F)));
		putchar ((int) (0x80 | (unichar & 0x3F)));
	}
}

/* Return the priority class number of the code point unichar:
   a small number for each of the CJK blocks listed in the table
   below, 99 for any code point outside of them.

   The definition is prototyped so that callers passing plain int
   constants get their argument converted to unsigned long.
 */
int
prio (unsigned long unichar)
{
	/* The blocks do not overlap, so the order of the entries
	   has no influence on the result.
	 */
	static const struct {
		unsigned long first;
		unsigned long last;
		int rank;
	} blocks [] = {
		{0x4E00, 0x9FFF, 1},	/* CJK Unified Ideographs */
		{0x3400, 0x4DBF, 2},	/* CJK Unified Ideographs Extension A */
		{0x20000, 0x2A6DF, 3},	/* CJK Unified Ideographs Extension B */
		{0x2E80, 0x2EFF, 11},	/* CJK Radicals Supplement */
		{0xF900, 0xFAFF, 21},	/* CJK Compatibility Ideographs */
		{0x2F800, 0x2FA1F, 22},	/* CJK Compatibility Ideographs Supplement */
		{0x3300, 0x33FF, 23},	/* CJK Compatibility */
		{0xFE30, 0xFE4F, 31},	/* CJK Compatibility Forms */
		{0x3000, 0x303F, 41},	/* CJK Symbols and Punctuation */
		{0x3200, 0x32FF, 42}	/* Enclosed CJK Letters and Months */
	};
	unsigned int i;

	for (i = 0; i < sizeof blocks / sizeof blocks [0]; i ++) {
		if (unichar >= blocks [i].first && unichar <= blocks [i].last) {
			return blocks [i].rank;
		}
	}

	/* not in any of the listed blocks */
	return 99;
}

int
main () {
/eoc

case "$1" in
-d)	shift
	cat $* |
	sed	-e 's/\(["%\\]\)/\\\1/g' -e 's/^/	printf ("/' -e 's/$/\\n");/' \
		-e 's/U+\([0-9A-Fa-f][0-9A-Fa-f]*\)/"); printf ("%02d %ld ", prio (0x\1), 0x\1); printutf8 (0x\1); printf (" U+\1/'
	;;
-p)	shift
	cat $* |
	sed	-e 's/\(["%\\]\)/\\\1/g' -e 's/^/	printf ("/' -e 's/$/\\n");/' \
		-e 's/U+\([0-9A-Fa-f][0-9A-Fa-f]*\)/"); printf ("%02d ", prio (0x\1)); printf ("U+\1/'
	;;
*)	case "$1" in
	-u)	shift
		sed -e "s,^\([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]*[^0-9A-Za-z]\),U+\1," $*;;
	-U)	shift
		sed -e "s,\([0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f]*[^0-9A-Za-z]\),U+\1,g" $*;;
	*)	cat $*;;
	esac |
	sed	-e 's/\(["%\\]\)/\\\1/g' -e 's/^/	printf ("/' -e 's/$/\\n");/' \
		-e 's/U+\([0-9A-Fa-f][0-9A-Fa-f]*\)/"); printutf8 (0x\1); printf (" U+\1/g'
	;;
esac

cat <</eoc
}
/eoc
) > insutf8.c

# Compile the generated program and run it; its output is the result
# of this script. The temporary files are removed in any case.
# A failed compilation is reported through the exit status; it would
# otherwise be masked by the status of the final rm. The exit status
# of the generated program itself is deliberately not propagated.
if cc -o insutf8.exe insutf8.c
then	./insutf8.exe
	status=0
else	status=1
fi
rm -f insutf8.c insutf8.exe
exit $status

