/* vim: set noet ts=4:
 *
 * Copyright (c) 2002-2007 Martin A. Godisch <martin@godisch.de>.
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
 * St, Fifth Floor, Boston, MA 02110-1301, USA.
 */
#include <data.h>
#include <dictd.h>
#include <latrine.h>
#include <memory.h>
#include <stdio.h>
#include <time.h>

/* Input paths; load_wordlist() asserts that both are set before using them. */
char
	/* dictionary path, handed to the dictionary opener in load_wordlist() and
	 * echoed into the header written by print_wordlist_intro() */
	*dictfile = NULL,
	/* hits/wordlist file, read and rewritten through zlib's gz* calls */
	*wordfile = NULL;
/* Tunables, initialised from compile-time defaults. */
size_t
	/* select_word() draws from the first randcount entries of the sorted
	 * list; it is clamped to wordcount there */
	randcount = DEFAULT_RANDCOUNT,
	/* number of slots load_wordlist() allocates; 0 makes it grow the list in
	 * WORDSTEP increments instead */
	wordlimit = DEFAULT_WORDLIMIT;
static size_t
	/* entries read from the dictionary by the last load_wordlist() */
	dictcount = 0,
	/* entries currently stored in wordlist[] */
	wordcount = 0;
/* In-memory word list, (re)allocated by load_wordlist(). */
static struct word
	*wordlist = NULL;
/* Rate bookkeeping used while load_wordlist() decides which entries to keep. */
static double
	/* updated for entries that were displaced or not admitted */
	smallest_out = 0.0,
	/* highest rate among the entries kept in wordlist[] */
	largest_in   = 0.0;

/* Number of entries currently held in the in-memory word list. */
size_t get_wordcount(void)
{
	return wordcount;
}
/* Number of dictionary entries counted by the last load_wordlist(). */
size_t get_dictcount(void)
{
	return dictcount;
}

static char *get_hash(const struct word *w)
{
	static char output[2 * MD5_DIGEST_SIZE + 1];
	char buf_un[MD5_DIGEST_SIZE + DIGEST_ALIGN];
	char buf_in[BUFSIZE];
	char *buf_out, *p;
	int i;
	struct Tlang *l;

	p = buf_un + DIGEST_ALIGN - 1;
	buf_out = p - (size_t)p % DIGEST_ALIGN;
	memset(buf_in, 0, sizeof(buf_in));
	memset(buf_un, 0, sizeof(buf_un));

	strncpy(buf_in, w->lang[0]->c, sizeof(buf_in) - 1);
	for (l = w->lang[0]->next; l != NULL; l = l->next) {
		strncat(buf_in + strlen(buf_in), "\1", sizeof(buf_in) - strlen(buf_in) - 1);
		strncat(buf_in + strlen(buf_in), l->c, sizeof(buf_in) - strlen(buf_in) - 1);
	}
	strncat(buf_in + strlen(buf_in), "\2", sizeof(buf_in) - strlen(buf_in) - 1);
	strncat(buf_in + strlen(buf_in), w->lang[1]->c, sizeof(buf_in) - strlen(buf_in) - 1);
	for (l = w->lang[1]->next; l != NULL; l = l->next) {
		strncat(buf_in + strlen(buf_in), "\1", sizeof(buf_in) - strlen(buf_in) - 1);
		strncat(buf_in + strlen(buf_in), l->c, sizeof(buf_in) - strlen(buf_in) - 1);
	}

	md5_buffer(buf_in, strlen(buf_in), buf_out);
	for (i = 0; i < MD5_DIGEST_SIZE; i++)
		sprintf(&output[2*i], "%02hhx", buf_out[i]);
	return output;
}

static int comp_by_access(const void *a, const void *b)
{
	time_t q1 = ((struct word*)a)->timestamp;
	time_t q2 = ((struct word*)b)->timestamp;
	if (q1 == q2)
		return 0;
	else if (q1 == 0)
		return +1;
	else if (q2 == 0)
		return -1;
	else if (q1 < q2)
		return -1;
	return +1;
}

static int comp_by_rate(const void *a, const void *b)
{
	double q1 = ((struct word*)a)->rate;
	double q2 = ((struct word*)b)->rate;
	if (q1 < q2)
		return -1;
	else if (q1 > q2)
		return +1;
	return comp_by_access(a, b);
}

static int comp_by_pos(const void *a, const void *b)
{
	double q1 = ((struct word*)a)->pos;
	double q2 = ((struct word*)b)->pos;
	if (q1 < q2)
		return -1;
	else if (q1 > q2)
		return +1;
	return 0;
}

/* Recompute w->rate from the answer history: every '+' adds one, every '-'
 * subtracts one, any other character is ignored.
 */
static inline void rate(struct word *w)
{
	short score = 0;
	int slot = 0;

	assert(w != NULL);
	while (slot < HISTSIZE) {
		if (w->history[slot] == '+')
			score++;
		else if (w->history[slot] == '-')
			score--;
		slot++;
	}
	w->rate = score;
}

/* Record the outcome of one answer for w.
 *
 * The history is shifted by one slot (the last slot is dropped), the new
 * result goes into slot 0 ('+' for a hit, '-' otherwise), the timestamp is
 * set to the current time and the rate is recomputed.
 */
void update_word(struct word *w, int hit)
{
	char mark = '-';

	assert(w != NULL);
	if (hit)
		mark = '+';
	memmove(&w->history[1], &w->history[0], HISTSIZE - 1);
	w->history[0] = mark;
	w->timestamp = time(NULL);
	rate(w);
}

void debug_print_wordlist(void)
{
	size_t i;

	if (debug == NULL)
		return;
	for (i = 0; i < wordcount; i++)
		fprintf(debug, "%5d %08lx %3d |%s| \"%s\"\n",
			wordlist[i].pos,
			wordlist[i].timestamp,
			wordlist[i].rate,
			wordlist[i].history,
			wordlist[i].lang[0]->c);
}

/* Pick the next word to ask.
 *
 * Four consecutive calls sort the list by rate (lowest first); the following
 * call sorts it by last access instead, then the cycle restarts.  The word is
 * drawn at random from the first randcount entries of the sorted list, and
 * the positions of the last three picks are avoided as far as randcount
 * allows.  randcount is clamped to wordcount as a side effect.
 *
 * Returns a pointer into the word list, or NULL when the list is empty
 * (builds with assertions enabled abort in that case, as before).
 */
struct word *select_word(void)
{
	static size_t last[3] = {(size_t)(-1), (size_t)(-1), (size_t)(-1)};
	static int cycle = 0;
	size_t next = 0;

	if (randcount > wordcount)
		randcount = wordcount;
	assert(randcount > 0);
	/* with NDEBUG an empty list would otherwise reach "% 0" and
	 * wordlist[wordcount-1] below */
	if (randcount == 0)
		return NULL;
	if (cycle < 4) {
		if (debug)
			fprintf(debug, "select_word: [%d] sorting by least-known\n", cycle);
		qsort(wordlist, wordcount, sizeof(struct word), comp_by_rate);
		largest_in = wordlist[wordcount-1].rate;
		cycle++;
	} else {
		if (debug)
			fprintf(debug, "select_word: sorting by last-access [%d]\n", cycle);
		qsort(wordlist, wordcount, sizeof(struct word), comp_by_access);
		cycle = 0;
	}
	debug_print_wordlist();
	do {
		next = (size_t)random() % randcount;
		/* size_t needs %zu; the "no previous pick" sentinel is shown as -1 */
		if (debug)
			fprintf(debug, "select_word: [r=%zu w=%zu] [l=%ld:%ld:%ld] [p=%zu n=%zu] selecting \"%s\"\n",
				randcount, wordcount,
				(long)last[0], (long)last[1], (long)last[2],
				(size_t)wordlist[next].pos, next, wordlist[next].lang[0]->c);
	} while ((wordlist[next].pos == last[0] && randcount > 1)
		|| (wordlist[next].pos == last[1] && randcount > 2)
		|| (wordlist[next].pos == last[2] && randcount > 3));
	last[2] = last[1];
	last[1] = last[0];
	last[0] = wordlist[next].pos;
	return &wordlist[next];
}

/* Free both language chains of w and reset the two list heads to NULL. */
inline void free_langlist(struct word *w)
{
	int side;

	for (side = 0; side < 2; side++) {
		struct Tlang *node = w->lang[side];

		while (node != NULL) {
			struct Tlang *following = node->next;

			free(node);
			node = following;
		}
	}
	w->lang[0] = NULL;
	w->lang[1] = NULL;
}

static inline void free_wordlist(void)
{
	size_t i;

	for (i = 0; i < wordcount; i++)
		free_langlist(&wordlist[i]);
	if (wordlist != NULL)
		free(wordlist);
	wordlist  = NULL;
	wordcount = 0;
}

/* Write the version header and the explanatory comment block to a wordlist
 * file opened for writing.
 *
 * F is taken by value: gzFile is already a handle type and is what
 * gzprintf() expects.  The previous "gzFile *" parameter did not match the
 * gzFile values the callers pass in.
 */
static inline void print_wordlist_intro(gzFile F)
{
	assert(F != NULL);
	gzprintf(F, HEADER, WORDLIST_VERSION);
	gzprintf(F, _("# Dictionary: %s\n"
		"# Do not change the first line or this file cannot be read anymore!\n"
		"# Do not rename this file or it cannot be found anymore!\n"
		"# Do not edit while LaTrine is running, your changes will be overridden!\n"
		"# Be careful not to destroy the position-dependent mapping with the dictionary!\n\n"),
		dictfile);
}

static gzFile open_wordfile(void)
{
	gzFile
		hits = NULL,
		tmp  = NULL;
	char
		buffer[BUFSIZE],
		hbuf[HISTSIZE+1],
		*tmpfile = NULL;
	long unsigned int
		history,
		timestamp;
	int
		version, i;

	if (debug)
		fprintf(debug, "open_wordfile: %s\n", wordfile);
	if ((hits = gzopen(wordfile, "rb")) == NULL) {
		if (errno != ENOENT)
			errmsg("gzopen: %s: %s", wordfile, errno == 0 ? zError(Z_MEM_ERROR) : strerror(errno));
		return(NULL);
	}
	if (gzgets(hits, buffer, BUFSIZE) == Z_NULL) {
		if (debug)
			fprintf(debug, "open_wordfile: empty wordlist file\n");
		gzclose(hits);
		return NULL;
	}
	if (sscanf(buffer, HEADER, &version) == 1) {
		if (debug)
			fprintf(debug, "open_wordfile: file version %d\n", version);
		switch(version) {
		case 2:
			if (debug)
				fprintf(debug, "open_wordfile: converting wordlist file version %d\n", version);
			gzrewind(hits);
			tmpfile = (char*)MALLOC(strlen(wordfile) + 5);
			sprintf(tmpfile, "%s.new", wordfile);
			if ((tmp = gzopen(tmpfile, "wb")) == NULL) {
				errmsg("gzopen: %s: %m", tmpfile);
				gzclose(hits);
				FREE(&tmpfile);
				return NULL;
			}
			print_wordlist_intro(tmp);
			while (gzgets(hits, buffer, sizeof(buffer)) != Z_NULL) {
				if (*buffer == '\n' || *buffer == '#' || sscanf(buffer, "%06lx:%08lx", &history, &timestamp) < 2)
					continue;
				if (history == 0) {
					memset(hbuf, ' ', HISTSIZE);
					hbuf[HISTSIZE] = 0;
				} else {
					for (i = 1; i <= 0x800000; i *= 2)
					if (history & i)
						strcat(hbuf, "+");
					else
						strcat(hbuf, "-");
					for (i = 24; i < HISTSIZE; i++)
						strcat(hbuf, " ");
				}
				gzprintf(tmp, "%08lx:%s:\n", timestamp, hbuf);
			}
			gzclose(hits);
			gzclose(tmp);
			if (rename(tmpfile, wordfile) < 0) {
				errmsg("rename: %s, %s: %m", tmpfile, wordfile);
				FREE(&tmpfile);
				return NULL;
			}
			FREE(&tmpfile);
			hits = gzopen(wordfile, "rb");
			assert(hits != NULL);
			return hits;
		case 3:
			gzrewind(hits);
			return hits;
		default:
			if (debug)
				fprintf(debug, "open_wordfile: ignoring wordlist because of incompatible version\n");
			gzclose(hits);
			return NULL;
		}
	} else if (debug)
		fprintf(debug, "open_wordfile: no version marker found: %s\n", wordfile);
	gzclose(hits);
	return NULL;
}

/* read a struct word from dictionary and wordlist
 *
 * The word text comes from read_dictd(); its position is a running counter.
 * If a hits file is given, the next data line (blank lines and '#' comments
 * are skipped) supplies the timestamp and the history.  A history containing
 * anything but ' ', '+' and '-' is replaced by a blank one; a line without
 * the ':' separator is reported and leaves the word without statistics.
 * When the hits file is missing or exhausted the word keeps a zero timestamp
 * and a blank history.
 *
 * The position and line counters are reset whenever the dictionary is
 * exhausted or fails, so that a later reload starts counting from the
 * beginning again.
 *
 * returns  1: success
 * returns  0: no more words available
 * returns -1: failure
 */
static int read_dict(gzFile hits, struct word *w)
{
	char buffer[BUFSIZE];
	char *c = NULL;
	static size_t
		n    = 0,
		line = 1;
	size_t i;
	int ret;

	memset(w, 0, sizeof(*w));

	if ((ret = read_dictd(w)) != 1) {
		n    = 0;
		line = 1;
		return ret;
	}

	if (debug)
		fprintf(debug, "read_dict: hash = %s\n", get_hash(w));

	w->pos = n++;
	w->history[HISTSIZE] = 0;
	memset(w->history, ' ', HISTSIZE);

	while (hits != NULL && gzgets(hits, buffer, BUFSIZE) != Z_NULL) {
		/* count every line that was read, including the one we return on */
		size_t lineno = line++;

		if (*buffer == '\n' || *buffer == '#')
			continue;
		w->timestamp = strtol(buffer, &c, 16);
		if (*c == ':') {
			c++;
			memcpy(w->history, c, HISTSIZE);
			for (i = 0; i < HISTSIZE; i++)
				if (c[i] != ' ' && c[i] != '+' && c[i] != '-') {
					memset(w->history, ' ', HISTSIZE);
					break;
				}
		} else {
			w->timestamp = 0;
			/* the translatable message uses %d, so pass an int */
			errmsg(_("ignoring invalid format in wordlist line %d"), (int)lineno);
		}
		return 1;
	}
	/* FIXME: warn about "hits" problems */

	return 1;
}

/* load the dictionary and the corresponding hits file,
 * an existing wordlist will be overridden
 *
 * returns  0: success
 * returns -1: failure
 */
int load_wordlist(void)
{
	int (*open_dict)(const char*) = open_dictd;
	int (*close_dict)(void)       = close_dictd;
	gzFile hits = NULL;
	struct word w;
	size_t cursize, i;
	time_t randinit;
	int    ret;

	assert(dictfile != NULL);
	assert(wordfile != NULL);
	if (wordlimit == 0) {
		if (wordlist == NULL)
			wordlist = (struct word*)MALLOC((cursize = WORDSTEP) * sizeof(struct word));
		else
			cursize  = wordcount;
	} else {
		if (wordlist == NULL)
			wordlist = (struct word*)MALLOC(wordlimit * sizeof(struct word));
		else {
			wordlist = (struct word*)REALLOC(wordlist, wordlimit * sizeof(struct word));
			memset(&wordlist[wordcount], 0, &wordlist[wordlimit] - &wordlist[wordcount]);
		}
		cursize = wordlimit;
	}
	if (open_dict(dictfile) == -1)
		return -1;
	hits = open_wordfile();
	for (dictcount = 0, wordcount = 0; (ret = read_dict(hits, &w)) == 1; dictcount++) {
		if (wordcount >= cursize && wordlimit == 0) {
			assert(wordcount == cursize);
			wordlist = (struct word*)REALLOC(wordlist, (cursize += WORDSTEP) * sizeof(struct word));
			memset(&wordlist[wordcount], 0, &wordlist[cursize] - &wordlist[wordcount]);
		}
		rate(&w);
		if (wordcount < cursize) {
			if (debug)
				fprintf(debug, "load_wordlist: [+] %5d %08lx %3d |%s| \"%s\"\n",
					w.pos, w.timestamp, w.rate, w.history, w.lang[0]->c);
			if (wordcount == 0 || w.rate > largest_in)
				largest_in = w.rate;
			free_langlist(&wordlist[wordcount]);
			wordlist[wordcount++] = w;
			continue;
		}
		/* wordcount >= cursize */
		if (w.rate < largest_in) {
			for (i = 0; i < cursize; i++)
				if (wordlist[i].rate == largest_in)
					break;
			assert(i < cursize);
			if (debug)
				fprintf(debug, "load_wordlist: [=%d] %5d %08lx %3d |%s| \"%s\"\n", i,
					w.pos, w.timestamp, w.rate, w.history, w.lang[0]->c);
			free_langlist(&wordlist[i]);
			wordlist[i]  = w;
			smallest_out = largest_in;
			largest_in   = w.rate;
			for (i = 0; i < cursize; i++)
				if (wordlist[i].rate > largest_in)
					largest_in = wordlist[i].rate;
			continue;
		}
		if (debug)
			fprintf(debug, "load_wordlist: [-] %5d %08lx %3d |%s| \"%s\"\n",
				w.pos, w.timestamp, w.rate, w.history, w.lang[0]->c);
		if (wordcount == cursize || w.rate < smallest_out)
			smallest_out = w.rate;
		free_langlist(&w);
	}
	free_langlist(&w);
	close_dict();
	if (hits != NULL)
		gzclose(hits);
	if (ret == -1 || wordcount == 0)
		free_wordlist();
	else {
		if (wordcount < cursize)
			wordlist = (struct word*)REALLOC(wordlist, wordcount * sizeof(struct word));
		time(&randinit);
		srandom(randinit);
		qsort(wordlist, wordcount, sizeof(struct word), comp_by_rate);
	}
	if (ret == 0 && wordcount == 0) {
		errmsg(_("invalid or empty dictionary"));
		ret = -1;
	}
	return ret;
}

/* save the dictionary and the corresponding hits file
 *
 * Writes a new hits file next to the old one ("<wordfile>.new") and renames
 * it over the old file.  There is one data line per dictionary position:
 * positions held in the in-memory list get their current statistics, other
 * positions keep the line from the old file (re-validated), and positions
 * beyond the end of the old file get a zero timestamp and a blank history.
 * The in-memory list is left sorted by position.
 *
 * returns  0: success
 * returns -1: failure
 */
int save_wordlist(void)
{
	char buffer[BUFSIZE];
	char *tempfile   = NULL;
	gzFile old       = NULL;
	gzFile new       = NULL;
	unsigned long
		timestamp = 0;
	size_t i, n, j;
	int ret;
	char *s, history[HISTSIZE+1];

	if (wordcount == 0)
		return 0;
	tempfile = (char*)MALLOC(strlen(wordfile) + 5);
	sprintf(tempfile, "%s.new", wordfile);
	old = open_wordfile();
	/* gzopen() leaves errno untouched when it runs out of memory */
	errno = 0;
	if ((new = gzopen(tempfile, "wb")) == NULL) {
		errmsg("gzopen: %s: %s", tempfile, errno == 0 ? zError(Z_MEM_ERROR) : strerror(errno));
		if (old != NULL)
			gzclose(old);
		FREE(&tempfile);
		return -1;
	}
	if (debug)
		fprintf(debug, "save_wordlist: writing to %s\n", tempfile);
	print_wordlist_intro(new);
	qsort(wordlist, wordcount, sizeof(struct word), comp_by_pos);
	/* old is NULL when there is no usable previous file */
	for (i = 0, n = 0; old != NULL && gzgets(old, buffer, BUFSIZE) != Z_NULL;) {
		if (*buffer == '\n' || *buffer == '#')
			continue;
		if (i < wordcount && wordlist[i].pos == n) {
			gzprintf(new, "%08lx:%s:\n", (unsigned long)wordlist[i].timestamp, wordlist[i].history);
			i++;
		} else {
			timestamp = strtol(buffer, &s, 16);
			history[HISTSIZE] = 0;
			if (*s == ':') {
				s++;
				memcpy(history, s, HISTSIZE);
				for (j = 0; j < HISTSIZE; j++)
					if (s[j] != ' ' && s[j] != '+' && s[j] != '-') {
						memset(history, ' ', HISTSIZE);
						break;
					}
			} else {
				timestamp = 0;
				memset(history, ' ', HISTSIZE);
			}
			gzprintf(new, "%08lx:%s:\n", timestamp, history);
		}
		n++;
	}
	if (old != NULL)
		gzclose(old);
	memset(buffer, ' ', HISTSIZE);
	buffer[HISTSIZE] = 0;
	for (; n < dictcount; n++)
		if (i < wordcount && wordlist[i].pos == n) {
			gzprintf(new, "%08lx:%s:\n", (unsigned long)wordlist[i].timestamp, wordlist[i].history);
			i++;
		} else
			gzprintf(new, "%08lx:%s:\n", 0UL, buffer);
	/* the handle is gone once gzclose() returns, so the message has to be
	 * built from the return code alone */
	if ((ret = gzclose(new)) != Z_OK) {
		errmsg("gzclose: %s: %s", tempfile, ret == Z_ERRNO ? strerror(errno) : zError(ret));
		FREE(&tempfile);
		return -1;
	}
	if (rename(tempfile, wordfile) != 0) {
		errmsg("rename: %s, %s: %m", tempfile, wordfile);
		FREE(&tempfile);
		return -1;
	}
	FREE(&tempfile);
	return 0;
}
