///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// http://www.cooldevtools.com
// $Id: spamprobe.cc 115 2005-03-29 17:21:48Z brian $
//
// Copyright (C) 2000 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//    http://www.cooldevtools.com/qpl.html
//

#include <locale.h>
#include <signal.h>
#include <unistd.h>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include "CleanupManager.h"
#include "InterruptedException.h"
#include "RegularExpression.h"
#include "File.h"
#include "LockFile.h"
#include "SpamFilter.h"
#include "Configuration.h"
#include "WordData.h"
#include "Tokenizer.h"
#include "RegularExpression.h"
#include "SimpleTokenSelector.h"
#include "BNRTokenSelector.h"
#include "MD5Digester.h"
#include "AbstractMultiLineString.h"
#include "TraditionalMailMessageParser.h"
#include "MessageHeader.h"
#include "MessageHeaderList.h"
#include "StreamMailMessageReader.h"
#include "AutoTrainMailMessageReader.h"
#include "IstreamCharReader.h"
#include "LineReader.h"
#include "MailMessageList.h"
#include "MailMessageDigester.h"

// When true, classify_message() prints its retry trace even if is_debug is off.
static const bool BTB_DEBUG = false;

// train_on_message() always trains while the message count for the
// message's own category is below this value.
static const int MIN_TRAINING_SET = 1500;

// NOTE(review): presumably the base name of the word database file; it is
// not referenced in this part of the file -- confirm against its users.
static const string DB_FILENAME("sp_words");

// Line terminator for regular output; unlike endl it does not flush.
static const string NEWL("\n");

// Score mode handed to SpamFilter::scoreMessageIfWrongMode() by train_on_message().
static const SpamFilter::ScoreMode_t TRAIN_SCORE_MODE = SpamFilter::SCORE_NORMAL;

// Header consulted by should_skip_message(); an empty name disables the check.
// Set by the honor-status-header / honor-xstatus-header extended options.
static string STATUS_FIELD_NAME("");

// NOTE(review): cache and purge limits below are not referenced in this part
// of the file; their exact meaning should be confirmed where they are used.
static int MAX_CACHE_TERMS = 15000;
static const int MAX_MAX_CACHE_TERMS = 100000;
static const int MAX_MESSAGES_PER_PURGE = 100000;
// Value passed as max_loops to classify_message() by the per-message commands.
static const int MAX_SINGLE_MESSAGE_CLASSIFY = 25;
// Junk count passed to purge_database() by the periodic purge in process_message().
static const int AUTO_PURGE_JUNK_COUNT = 2;

// Command name strings.  Each one is listed in the COMMANDS table below and
// the per-message ones are compared against the command in process_message().
static const string CMD_RECEIVE("receive");
static const string CMD_GOOD("good");
static const string CMD_DUMP("dump");
static const string CMD_CLEANUP("cleanup");
static const string CMD_COUNTS("counts");
static const string CMD_PURGE("purge");
static const string CMD_PURGE_TERMS("purge-terms");
static const string CMD_IMPORT("import");
static const string CMD_EXPORT("export");
static const string CMD_SPAM("spam");
static const string CMD_SCORE("score");
static const string CMD_REMOVE("remove");
static const string CMD_FIND_SPAM("find-spam");
static const string CMD_FIND_GOOD("find-good");
static const string CMD_SUMMARIZE("summarize");
static const string CMD_TOKENIZE("tokenize");
static const string CMD_EDIT_TERM("edit-term");
static const string CMD_TRAIN_SPAM("train-spam");
static const string CMD_TRAIN_GOOD("train-good");
static const string CMD_TRAIN_TEST("train-test");
static const string CMD_AUTO_TRAIN("auto-train");
static const string CMD_AUTO_LEARN("auto-learn");
static const string CMD_TRAIN("train");
static const string CMD_RECEIVE_TEST("receive-test");
static const string CMD_EXEC("exec");
static const string CMD_EXEC_SHARED("exec-shared");

// When true, process_stream() reads input as pre-tokenized lines instead of
// mail messages.  Set by the "tokenized" extended option.
static bool READ_AS_TOKENS = false;

// storing SpamFilter in a global so that an atexit() routine can close the filter
static NewPtr<SpamFilter> s_filter;

// One row of the command table searched by validate_command().
struct CommandInfo {
  // Command name as compared against the user-supplied command string.
  string name;
  // NOTE(review): presumably true when the command does not modify the
  // database -- the consumer of this flag is outside this part of the file.
  bool read_only;
  // NOTE(review): presumably selects whether a digest header in the message
  // is honored -- confirm against the code that reads this flag.
  bool use_digest_header;
} COMMANDS[] = {
  { CMD_CLEANUP,      false, true },
  { CMD_COUNTS,       true,  true },
  { CMD_DUMP,         true,  true },
  { CMD_EDIT_TERM,    false, true },
  { CMD_EXEC,         false, false },
  { CMD_EXEC_SHARED,  true,  false },
  { CMD_EXPORT,       true,  true },
  { CMD_FIND_GOOD,    true,  true },
  { CMD_FIND_SPAM,    true,  true },
  { CMD_GOOD,         false, true },
  { CMD_IMPORT,       false, true },
  { CMD_PURGE,        false, true },
  { CMD_PURGE_TERMS,  false, true },
  { CMD_RECEIVE,      false, false },
  { CMD_RECEIVE_TEST, false, false },
  { CMD_REMOVE,       false, true },
  { CMD_SCORE,        true,  true },
  { CMD_SPAM,         false, true },
  { CMD_SUMMARIZE,    true,  true },
  { CMD_TOKENIZE,     true,  true },
  { CMD_TRAIN,        false, false },
  { CMD_TRAIN_GOOD,   false, true },
  { CMD_TRAIN_SPAM,   false, true },
  { CMD_TRAIN_TEST,   false, false },
  { CMD_AUTO_TRAIN,   false, false },
  { CMD_AUTO_LEARN,   false, false },
  // Sentinel: the empty name terminates the scan in validate_command().
  { "",               false, true },
};

// Looks up a command name in the COMMANDS table.
// Returns a pointer to the matching entry, or 0 when the name is not listed.
// The scan stops at the sentinel entry whose name is empty.
static const CommandInfo *validate_command(const string &command)
{
  const CommandInfo *entry = COMMANDS;
  while (!entry->name.empty()) {
    if (entry->name == command) {
      return entry;
    }
    ++entry;
  }
  return 0;
}

// Formats a score with seven decimal places ("%9.7f").
// Returns a pointer to a static buffer that is overwritten by the next call,
// so the result must be consumed (or copied) before calling again.
// The write is bounded by the buffer size: a value whose text would not fit
// is truncated instead of overrunning the buffer.
static const char *print_score(double score)
{
  static char buffer[30];
  snprintf(buffer, sizeof(buffer), "%9.7f", score);
  return buffer;
}

// Writes a table of the message's top tokens to `out`: one header line, then
// one line per top token with its score, its count in the message, its good
// and spam counts from `db`, and the word itself.  Every line starts with
// `indent`.
void print_terms(FrequencyDB *db,
                 ostream &out,
                 Message &msg,
                 const string &indent)
{
  int in_good, in_spam;

  out << indent << "Spam Prob   Count    Good    Spam  Word" << NEWL;

  int idx = 0;
  while (idx < msg.getTopTokenCount()) {
    Token *top = msg.getTopToken(idx);
    db->getWordCounts(top->getWord(), in_good, in_spam);

    out << indent << print_score(top->getScore());
    out << setw(8) << top->getCount();
    out << setw(8) << in_good;
    out << setw(8) << in_spam;
    out << "  " << top->getWord() << NEWL;

    ++idx;
  }
}

// Scores `msg` and prints a summary of it to cout when it qualifies.
// A message qualifies when `show_all` is set or when its spam verdict equals
// `find_spam`.  `filename` may be null, in which case no "File:" line is
// printed.  With `show_terms` set, the top-token table follows the summary.
void find_message(SpamFilter &filter,
                  const char *filename,
                  Message &msg,
                  int message_num,
                  bool find_spam,
                  bool show_all,
                  bool show_terms)
{
  SpamFilter::Score score = filter.scoreMessage(msg);
  if (!show_all && score.isSpam() != find_spam) {
    return;
  }

  string subject_text, id_buffer;
  msg.getHeader("subject", subject_text);

  if (filename) {
    cout << "File: " << filename << NEWL;
  }

  cout << "Message Num: " << message_num << NEWL;
  cout << "Message-ID: " << msg.getID(id_buffer) << NEWL;
  cout << "Digest: " << msg.getDigest() << NEWL;
  cout << "Subject: " << subject_text << NEWL;
  cout << "NumTerms: " << msg.getTokenCount() << NEWL;
  cout << "Score: " << print_score(score.getValue()) << NEWL;

  if (!show_terms) {
    return;
  }
  if (msg.getTopTokenCount() > 0) {
    print_terms(filter.getDB(), cout, msg, "       ");
  }
}

// Prints one line per token of `msg` to cout: the term score computed from
// the database counts, the good and spam counts, the token's count in the
// message, and the word.  `filename` and `message_num` are accepted but not
// used here.
void dump_message_words(SpamFilter &filter,
                        const string &filename,
                        Message &msg,
                        int message_num)
{
  int total_good, total_spam;
  filter.getDB()->getMessageCounts(total_good, total_spam);

  int word_good, word_spam;
  int idx = 0;
  while (idx < msg.getTokenCount()) {
    Token *current = msg.getToken(idx);
    filter.getDB()->getWordCounts(current->getWord(), word_good, word_spam);
    const double term_score = filter.scoreTerm(word_good, word_spam,
                                               total_good, total_spam);

    cout << print_score(term_score) << dec;
    cout << setw(8) << word_good;
    cout << setw(8) << word_spam;
    cout << setw(8) << current->getCount();
    cout << "  " << current->getWord() << NEWL;

    ++idx;
  }
}

// Walks every word in the database and prints its score, good count, spam
// count, flags (as 8 hex digits) and the word itself to cout.  When
// `regex_str` is non-null only words matching that expression are printed.
void dump_words(SpamFilter &filter,
                const char *regex_str = 0)
{
  int total_good, total_spam;
  filter.getDB()->getMessageCounts(total_good, total_spam);

  const bool have_pattern = (regex_str != 0);
  RegularExpression pattern;
  if (have_pattern) {
    pattern.setExpression(regex_str);
  }

  string term;
  WordData data;
  for (bool more = filter.getDB()->firstWord(term, data);
       more;
       more = filter.getDB()->nextWord(term, data)) {
    if (have_pattern && !pattern.match(term)) {
      continue;
    }

    const double term_score = filter.scoreTerm(data.goodCount(), data.spamCount(),
                                               total_good, total_spam);
    cout << print_score(term_score) << dec;
    cout << setw(8) << data.goodCount();
    cout << setw(8) << data.spamCount();
    // Flags are zero-padded hex; the fill character is restored afterwards.
    cout << "  0x" << setfill('0') << setw(8) << hex << data.flags() << setfill(' ');
    cout << "  " << term << NEWL;
  }
}

// Removes from the database every word that matches `regex_str`, using the
// word's current good and spam counts as the amounts to remove.
// NOTE(review): when `regex_str` is null the expression is never set and the
// outcome depends on how RegularExpression::match() treats an unset
// expression -- confirm.
void purge_terms(SpamFilter &filter,
                 const char *regex_str)
{
  RegularExpression pattern;
  if (regex_str) {
    pattern.setExpression(regex_str);
  }

  string term;
  WordData data;
  for (bool more = filter.getDB()->firstWord(term, data);
       more;
       more = filter.getDB()->nextWord(term, data)) {
    if (!pattern.match(term)) {
      continue;
    }
    filter.getDB()->removeWord(term, data.goodCount(), data.spamCount());
  }
}

// Sets the good and spam counts stored for `term`.
// Negative counts are rejected with an error message on cerr and leave the
// database untouched.  In debug mode the request is echoed to cerr first.
static void edit_term(SpamFilter &filter,
                      const string &term,
                      int good_count,
                      int spam_count)
{
  if (is_debug) {
    cerr << "edit-term '" << term << "' gc " << good_count << " sc " << spam_count << endl;
  }

  const bool counts_valid = (good_count >= 0) && (spam_count >= 0);
  if (counts_valid) {
    filter.getDB()->setWordCounts(term, good_count, spam_count);
  } else {
    cerr << "error: negative counts are not allowed" << endl;
  }
}

// Builds a CleanupManager from (junk_count, max_age) pairs taken from the
// null-terminated `argv` and sweeps old terms out of the database with it.
// A missing value falls back to junk_count 2 / max_age 7, and at least one
// limit is always added even when `argv` is empty.
void cleanup_database(SpamFilter &filter,
                      char **argv)
{
  CleanupManager limits;
  for (;;) {
    int junk_count = 2;
    int max_age = 7;

    if (*argv) {
      junk_count = atoi(*argv);
      ++argv;
    }
    if (*argv) {
      max_age = atoi(*argv);
      ++argv;
    }
    limits.addLimit(junk_count, max_age);

    if (!*argv) {
      break;
    }
  }

  filter.getDB()->sweepOutOldTerms(limits);
}

// Sweeps terms out of the database using a single limit built from
// `junk_count` and a max age of -1.
void purge_database(SpamFilter &filter,
                    int junk_count)
{
  // A max age of -1 forces today's terms to be removed too.
  const int include_today = -1;
  CleanupManager sweeper(junk_count, include_today);
  filter.getDB()->sweepOutOldTerms(sweeper);
}

// Writes every word in the database to cout, one per line, in the form
//   good_count,spam_count,flags,"encoded-word"
// with all numbers in decimal and the word passed through encode_string().
void export_words(SpamFilter &filter)
{
  string term;
  WordData data;
  for (bool more = filter.getDB()->firstWord(term, data);
       more;
       more = filter.getDB()->nextWord(term, data)) {
    string safe_term;
    encode_string(term, safe_term);

    cout << dec << data.goodCount() << ',';
    cout << data.spamCount() << ',';
    cout << data.flags();
    cout << ",\"" << safe_term << "\"" << NEWL;
  }
}

// Reads word records from `in_stream` and adds them to the database.
// Each line is expected in the form written by export_words():
//   good_count,spam_count[,flags],"encoded-word"
// Lines that do not match are silently skipped.  The word is passed through
// decode_string() before being stored.  The filter is flushed each time more
// than 10000 records have been added since the last flush.
void import_words(SpamFilter &filter,
                  istream &in_stream)
{
  RegularExpression parts_expr("^(-?[0-9]+),(-?[0-9]+)(,([0-9]+))?,\"?([^\"]+)\"?$", 5);
  assert(parts_expr.isValid());

  string match_buffer;
  int record_count = 0;
  IstreamCharReader char_reader(&in_stream);
  LineReader line_reader(&char_reader);
  while (line_reader.forward()) {
    if (is_debug) {
      cerr << "IMPORT " << line_reader.currentLine() << endl;
    }
    if (!parts_expr.match(line_reader.currentLine())) {
      continue;
    }
    if (parts_expr.matchCount() < 6) {
      continue;
    }

    int good_count = atoi(parts_expr.getMatch(1, match_buffer).c_str());
    int spam_count = atoi(parts_expr.getMatch(2, match_buffer).c_str());
    string flags_str = parts_expr.getMatch(4, match_buffer);
    parts_expr.getMatch(5, match_buffer);

    string decoded;
    decode_string(match_buffer, decoded);

    if (is_debug) {
      cerr << "gc " << good_count
           << " sc " << spam_count
           << " flags " << flags_str
           << " term " << match_buffer
           << endl;
    }

    if (flags_str.length() > 0) {
      // Flags are unsigned and may exceed INT_MAX, which atoi() cannot
      // represent; strtoul() parses the full unsigned range.  The regex
      // guarantees flags_str holds decimal digits only.
      unsigned long flags = strtoul(flags_str.c_str(), 0, 10);
      filter.getDB()->addWord(decoded, good_count, spam_count, flags);
    } else {
      filter.getDB()->addWord(decoded, good_count, spam_count);
    }

    // Flush periodically so a large import does not accumulate unbounded
    // pending changes.
    if (++record_count > 10000) {
      filter.flush();
      record_count = 0;
    }
  }
}

// Records `msg` in the database as spam or good (per `is_spam`) and keeps
// re-adding it until the filter scores it confidently in that direction or
// the database reports it has been counted `max_loops` times.
// Returns `is_spam` unchanged.
bool classify_message(Message &msg,
                      SpamFilter &filter,
                      bool is_spam,
                      int max_loops)
{
  // First pass: make sure the message is present with the right category.
  if (is_spam) {
    filter.ensureSpamMessage(msg, false);
  } else {
    filter.ensureGoodMessage(msg, false);
  }

  for (SpamFilter::Score score = filter.scoreMessage(msg);
       !score.isConfident(is_spam);
       score = filter.scoreMessage(msg)) {
    bool stored_as_spam = false;
    const int times_counted = filter.getDB()->getMessageCount(msg, stored_as_spam);
    assert(!stored_as_spam == !is_spam);

    if (times_counted >= max_loops) {
      break;
    }

    if (BTB_DEBUG || is_debug) {
      cerr << "ANOTHER ATTEMPT to classify message " << msg.getDigest()
           << " SCORE " << score.getValue() << endl;
    }

    // Repeat pass: called with `true` this time.
    if (is_spam) {
      filter.ensureSpamMessage(msg, true);
    } else {
      filter.ensureGoodMessage(msg, true);
    }
  }

  return is_spam;
}

// Trains on `msg` only when training is needed; otherwise optionally refreshes
// the time stamps of its terms.
// Training is needed when the message count for the message's own category is
// below MIN_TRAINING_SET, is lower than the other category's count, when
// `score` disagrees with `is_spam`, or when the score in TRAIN_SCORE_MODE is
// not confident.  Returns `is_spam` unchanged.
bool train_on_message(SpamFilter &filter,
                      Message &msg,
                      const SpamFilter::Score &score,
                      bool is_spam,
                      bool update_timestamps)
{
  int good_total, spam_total;
  filter.getDB()->getMessageCounts(good_total, spam_total);

  const int own_total   = is_spam ? spam_total : good_total;
  const int other_total = is_spam ? good_total : spam_total;

  bool needs_training = (own_total < MIN_TRAINING_SET) ||
                        (own_total < other_total) ||
                        (score.isSpam() != is_spam);
  if (!needs_training) {
    needs_training =
      !filter.scoreMessageIfWrongMode(score, msg, TRAIN_SCORE_MODE).isConfident(is_spam);
  }

  if (needs_training) {
    // The verdict was wrong or not a definite decision: train on the message.
    classify_message(msg, filter, is_spam, MAX_SINGLE_MESSAGE_CLASSIFY);
  } else if (update_timestamps) {
    // Keep the terms from expiring: without this, terms that appear often in
    // messages that never need training would age out of the database.
    filter.getDB()->touchMessage(msg);
  }

  return is_spam;
}

// Prints "SPAM " or "GOOD " followed by the score value and the message
// digest on one line of cout.  With `show_terms` set and at least one top
// token available, the top-token table follows.
void print_message_score(const SpamFilter::Score &score,
                         SpamFilter &filter,
                         Message &msg,
                         bool show_terms)
{
  const char *verdict_label = score.isSpam() ? "SPAM " : "GOOD ";
  cout << verdict_label;
  cout << print_score(score.getValue());
  cout << ' ' << msg.getDigest() << endl;

  if (!show_terms) {
    return;
  }
  if (msg.getTopTokenCount() > 0) {
    print_terms(filter.getDB(), cout, msg, "    ");
  }
}

// Returns true when `msg` should be ignored because the header named by
// STATUS_FIELD_NAME is present, non-empty and contains the letter 'D'.
// Always returns false when STATUS_FIELD_NAME is empty.
bool should_skip_message(Message &msg)
{
  if (STATUS_FIELD_NAME.empty()) {
    return false;
  }

  string status;
  const bool header_present = msg.getHeader(STATUS_FIELD_NAME, status).length() != 0;
  if (!header_present || status.find('D') == string::npos) {
    return false;
  }

  if (is_debug) {
    cerr << "SKIPPING MESSAGE WITH STATUS: " << status << endl;
  }
  return true;
}

void log_message_processing(bool ignored,
                            Message &msg,
                            const char *filename,
                            const string &command)
{
  if (!is_verbose) {
    return;
  }

  string subject;

  const string::size_type MAX_SUBJECT_LENGTH = 20;
  msg.getHeader("subject", subject);
  if (subject.length() > MAX_SUBJECT_LENGTH) {
    subject.erase(MAX_SUBJECT_LENGTH, string::npos);
    subject += "...";
  }

  cerr << "COMMAND " << command;

  if (filename) {
    cerr << " FILE " << filename;
  }

  cerr << " DIGEST " << msg.getDigest()
       << " SUBJECT " << subject;

  if (ignored) {
    cerr << " IGNORED";
  } else {
    cerr << " PROCESSED";
  }
  cerr << endl;
}

// Applies one per-message command to `msg`.
//
//   filter                    filter (and database) the command operates on
//   msg                       the message to process
//   filename                  source file name for reporting; may be null
//   command                   one of the per-message CMD_* names
//   show_terms                print the top-token table with scores
//   message_num               incremented for every message not skipped
//   cumulative_message_count  incremented for every message not skipped
//   messages_per_purge        when > 0, the database is purged each time the
//                             cumulative count reaches a multiple of it
//
// Messages rejected by should_skip_message() are logged as ignored and return
// false without touching the counters.  An unknown command prints an error
// and terminates the program with exit status 1.
// Returns the spam verdict for commands that produce one, false otherwise.
bool process_message(SpamFilter &filter,
                     Message &msg,
                     const char *filename,
                     const string &command,
                     bool show_terms,
                     int &message_num,
                     int &cumulative_message_count,
                     int messages_per_purge)
{
  if (should_skip_message(msg)) {
    log_message_processing(true, msg, filename, command);
    return false;
  }

  if (is_debug) {
    cerr << "*** NEW MESSAGE ***" << endl;
  }

  bool is_message_spam = false;

  ++message_num;
  ++cumulative_message_count;
  if (command == CMD_GOOD) {
    is_message_spam = classify_message(msg, filter, false, MAX_SINGLE_MESSAGE_CLASSIFY);
  } else if (command == CMD_SPAM) {
    is_message_spam = classify_message(msg, filter, true, MAX_SINGLE_MESSAGE_CLASSIFY);
  } else if (command == CMD_REMOVE) {
    filter.removeMessage(msg);
  } else if (command == CMD_RECEIVE) {
    SpamFilter::Score score = filter.scoreMessage(msg);
    print_message_score(score, filter, msg, show_terms);
    is_message_spam = classify_message(msg, filter, score.isSpam(), MAX_SINGLE_MESSAGE_CLASSIFY);
  } else if (command == CMD_TRAIN_SPAM) {
    is_message_spam = train_on_message(filter, msg, filter.scoreMessage(msg), true, false);
  } else if (command == CMD_TRAIN_GOOD) {
    is_message_spam = train_on_message(filter, msg, filter.scoreMessage(msg), false, false);
  } else if (command == CMD_TRAIN) {
    SpamFilter::Score score = filter.scoreMessage(msg);
    print_message_score(score, filter, msg, show_terms);
    is_message_spam = train_on_message(filter, msg, score, score.isSpam(), true);
  } else if (command == CMD_FIND_SPAM) {
    find_message(filter, filename, msg, message_num, true, false, show_terms);
  } else if (command == CMD_FIND_GOOD) {
    find_message(filter, filename, msg, message_num, false, false, show_terms);
  } else if (command == CMD_SUMMARIZE) {
    find_message(filter, filename, msg, message_num, true, true, show_terms);
  } else if (command == CMD_TOKENIZE) {
    // dump_message_words() takes a string; building one from a null pointer
    // is undefined behaviour, so substitute an empty name when there is no
    // file name.
    dump_message_words(filter, string(filename ? filename : ""), msg, message_num);
  } else if (command == CMD_SCORE) {
    SpamFilter::Score score = filter.scoreMessage(msg);
    print_message_score(score, filter, msg, show_terms);
    is_message_spam = score.isSpam();
  } else {
    cerr << "error: unknown command " << command << endl;
    exit(1);
  }

  log_message_processing(false, msg, filename, command);

  // Periodic purge keeps the database from growing during long runs.
  if (messages_per_purge > 0 &&  cumulative_message_count % messages_per_purge == 0) {
    purge_database(filter, AUTO_PURGE_JUNK_COUNT);
  }

  return is_message_spam;
}

// Reads mail messages from `in_stream` one at a time, parses each, assigns it
// a digest and hands it to process_message() with `command`.
// Returns the result of process_message() for the last message read, or false
// when the stream held no messages.
bool process_mime_stream(SpamFilter &filter,
                         istream &in_stream,
                         const char *filename,
                         const string &command,
                         bool ignore_from,
                         bool ignore_content_length,
                         bool show_terms,
                         bool is_stdin,
                         Configuration *config,
                         int &cumulative_message_count,
                         int messages_per_purge)
{
  bool last_verdict = false;
  int message_num = 0;

  TraditionalMailMessageParser parser(config);
  MailMessageDigester digester;

  NewPtr<StreamMailMessageReader> reader(new StreamMailMessageReader);
  reader->setStream(&in_stream, !is_stdin, ignore_from, ignore_content_length);

  NewPtr<MailMessage> raw;
  for (raw.set(reader->readMessage());
       raw.isNotNull();
       raw.set(reader->readMessage())) {
    NewPtr<Message> parsed(parser.parseMailMessage(raw.get()));
    // Hand the raw message over to the parsed one before computing the digest.
    parsed->setSource(raw.release());
    digester.assignDigestToMessage(parsed.get(), parsed->source(),
                                   config->spamprobeFieldName());

    last_verdict = process_message(filter, *parsed.get(), filename, command, show_terms,
                                   message_num, cumulative_message_count,
                                   messages_per_purge);
  }
  return last_verdict;
}

// Fills `msg` with one token per input line, reading until the input ends or
// an empty line is reached.  The message digest is the MD5 of the lines read.
// Returns true when at least one token was added.
bool read_tokens(LineReader &in,
                 Message &msg)
{
  MD5Digester md5;
  msg.clear();
  md5.start();
  for (;;) {
    if (!in.forward()) {
      break;
    }
    if (in.currentLine().length() == 0) {
      break;
    }
    md5.add(in.currentLine());
    msg.addToken(in.currentLine(), Token::FLAG_NORMAL);
  }
  md5.stop();
  msg.setDigest(md5.asString());

  const bool got_tokens = msg.getTokenCount() > 0;
  return got_tokens;
}

// Reads pre-tokenized messages from `in_stream` via read_tokens() and hands
// each to process_message() with `command`, stopping at the first read that
// yields no tokens.
// Returns the result of process_message() for the last message, or false when
// none was read.
bool process_token_stream(SpamFilter &filter,
                          istream &in_stream,
                          const char *filename,
                          const string &command,
                          bool show_terms,
                          int &cumulative_message_count,
                          int messages_per_purge)
{
  Message token_msg;

  bool last_verdict = false;
  int message_num = 0;
  IstreamCharReader chars(&in_stream);
  LineReader lines(&chars);

  for (;;) {
    if (!read_tokens(lines, token_msg)) {
      break;
    }
    last_verdict = process_message(filter, token_msg, filename, command, show_terms,
                                   message_num, cumulative_message_count,
                                   messages_per_purge);
  }

  return last_verdict;
}

// Dispatches to process_token_stream() when READ_AS_TOKENS is set, otherwise
// to process_mime_stream(), and returns whatever the chosen reader returns.
bool process_stream(SpamFilter &filter,
                    istream &in_stream,
                    const char *filename,
                    const string &command,
                    bool ignore_from,
                    bool ignore_content_length,
                    bool show_terms,
                    bool is_stdin,
                    Configuration *config,
                    int &cumulative_message_count,
                    int messages_per_purge)
{
  if (!READ_AS_TOKENS) {
    return process_mime_stream(filter, in_stream, filename, command,
                               ignore_from, ignore_content_length, show_terms, is_stdin,
                               config, cumulative_message_count, messages_per_purge);
  }

  return process_token_stream(filter, in_stream, filename, command,
                              show_terms, cumulative_message_count, messages_per_purge);
}

void execute_command(char **argv)
{
  string command;
  while (*argv) {
    if (command.length() > 0) {
      command += ' ';
    }
    command += *argv;
    ++argv;
  }
  if (is_debug) {
    cerr << "EXEC: " << command << endl;
  }
  int rc = system(command.c_str());
  if (is_debug) {
    cerr << "EXEC RC: " << rc << endl;
  }
}

// Processes mailbox files listed in the null-terminated `argv`.
// The literal arguments "SPAM" and "GOOD" switch the category applied to the
// file names that follow them (the initial category is good).  Every other
// argument must name an existing file, otherwise runtime_error is thrown.
// Each message read is processed with the train-spam/train-good commands when
// `train_mode` is set and with spam/good otherwise.
// `ignore_from` and `ignore_content_length` are accepted but not used here.
void auto_train(SpamFilter &filter,
                bool ignore_from,
                bool ignore_content_length,
                bool show_terms,
                Configuration *config,
                int messages_per_purge,
                bool train_mode,
                char **argv)
{
  NewPtr<AutoTrainMailMessageReader> reader(new AutoTrainMailMessageReader());

  bool next_files_are_spam = false;
  for (; *argv; ++argv) {
    const string arg(*argv);

    if (arg == "SPAM") {
      next_files_are_spam = true;
      continue;
    }
    if (arg == "GOOD") {
      next_files_are_spam = false;
      continue;
    }

    File mailbox(arg);
    if (!mailbox.isFile()) {
      throw runtime_error(string("file does not exist: ") + arg);
    }
    reader->addMailboxFile(next_files_are_spam, arg);
  }

  int message_num = 0;

  TraditionalMailMessageParser parser(config);
  MailMessageDigester digester;

  int cumulative_message_count = 0;
  string command;
  NewPtr<MailMessage> raw;
  for (raw.set(reader->readMessage());
       raw.isNotNull();
       raw.set(reader->readMessage())) {
    NewPtr<Message> parsed(parser.parseMailMessage(raw.get()));
    parsed->setSource(raw.release());
    digester.assignDigestToMessage(parsed.get(), parsed->source(),
                                   config->spamprobeFieldName());

    const bool was_spam = reader->messageWasSpam();
    if (train_mode) {
      command = was_spam ? CMD_TRAIN_SPAM : CMD_TRAIN_GOOD;
    } else {
      command = was_spam ? CMD_SPAM : CMD_GOOD;
    }

    process_message(filter, *parsed.get(), reader->messagePath().c_str(), command,
                    show_terms, message_num, cumulative_message_count,
                    messages_per_purge);
  }
}

// Reads whitespace-separated "type file" pairs from `in` and processes each
// file with process_stream().  The type must be "spam" or "good"; it selects
// the train-spam/train-good commands when `train_mode` is set and spam/good
// otherwise.
// Throws runtime_error when a file does not exist or the type is invalid.
void train_test(SpamFilter &filter,
                istream &in,
                bool ignore_from,
                bool ignore_content_length,
                bool show_terms,
                bool is_stdin,
                Configuration *config,
                int messages_per_purge,
                bool train_mode)
{
  int cumulative_message_count = 0;
  string type_name, file_name, command;

  while (in >> type_name >> file_name) {
    File candidate(file_name);
    if (!candidate.isFile()) {
      throw runtime_error(file_name + ": does not exist");
    }

    ifstream message_stream(file_name.c_str());

    const bool is_spam_entry = (type_name == "spam");
    if (!is_spam_entry && type_name != "good") {
      throw runtime_error(string("invalid message type: ") + type_name);
    }
    if (is_spam_entry) {
      command = train_mode ? CMD_TRAIN_SPAM : CMD_SPAM;
    } else {
      command = train_mode ? CMD_TRAIN_GOOD : CMD_GOOD;
    }

    process_stream(filter, message_stream, file_name.c_str(), command,
                   ignore_from, ignore_content_length, show_terms, is_stdin,
                   config, cumulative_message_count, messages_per_purge);
  }
}

bool process_extended_options(const vector<string> &option_names,
                              Configuration *config,
                              SpamFilter *filter)
{
  bool failed = false;

  for (vector<string>::const_iterator i = option_names.begin(); i != option_names.end(); ++i) {
    string option_name(*i);
    if (option_name.length() > 0) {
      if (option_name == "graham") {
        config->setMaxPhraseTerms(1);
        config->setRemoveHTML(false);
        config->setMinTermLength(1);
        config->setMaxTermLength(90);
        filter->setTermsForScore(15);
        filter->setMaxWordRepeats(1);
        filter->setNewWordScore(0.4);
        filter->setExtendTopTerms(false);
        config->headers()->setAllHeadersMode();
        filter->setScoreMode(SpamFilter::SCORE_ORIGINAL);
      } else if (option_name == "suspicious-tags") {
        config->setKeepSuspiciousTags(true);
      } else if (option_name == "honor-status-header") {
        STATUS_FIELD_NAME = "status";
      } else if (option_name == "honor-xstatus-header") {
        STATUS_FIELD_NAME = "x-status";
      } else if (option_name == "orig-score") {
        filter->setScoreMode(SpamFilter::SCORE_ORIGINAL);
      } else if (option_name == "tokenized") {
        READ_AS_TOKENS = true;
      } else if (option_name != "normal") {
        cerr << "error: unknown option: " << option_name << endl;
        failed = true;
      }
    }
  }

  return failed;
}

bool process_test_cases(const vector<string> &test_cases,
                        Configuration *config,
                        SpamFilter *filter)
{
  bool failed = false;

  for (vector<string>::const_iterator i = test_cases.begin(); i != test_cases.end(); ++i) {
    string test_case(*i);
    if (test_case.length() > 0) {
      if (test_case == "graham") {
        config->setMaxPhraseTerms(1);
        config->setRemoveHTML(false);
        config->setMinTermLength(1);
        config->setMaxTermLength(90);
        filter->setTermsForScore(15);
        filter->setMaxWordRepeats(1);
        filter->setNewWordScore(0.4);
        filter->setExtendTopTerms(false);
        config->headers()->setAllHeadersMode();
      } else if (test_case == "all-sigs") {
        filter->setTermsForScore(5);
        filter->setMaxWordRepeats(5);
        filter->setExtendTopTerms(true);
      } else if (test_case == "all-sigs-3") {
        filter->setTermsForScore(5);
        filter->setMaxWordRepeats(5);
        filter->setExtendTopTerms(true);
        config->setMaxPhraseTerms(3);
      } else if (test_case == "all-sigs-5") {
        filter->setTermsForScore(5);
        filter->setMaxWordRepeats(5);
        filter->setExtendTopTerms(true);
        config->setMaxPhraseTerms(5);
      } else if (test_case == "all-phrases-2-3") {
        config->setMinPhraseTerms(2);
        config->setMaxPhraseTerms(3);
      } else if (test_case == "all-phrases-3-4") {
        config->setMinPhraseTerms(3);
        config->setMaxPhraseTerms(4);
      } else if (test_case == "all-phrases-4-5") {
        config->setMinPhraseTerms(4);
        config->setMaxPhraseTerms(5);
      } else if (test_case == "all-phrases-2-5") {
        config->setMinPhraseTerms(2);
        config->setMaxPhraseTerms(5);
      } else if (test_case == "all-phrases-2-3-ext") {
        filter->setTermsForScore(5);
        filter->setMaxWordRepeats(5);
        filter->setExtendTopTerms(true);
        config->setMinPhraseTerms(2);
        config->setMaxPhraseTerms(3);
      } else if (test_case == "all-phrases-2-3-water") {
        filter->setWaterCounts(true);
        config->setMinPhraseTerms(2);
        config->setMaxPhraseTerms(3);
      } else if (test_case == "all-phrases-3-5") {
        filter->setTermsForScore(5);
        filter->setMaxWordRepeats(5);
        filter->setExtendTopTerms(true);
        config->setMinPhraseTerms(3);
        config->setMaxPhraseTerms(5);
      } else if (test_case == "all-phrases-5") {
        filter->setTermsForScore(5);
        filter->setMaxWordRepeats(5);
        filter->setExtendTopTerms(true);
        config->setMinPhraseTerms(5);
        config->setMaxPhraseTerms(5);
      } else if (test_case == "old-graham") {
        config->setMaxPhraseTerms(1);
        config->setRemoveHTML(false);
        config->setMinTermLength(1);
        config->setMaxTermLength(90);
        filter->setTermsForScore(15);
        filter->setMaxWordRepeats(1);
        filter->setNewWordScore(0.2);
        filter->setExtendTopTerms(false);
        config->headers()->setAllHeadersMode();
      } else if (test_case == "nox") {
        config->headers()->setNonXHeadersMode();
      } else if (test_case == "all") {
        config->headers()->setAllHeadersMode();
      } else if (test_case == "wide-open") {
        config->setRemoveHTML(false);
        config->headers()->setAllHeadersMode();
      } else if (test_case == "alt1") {
        filter->setScoreMode(SpamFilter::SCORE_ALT1);
      } else if (test_case == "low-score") {
        filter->setNewWordScore(0.2);
        filter->setMinWordCount(5);
      } else if (test_case == "high-score") {
        filter->setNewWordScore(0.88);
        filter->setMinWordCount(5);
      } else if (test_case == "nox-high-score") {
        filter->setNewWordScore(0.88);
        filter->setMinWordCount(5);
        config->headers()->setNonXHeadersMode();
      } else if (test_case == "no-phrase") {
        config->setMaxPhraseTerms(1);
      } else if (test_case == "normal-ext") {
        filter->setExtendTopTerms(true);
      } else if (test_case == "no-min") {
        filter->setMinWordCount(1);
      } else if (test_case == "1-days") {
        WordData::setTodayDate(1);
      } else if (test_case == "2-days") {
        WordData::setTodayDate(2);
      } else if (test_case == "3-days") {
        WordData::setTodayDate(3);
      } else if (test_case == "10-days") {
        WordData::setTodayDate(10);
      } else if (test_case == "normal-3") {
        config->setMaxPhraseTerms(3);
      } else if (test_case == "normal-5") {
        config->setMaxPhraseTerms(5);
      } else if (test_case == "phrases-2") {
        config->setMaxPhraseTerms(2);
        config->setMinPhraseTerms(2);
      } else if (test_case == "phrases-3") {
        config->setMaxPhraseTerms(3);
        config->setMinPhraseTerms(3);
      } else if (test_case == "phrases-4") {
        config->setMaxPhraseTerms(4);
        config->setMinPhraseTerms(4);
      } else if (test_case == "phrases-5") {
        config->setMaxPhraseTerms(5);
        config->setMinPhraseTerms(5);
      } else if (test_case == "no-prefixes") {
        config->headers()->setBlankPrefixesMode();
      } else if (test_case == "10-char-phrase") {
        config->setMaxPhraseChars(10);
      } else if (test_case == "20-char-phrase") {
        config->setMaxPhraseTerms(20);
        config->setMaxPhraseChars(20);
      } else if (test_case == "html") {
        config->setRemoveHTML(false);
      } else if (test_case == "15/1") {
        filter->setTermsForScore(15);
        filter->setMaxWordRepeats(1);
      } else if (test_case == "15/1-no-phrase") {
        filter->setTermsForScore(15);
        filter->setMaxWordRepeats(1);
        config->setMaxPhraseTerms(1);
        config->headers()->setNonXHeadersMode();
      } else if (test_case == "suspicious") {
        config->setKeepSuspiciousTags(true);
      } else if (test_case == "nox-suspicious") {
        config->setKeepSuspiciousTags(true);
        config->headers()->setNonXHeadersMode();
      } else if (test_case == "min-0.25") {
        filter->setMinDistanceForScore(0.25);
        filter->setMinArraySize(8);
      } else if (test_case == "min-0.33-10") {
        filter->setMinDistanceForScore(0.33);
        filter->setMinArraySize(10);
      } else if (test_case == "min-0.33-8") {
        filter->setMinDistanceForScore(0.33);
        filter->setMinArraySize(8);
      } else if (test_case == "min-0.40") {
        filter->setMinDistanceForScore(0.40);
        filter->setMinArraySize(8);
      } else if (test_case == "min-0.45") {
        filter->setMinDistanceForScore(0.45);
        filter->setMinArraySize(8);
      } else if (test_case == "min-0.49") {
        filter->setMinDistanceForScore(0.49);
        filter->setMinArraySize(8);
      } else if (test_case == "min-0.495") {
        filter->setMinDistanceForScore(0.495);
        filter->setMinArraySize(8);
      } else if (test_case == "trial") {
        filter->setMinDistanceForScore(0.40);
        filter->setMinArraySize(10);
      } else if (test_case == "trial-2") {
        filter->setMinDistanceForScore(0.25);
        filter->setMinArraySize(12);
        filter->setTermsForScore(30);
        filter->setMaxWordRepeats(2);
        filter->setWaterCounts(true);
        filter->setNewWordScore(0.5);
        filter->setExtendTopTerms(true);
      } else if (test_case == "trial-3") {
        filter->setExtendTopTerms(true);
        filter->setWaterCounts(true);
      } else if (test_case == "susp-high-score") {
        filter->setNewWordScore(0.88);
        config->setKeepSuspiciousTags(true);
      } else if (test_case == "use-sa") {
        config->headers()->addHeaderPrefix("x-spam-status", "sa", "sa");
        config->headers()->addSimpleHeaderPrefix("x-spam-status");
      } else if (test_case == "water-counts") {
        filter->setWaterCounts(true);
        filter->setNewWordScore(0.4);
      } else if (test_case == "original") {
        filter->setScoreMode(SpamFilter::SCORE_ORIGINAL);
      } else if (test_case == "0.5") {
        filter->setDefaultThreshold(0.5);
      } else if (test_case == "0.6") {
        filter->setDefaultThreshold(0.6);
      } else if (test_case == "0.7") {
        filter->setDefaultThreshold(0.7);
      } else if (test_case == "0.8") {
        filter->setDefaultThreshold(0.8);
      } else if (test_case == "0.9") {
        filter->setDefaultThreshold(0.9);
      } else if (test_case == "min-10") {
        filter->setMinWordCount(10);
      } else if (test_case == "normal-headers-only") {
        filter->setMinDistanceForScore(0.30);
        filter->setMinArraySize(10);
        config->setIgnoreBody(true);
        config->headers()->setDefaultHeadersMode();
      } else if (test_case == "nox-headers-only") {
        filter->setMinDistanceForScore(0.30);
        filter->setMinArraySize(10);
        config->setIgnoreBody(true);
        config->headers()->setNonXHeadersMode();
      } else if (test_case == "all-headers-only") {
        filter->setMinDistanceForScore(0.30);
        filter->setMinArraySize(10);
        config->setIgnoreBody(true);
        config->headers()->setAllHeadersMode();
      } else if (test_case == "headers-only") {
        config->headers()->setNoHeadersMode();
        config->headers()->addSimpleHeaderPrefix("from");
        config->headers()->addSimpleHeaderPrefix("to");
        config->headers()->addSimpleHeaderPrefix("cc");
        config->headers()->addSimpleHeaderPrefix("bcc");
        config->headers()->addSimpleHeaderPrefix("subject");
        config->headers()->addSimpleHeaderPrefix("reply-to");
        config->headers()->addSimpleHeaderPrefix("received");
        config->headers()->addSimpleHeaderPrefix("sender");
        config->headers()->addSimpleHeaderPrefix("x-mailer");
        config->headers()->addSimpleHeaderPrefix("errors-to");
        config->headers()->addSimpleHeaderPrefix("x-beenthere");
        config->headers()->addSimpleHeaderPrefix("list-id");
        config->headers()->addSimpleHeaderPrefix("user-agent");
        config->headers()->addSimpleHeaderPrefix("references");
        config->headers()->addSimpleHeaderPrefix("message-id");
        filter->setMinDistanceForScore(0.30);
        filter->setMinArraySize(10);
        config->setIgnoreBody(true);
      } else if (test_case == "multi") {
        filter->clearTokenSelectors();
        filter->addTokenSelector(new SimpleTokenSelector(Token::FLAG_ANY, "H"));
        filter->addTokenSelector(new SimpleTokenSelector(Token::FLAG_ANY - Token::FLAG_PHRASE, ""));
        filter->addTokenSelector(new SimpleTokenSelector(Token::FLAG_ANY - Token::FLAG_DERIVED, ""));
        filter->addTokenSelector(new TokenSelector());
      } else if (test_case == "bnr") {
        filter->clearTokenSelectors();
        filter->addTokenSelector(new BNRTokenSelector());
      } else if (test_case == "multi-bnr") {
        filter->clearTokenSelectors();
        filter->addTokenSelector(new SimpleTokenSelector(Token::FLAG_ANY, "H"));
        filter->addTokenSelector(new SimpleTokenSelector(Token::FLAG_ANY - Token::FLAG_PHRASE, ""));
        filter->addTokenSelector(new SimpleTokenSelector(Token::FLAG_ANY - Token::FLAG_DERIVED, ""));
        filter->addTokenSelector(new TokenSelector());
        filter->addTokenSelector(new BNRTokenSelector());
      } else if (test_case == "max-250-terms") {
        filter->setMinDistanceForScore(0.30);
        filter->setMinArraySize(10);
        config->headers()->setNonXHeadersMode();
        config->setMaxTermsPerMessage(250);
      } else if (test_case == "max-350-terms") {
        filter->setMinDistanceForScore(0.30);
        filter->setMinArraySize(10);
        config->headers()->setNonXHeadersMode();
        config->setMaxTermsPerMessage(350);
      } else if (test_case == "max-500-terms") {
        filter->setMinDistanceForScore(0.30);
        filter->setMinArraySize(10);
        config->headers()->setNonXHeadersMode();
        config->setMaxTermsPerMessage(500);
      } else if (test_case != "normal") {
        cerr << "error: unknown test case: " << test_case << endl;
        failed = true;
      }
    }
  }

  return failed;
}

//
// Parses arg as a base 10 integer for command line option `option`.
//
// The whole argument must be a number (optional leading whitespace and
// sign, then digits); empty strings and trailing garbage such as "12abc"
// are rejected instead of being silently read as 0 or 12 the way atoi()
// would.  The parsed number must also lie within [min_val, max_val].
//
// On success stores the number in value and returns true.  On failure
// prints an error naming the option and the allowed range to stderr,
// leaves value untouched and returns false.
//
static bool parse_int_arg(const char *arg,
                          const char *option,
                          int &value,
                          int min_val,
                          int max_val)
{
  char *end = 0;
  const long parsed = strtol(arg, &end, 10);

  // end == arg means no digits were consumed; *end != 0 means junk follows
  const bool consumed_all = (end != arg) && (*end == '\0');

  // The range test is done on the long so that values too big for an int
  // are rejected rather than truncated.
  if (consumed_all && parsed >= min_val && parsed <= max_val) {
    value = static_cast<int>(parsed);
    return true;
  }

  cerr << "error: -" << option << " requires integer argument between "
       << min_val << " and " << max_val << endl;
  return false;
}

//
// Parses arg as a floating point number for command line option `option`.
//
// The whole argument must be a number; empty strings and trailing garbage
// such as "0.5x" are rejected instead of being silently read as 0.0 or 0.5
// the way atof() would.  The parsed number must also lie within
// [min_val, max_val].
//
// On success stores the number in value and returns true.  On failure
// prints an error naming the option and the allowed range to stderr,
// leaves value untouched and returns false.
//
static bool parse_double_arg(const char *arg,
                             const char *option,
                             double &value,
                             double min_val,
                             double max_val)
{
  char *end = 0;
  const double parsed = strtod(arg, &end);

  // end == arg means nothing was consumed; *end != 0 means junk follows
  const bool consumed_all = (end != arg) && (*end == '\0');

  if (consumed_all && parsed >= min_val && parsed <= max_val) {
    value = parsed;
    return true;
  }

  cerr << "error: -" << option << " requires number argument between "
       << min_val << " and " << max_val << endl;
  return false;
}

//
// Applies the argument of the -H option to the header settings of config.
//
// Recognized arguments are the mode names "all", "nox", "normal" and
// "none", or "+name" which registers the text after the plus sign as a
// simple header prefix.  Anything else is reported on stderr.
//
// Returns true if the argument was recognized, false otherwise.
//
static bool set_headers(Configuration *config,
                        const char *optarg)
{
  const string mode(optarg);

  if (mode == "all") {
    config->headers()->setAllHeadersMode();
    return true;
  }

  if (mode == "nox") {
    config->headers()->setNonXHeadersMode();
    return true;
  }

  if (mode == "normal") {
    config->headers()->setDefaultHeadersMode();
    return true;
  }

  if (mode == "none") {
    config->headers()->setNoHeadersMode();
    return true;
  }

  if (optarg[0] == '+') {
    // skip the plus sign, the remainder is the header name
    config->headers()->addSimpleHeaderPrefix(optarg + 1);
    return true;
  }

  cerr << "error: -H option requires all, nox, none, or normal" << endl;
  return false;
}

//
// Prints the program version banner and the warranty notice to stdout.
// When filter is non-null and has a database, the database type is
// included in the banner.
//
static void print_version(SpamFilter *filter)
{
  // lines printed after the banner, each followed by a newline
  static const char *const NOTICE_LINES[] = {
    "",
    "Copyright 2002-2005 Burton Computer Corporation",
    "This program is distributed in the hope that it will be useful,",
    "but WITHOUT ANY WARRANTY; without even the implied warranty of",
    "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the",
    "Q Public License for more details."
  };
  const size_t notice_count = sizeof(NOTICE_LINES) / sizeof(NOTICE_LINES[0]);

  cout << "SpamProbe v" << VERSION;
  if (filter && filter->getDB()) {
    cout << " using " << filter->getDB()->getDatabaseType();
  }
  cout << " database." << endl;

  for (size_t line = 0; line < notice_count; ++line) {
    cout << NOTICE_LINES[line] << endl;
  }
}

//
// Prints the one line usage summary to stderr.
//
static void print_usage()
{
  static const char USAGE_TEXT[] =
    "usage: spamprobe [-chmTvVxY78] [-a char] [-d basedir] [-H headers]"
    " command [filename...]";

  cerr << USAGE_TEXT << endl;
}

//
// Checks that dir refers to a directory, optionally creating it.
//
// If the path contains a colon, only the text after the last colon is
// tested (presumably a prefix selecting the database type precedes the
// colon -- confirm against the database code).  dir is passed by value so
// the caller's File is not modified.
//
// When the path is not a directory: with can_create set the directory is
// created with mode 0700 and true is returned (the result of the creation
// is not inspected); otherwise an error is printed to stderr and false is
// returned.
//
static bool directory_test(File dir,
                           bool can_create)
{
  const string full_path(dir.getPath());
  const string::size_type last_colon = full_path.rfind(':');
  if (last_colon != string::npos) {
    dir.setPath(full_path.substr(last_colon + 1));
  }

  if (dir.isDirectory()) {
    return true;
  }

  if (!can_create) {
    cerr << "error: " << dir.getPath() << " is not a directory" << endl;
    return false;
  }

  dir.makeDirectory(0700);
  return true;
}

//
// Cleanup function for use by signal handlers.  Restores the default
// action for signum and reports the signal on stderr as
// "caught signal N: quitting".
//
// The message is built in a stack buffer and emitted with a single
// write(2) call.  iostream operations are not async-signal-safe (they may
// allocate, take locks or throw), so they must not be used here.
//
void quick_close(int signum)
{
  signal(signum,  SIG_DFL);

  static const char prefix[] = "caught signal ";
  static const char suffix[] = ": quitting\n";

  // render the magnitude of signum in decimal, least significant digit first
  char digits[16];
  int digit_count = 0;
  unsigned int magnitude = (signum < 0) ? (0u - static_cast<unsigned int>(signum))
                                        : static_cast<unsigned int>(signum);
  do {
    digits[digit_count++] = static_cast<char>('0' + (magnitude % 10));
    magnitude /= 10;
  } while (magnitude != 0);

  // large enough for both literals, a sign and every digit
  char message[sizeof(prefix) + sizeof(suffix) + sizeof(digits) + 1];
  size_t length = 0;

  for (size_t i = 0; i + 1 < sizeof(prefix); ++i) {
    message[length++] = prefix[i];
  }
  if (signum < 0) {
    message[length++] = '-';
  }
  while (digit_count > 0) {
    message[length++] = digits[--digit_count];
  }
  for (size_t i = 0; i + 1 < sizeof(suffix); ++i) {
    message[length++] = suffix[i];
  }

  // best effort only: nothing safe can be done here if the write fails
  if (write(STDERR_FILENO, message, length) < 0) {
    return;
  }
}

//
// For user interrupts, give any outstanding database operation time
// to complete before exiting.
//
// Returns true when there is no filter or no database, meaning the
// caller may exit right away.  Otherwise returns whatever the database's
// requestInterrupt() returns.  signum is not used.
//
bool interrupt_or_exit(int signum)
{
  const bool has_database = s_filter.get() && s_filter->getDB();
  if (!has_database) {
    return true;
  }

  return s_filter->getDB()->requestInterrupt();
}

//
// Signal handler for signals that should end the program cleanly.
//
// If interrupt_or_exit() says it is ok to quit now, reports the signal
// through quick_close() and exits with status 127.  Otherwise it simply
// returns so that the interrupted code can continue.  The exit is
// intended to ensure that any locks the database might hold are cleared.
//
void close_and_exit(int signum)
{
  if (!interrupt_or_exit(signum)) {
    return;
  }

  quick_close(signum);
  exit(127);
}

//
// Signal handler for crash style signals.  Reports the signal through
// quick_close() and then calls abort() to dump core.
//
void close_and_abort(int signum)
{
  quick_close(signum);

  // SIGABRT is normally routed to this handler too; restore the default
  // action first so that abort() really terminates the process.
  signal(SIGABRT, SIG_DFL);
  abort();
}

//
// Registers the signal handlers used to clean up on abnormal exit from
// the program (i.e. ^C or a crash).
//
void install_signal_handler()
{
  // signals that request a clean shutdown
  static const int SHUTDOWN_SIGNALS[] = { SIGINT, SIGTERM, SIGPIPE };

  // signals that should close and then abort
  static const int ABORT_SIGNALS[] = { SIGQUIT, SIGSEGV, SIGBUS, SIGABRT };

  const size_t shutdown_count = sizeof(SHUTDOWN_SIGNALS) / sizeof(SHUTDOWN_SIGNALS[0]);
  for (size_t i = 0; i < shutdown_count; ++i) {
    signal(SHUTDOWN_SIGNALS[i], close_and_exit);
  }

  const size_t abort_count = sizeof(ABORT_SIGNALS) / sizeof(ABORT_SIGNALS[0]);
  for (size_t i = 0; i < abort_count; ++i) {
    signal(ABORT_SIGNALS[i], close_and_abort);
  }
}

//
// Program entry point.
//
// Parses the command line options with getopt(), validates the database
// directories and the command name, opens the database, applies any test
// cases (-t) and extended options (-o) and finally dispatches to the
// function implementing the command.
//
// Exit status:
//   1  usage error, unknown test case or extended option, or a caught
//      runtime_error, logic_error or unknown exception
//   0/1 with -R: 0 if the last processed message was spam, 1 otherwise
//   0  otherwise (including after an InterruptedException)
//
int main(int argc,
         char **argv)
{
  try {
    int opt;
    bool usage_error = false;
    bool force_mbox = false;
    bool create_dir = false;
    bool ignore_from = false;
    bool ignore_content_length = false;
    bool show_terms = false;
    bool single_message_file = false;
    bool wants_version = false;
    bool return_spam_status = false;
    int messages_per_purge = 0;
    string spamprobe_field_name("x-spamprobe");
    File shared_db_dir;
    File basedir(File::getHomeDir(), ".spamprobe");
    vector<string> test_cases;
    vector<string> extended_options;

    // default configuration; options below may override it
    NewPtr<Configuration> config(new Configuration);
#ifdef USE_8BIT
    config->setReplaceNonAsciiChars(0);
#else
    config->setReplaceNonAsciiChars('Z');
#endif
    config->headers()->addSimpleHeaderPrefix("from");
    config->headers()->addSimpleHeaderPrefix("to");
    config->headers()->addSimpleHeaderPrefix("cc");
    config->headers()->addSimpleHeaderPrefix("subject");
    config->headers()->addHeaderPrefix("received", "recv", "recvx");

    // the filter lives in a file level holder so the signal handlers can reach it
    s_filter.set(new SpamFilter);

    // Option parsing.  Invalid arguments only set usage_error; the usage
    // message is printed later, after the database open attempt.
    optind = 1;
    int opt_value;
    double opt_double_value;
    while ((opt = getopt(argc, argv, "a:cC:d:D:hH:g:l:mMo:p:P:r:Rs:t:TvVw:xXY78")) != EOF) {
      switch (opt) {
      case 'a':
        if (strlen(optarg) != 1) {
          cerr << "error: -a option requires a one character argument" << endl;
          usage_error = true;
        } else {
          config->setReplaceNonAsciiChars(optarg[0]);
        }
        break;

      case 'c':
        create_dir = true;
        break;

      case 'C':
        if (parse_int_arg(optarg, "C", opt_value, 0, 10000)) {
          s_filter->setMinWordCount(opt_value);
        } else {
          usage_error = true;
        }
        break;

      case 'd':
        basedir.setPath(optarg);
        break;

      case 'D':
        shared_db_dir.setPath(optarg);
        break;

      case 'g':
        spamprobe_field_name = to_lower(optarg);
        break;

      case 'h':
        config->setRemoveHTML(false);
        break;

      case 'H':
        if (!set_headers(config.get(), optarg)) {
          usage_error = true;
        }
        break;

      case 'l':
        if (parse_double_arg(optarg, "l", opt_double_value, 0.0, 1.0)) {
          s_filter->setDefaultThreshold(opt_double_value);
        } else {
          usage_error = true;
        }
        break;

      case 'm':
        force_mbox = true;
        break;

      case 'M':
        single_message_file = true;
        break;

      case 'o':
        // collected here, applied by process_extended_options() below
        extended_options.push_back(optarg);
        break;

      case 'p':
        if (parse_int_arg(optarg, "p", opt_value, 1, 10)) {
          config->setMaxPhraseTerms(opt_value);
        } else {
          usage_error = true;
        }
        break;

      case 'P':
        if (parse_int_arg(optarg, "P", opt_value, 1, MAX_MESSAGES_PER_PURGE)) {
          messages_per_purge = opt_value;
        } else {
          usage_error = true;
        }
        break;

      case 'r':
        if (parse_int_arg(optarg, "r", opt_value, 1, 10)) {
          s_filter->setMaxWordRepeats(opt_value);
        } else {
          usage_error = true;
        }
        break;

      case 'R':
        // like -M, and additionally makes the exit status report the result
        single_message_file = true;
        return_spam_status = true;
        break;

      case 's':
        if (parse_int_arg(optarg, "s", opt_value, 0, MAX_MAX_CACHE_TERMS)) {
          // 0 selects the largest allowed cache size
          if (opt_value == 0) {
            opt_value = MAX_MAX_CACHE_TERMS;
          }
          MAX_CACHE_TERMS = opt_value;
        } else {
          usage_error = true;
        }
        break;

      case 't':
        // collected here, applied by process_test_cases() below
        test_cases.push_back(optarg);
        break;

      case 'T':
        show_terms = true;
        break;

      case 'v':
        // the first -v turns on verbose output, a second one turns on debug
        if (is_verbose) {
          is_debug = true;
        } else {
          is_verbose = true;
        }
        break;

      case 'V':
        wants_version = true;
        break;

      case 'w':
        if (parse_int_arg(optarg, "w", opt_value, 5, 500)) {
          s_filter->setTermsForScore(opt_value);
        } else {
          usage_error = true;
        }
        break;

      case 'x':
        s_filter->setExtendTopTerms(true);
        break;

      case 'X':
        s_filter->setTermsForScore(5);
        s_filter->setMaxWordRepeats(5);
        s_filter->setExtendTopTerms(true);
        break;

      case 'Y':
        ignore_content_length = true;
        break;

      case '7':
        config->setReplaceNonAsciiChars(' ');
        break;

      case '8':
        config->setReplaceNonAsciiChars(-1);
        break;

      default:
        usage_error = true;
        break;
      }
    }

    // import locales so that tolower() will work properly
    // NOTE(review): this runs after option parsing, so the to_lower() call
    // made for -g above happens before the locale is imported.
    setlocale(LC_ALL, "");

    // the shared directory is optional but must already exist when given
    if (shared_db_dir.getPath().length() > 0 && !directory_test(shared_db_dir, false)) {
      usage_error = true;
    }

    // the base directory is created on demand only when -c was given
    if (!directory_test(basedir, create_dir)) {
      usage_error = true;
    }

    // a command name is mandatory
    if ((argc - optind) < 1) {
      usage_error = true;
    }

    string command;
    const CommandInfo *cmd = 0;
    if (!usage_error) {
      command = argv[optind++];
      cmd = validate_command(command);
      if (!cmd) {
	cerr << "error: invalid command name: " << command << endl;
	usage_error = true;
      } else if (cmd->read_only && messages_per_purge > 0) {
        // -P is ignored for read only commands
        messages_per_purge = 0;
      }
      // this command needs at least one argument after the command name
      if (command == CMD_PURGE_TERMS && optind >= argc) {
        usage_error = true;
      }
    }

    // -M/-R, or the receive command without -m, turn on both ignore flags
    if (single_message_file || (command == CMD_RECEIVE && !force_mbox)) {
      ignore_from = true;
      ignore_content_length = true;
    }

    // Open the database.  This is attempted even after a usage error; with
    // no valid command it is opened read only.
    try {
      install_signal_handler();
      if (shared_db_dir.getPath().length() > 0) {
        s_filter->open(File(shared_db_dir, DB_FILENAME),
                       File(basedir, DB_FILENAME),
                       cmd ? cmd->read_only : true,
                       MAX_CACHE_TERMS);
      } else {
        s_filter->open(File(basedir, DB_FILENAME),
                       cmd ? cmd->read_only : true,
                       MAX_CACHE_TERMS);
      }
    } catch (runtime_error &ex) {
      // when only the version was requested a failed open is reported as a
      // usage error instead of being propagated
      if (wants_version) {
        usage_error = true;
      } else {
        throw;
      }
    }

    if (usage_error || wants_version) {
      print_usage();
      print_version(s_filter.get());
      return usage_error ? 1 : 0;
    }

    // cmd is non-null from here on: a null cmd sets usage_error above
    if (cmd->use_digest_header) {
      config->setSpamProbeFieldName(spamprobe_field_name);
    }

    // both functions return true on failure
    if (process_test_cases(test_cases, config.get(), s_filter.get())) {
      print_usage();
      return 1;
    }

    if (process_extended_options(extended_options, config.get(), s_filter.get())) {
      print_usage();
      return 1;
    }

    // result of the most recent process_stream() call, reported by -R
    bool is_message_spam = false;

    // Command dispatch.  Commands that never read messages come first.
    if (command == CMD_DUMP) {
      if (optind < argc) {
        dump_words(*s_filter.get(), argv[optind]);
      } else {
        dump_words(*s_filter.get());
      }
    } else if (command == CMD_PURGE_TERMS) {
      purge_terms(*s_filter.get(), argv[optind]);
    } else if (command == CMD_CLEANUP) {
      cleanup_database(*s_filter.get(), argv + optind);
    } else if (command == CMD_EXEC || command == CMD_EXEC_SHARED) {
      execute_command(argv + optind);
    } else if (command == CMD_COUNTS) {
      int spam_count, good_count;
      s_filter->getDB()->getMessageCounts(good_count, spam_count);
      cout << "GOOD " << good_count << " SPAM " << spam_count << endl;
    } else if (command == CMD_PURGE) {
      // the junk count defaults to 2 when no argument is given
      int junk_count = (optind >= argc) ? 2 : atoi(argv[optind]);
      purge_database(*s_filter.get(), junk_count);
    } else if (command == CMD_EXPORT) {
      export_words(*s_filter.get());
    } else if (command == CMD_EDIT_TERM) {
      // expects exactly: term good_count spam_count
      if ((argc - optind) != 3) {
        print_usage();
      } else {
        int good_count = atoi(argv[optind + 1]);
        int spam_count = atoi(argv[optind + 2]);
        edit_term(*s_filter.get(), argv[optind], good_count, spam_count);
      }
    } else if (optind == argc) {
      // no file names were given: read from standard input
      if (command == CMD_IMPORT) {
        import_words(*s_filter.get(), cin);
      } else if (command == CMD_TRAIN_TEST) {
        train_test(*s_filter.get(), cin, ignore_from, ignore_content_length, true,
                   show_terms, config.get(), messages_per_purge, true);
      } else if (command == CMD_RECEIVE_TEST) {
        train_test(*s_filter.get(), cin, ignore_from, ignore_content_length, true,
                   show_terms, config.get(), messages_per_purge, false);
      } else {
        int cumulative_message_count = 0;
        is_message_spam = process_stream(*s_filter.get(), cin, 0, command,
                                         ignore_from, ignore_content_length, show_terms, true,
                                         config.get(), cumulative_message_count, messages_per_purge);
      }
    } else if (command == CMD_AUTO_TRAIN) {
      auto_train(*s_filter.get(), ignore_from, ignore_content_length,
                 show_terms, config.get(), messages_per_purge, true, argv + optind);
    } else if (command == CMD_AUTO_LEARN) {
      auto_train(*s_filter.get(), ignore_from, ignore_content_length,
                 show_terms, config.get(), messages_per_purge, false, argv + optind);
    } else {
      // every remaining argument is a file to process
      int cumulative_message_count = 0;
      for (int i = optind; i < argc; ++i) {
        File mbox(argv[i]);
        if (!mbox.isFile()) {
          cerr << "error: skipped " << mbox.getPath() << ": file not found" << endl;
        } else {
          // take a shared lock on the file before reading it
          LockFile lock(mbox);
          lock.lock(LockFD::SHARED_LOCK);

          ifstream in(argv[i]);
          if (command == CMD_IMPORT) {
            import_words(*s_filter.get(), in);
          } else {
            is_message_spam = process_stream(*s_filter.get(), in, argv[i], command,
                                             ignore_from, ignore_content_length, show_terms, false,
                                             config.get(), cumulative_message_count, messages_per_purge);
          }
        }
      }
    }

    // release the filter before returning
    s_filter.clear();

    // with -R the exit status is 0 for spam and 1 otherwise
    if (return_spam_status) {
      return is_message_spam ? 0 : 1;
    }
  } catch (InterruptedException &ex) {
    // no return here: execution continues to the final return 0
    cerr << "interrupted by signal" << endl;
  } catch (runtime_error &ex) {
    cerr << "caught runtime exception: " << ex.what() << endl;
    return 1;
  } catch (logic_error &ex) {
    cerr << "caught logic exception: " << ex.what() << endl;
    return 1;
  } catch (...) {
    cerr << "caught unknown exception" << endl;
    return 1;
  }

  return 0;
}
