/****************************************************************************
**  SCALASCA    http://www.scalasca.org/                                   **
*****************************************************************************
**  Copyright (c) 1998-2013                                                **
**  Forschungszentrum Juelich GmbH, Juelich Supercomputing Centre          **
**                                                                         **
**  Copyright (c) 2009-2013                                                **
**  German Research School for Simulation Sciences GmbH,                   **
**  Laboratory for Parallel Programming                                    **
**                                                                         **
**  This software may be modified and distributed under the terms of       **
**  a BSD-style license.  See the COPYING file in the package base         **
**  directory for details.                                                 **
****************************************************************************/


#include <config.h>
#include <pearl/pearl.h>

#include <cfloat>
#include <new>

#include <pearl/Callpath.h>
#include <pearl/Callsite.h>
#include <pearl/Enter_rep.h>
#include <pearl/Error.h>
#include <pearl/Event.h>
#include <pearl/GlobalDefs.h>
#include <pearl/Leave_rep.h>
#include <pearl/LocalTrace.h>
#include <pearl/Region.h>

#include "Calltree.h"
#include "DefsFactory.h"
#include "EventFactory.h"
#include "SmallObjAllocator.h"
#include "Threading.h"

using namespace std;
using namespace pearl;


//--- Local variables -------------------------------------------------------

namespace {
    /// Mutex variable serializing accesses to shared definition data in
    /// PEARL_verify_calltree(): it is held while the global time offset is
    /// compared/updated and while call paths are looked up in the call tree.
    Mutex calltreeMutex;
}   // unnamed namespace


//--- Local constants -------------------------------------------------------

namespace {
    /// Result codes of the ENTER/LEAVE nesting check. A file-local enum is
    /// used instead of preprocessor macros so that these very generic names
    /// obey scoping rules and cannot rewrite identifiers in other headers.
    /// The numeric values are unchanged and still convert implicitly to int.
    enum {
        BALANCED        = 0,   ///< No nesting error detected (so far)
        TOO_MANY_ENTERS = 1,   ///< More ENTER than LEAVE events encountered
        TOO_MANY_LEAVES = 2    ///< More LEAVE than ENTER events encountered
    };
}   // unnamed namespace


//--- Function prototypes ---------------------------------------------------

namespace pearl
{

/// Out-of-memory handler which PEARL_init() installs via set_new_handler().
/// Only declared here; the definition is expected to live in another
/// translation unit.
void PEARL_new_handler();

}   // namespace pearl


//--- Library initialization ------------------------------------------------

/**
 *  Initializes the PEARL library and installs a custom out-of-memory handler.
 *  It is required to call one of the PEARL initialization functions before
 *  calling any other PEARL function or instantiating any PEARL class.
 *
 *  @attention Make sure to initialize PEARL before installing any exit
 *             handlers using atexit(). Otherwise, you might get error
 *             messages about memory leaks.
 *
 *  @note This function is intended to be used in serial PEARL programs.
 *
 *  @see PEARL_omp_init(), PEARL_mpi_init(), PEARL_hybrid_init()
 **/
void pearl::PEARL_init()
{
  // Register factories & allocators
  DefsFactory::registerFactory(new DefsFactory());
  EventFactory::registerFactory(new EventFactory());
  SmallObjAllocator::registerAllocator(new SmallObjAllocator());

  // Register new handler (out of memory)
  set_new_handler(PEARL_new_handler);
}


//--- Call tree verification ------------------------------------------------

/**
 *  Verifies whether the global call tree provided by the trace definition
 *  data @a defs is complete with respect to the local @a trace data. If not,
 *  the process-local call tree is extended accordingly. This has to be done
 *  before PEARL_preprocess_trace() is called.
 *
 *  As a side effect, the global time offset stored in @a defs is lowered to
 *  the timestamp of the first local event if that timestamp is smaller.
 *  While walking the trace, the nesting depth of ENTER/LEAVE events is
 *  tracked; an unbalanced trace is reported via an exception.
 *
 *  @param defs  Global definitions object
 *  @param trace Local trace data object
 *
 *  @exception FatalError if the trace contains more LEAVE than ENTER events
 *                        or vice versa
 *
 *  @see PEARL_mpi_unify_calltree(), PEARL_preprocess_trace()
 *
 *  @todo Add trace consistency checks (matching ENTER/LEAVE)
 **/
void pearl::PEARL_verify_calltree(GlobalDefs& defs, const LocalTrace& trace)
{
  // Check whether time offset subtraction is required
  // (an empty trace keeps DBL_MAX and therefore never lowers the offset)
  Event       event     = trace.begin();
  timestamp_t firstEvtT = DBL_MAX;
  if (event != trace.end())
    firstEvtT = event->getTimestamp();
  calltreeMutex.lock();
  if (firstEvtT < defs.getGlobalOffset())
    defs.setGlobalOffset(firstEvtT);
  calltreeMutex.unlock();
  PEARL_Barrier();

  // This variable is shared!
  // Used to provide fork cnode on master thread to worker threads
  static Callpath* fork_cnode = NULL;

  // Calltree verification
  Calltree* ctree   = defs.get_calltree();
  Callpath* current = NULL;
  long      depth   = 0;
  int       status  = BALANCED;
  while (event != trace.end()) {
    // OpenMP fork event, can only occur on master thread
    if (event->isOfType(OMP_FORK))
    {
      // Provide current cnode to worker threads
      fork_cnode = current;
    }

    // Enter event
    else if (event->isOfType(GROUP_ENTER))
    {
      Enter_rep& enter = event_cast<Enter_rep>(*event);

      // Begin of parallel region, immediately after OpenMP fork on master
      // ==> get parent cnode from master
      if (is_omp_parallel(enter.getRegionEntered())) {
        // Make sure all threads have reached this point
        PEARL_Barrier();

        if (PEARL_GetThreadNumber() != 0) {
          // Verify correct nesting of Enter/Leave events on worker threads
          // for *previous* parallel region
          // Something left on the call stack ==> too many Enter events
          if (depth > 0 && status == BALANCED)
            status = TOO_MANY_ENTERS;

          // Get parent cnode from master thread
          current = fork_cnode;
        }

        // Synchronize again to avoid race conditions
        PEARL_Barrier();
      }

      // Update global call tree
      calltreeMutex.lock();
      current = ctree->getCallpath(enter.getRegionEntered(),
                                   enter.getCallsite(),
                                   current);
      calltreeMutex.unlock();

      // Increase call stack depth
      depth++;
    }

    // Leave event
    else if (event->isOfType(LEAVE))
    {
      // Decrease call stack depth
      depth--;

      // Verify correct nesting of Enter/Leave events
      // Call stack empty ==> too many Leave events
      if (depth < 0 && status == BALANCED)
        status = TOO_MANY_LEAVES;

      // Update current cnode
      // With surplus Leave events there may be no cnode left to ascend from.
      // Skip the step in that case so that the imbalance is reported through
      // the FatalError below instead of dereferencing a NULL pointer.
      if (current != NULL)
        current = current->getParent();
    }

    // Go to next event
    ++event;
  }

  // Verify correct nesting of Enter/Leave events
  // Something left on the call stack ==> too many Enter events
  // Call stack empty ==> too many Leave events
  if (depth < 0 || status == TOO_MANY_LEAVES)
    throw FatalError("Unbalanced ENTER/LEAVE events (Too many LEAVEs).");
  else if (depth > 0 || status == TOO_MANY_ENTERS)
    throw FatalError("Unbalanced ENTER/LEAVE events (Too many ENTERs).");
}


//--- Trace preprocessing ---------------------------------------------------

/**
 *  Performs some local preprocessing of the given @a trace which is required
 *  to provide the full trace-access functionality. This has to be done as the
 *  last step in setting up the data structures, i.e., after calling
 *  PEARL_verify_calltree() and PEARL_mpi_unify_calltree().
 *
 *  @param defs  Global definitions object
 *  @param trace Local trace data object
 *
 *  @see PEARL_verify_calltree(), PEARL_mpi_unify_calltree()
 **/
void pearl::PEARL_preprocess_trace(const GlobalDefs& defs,
                                   const LocalTrace& trace)
{
  // No explicit thread synchronization is required here. The threads will be
  // synchronized when examining the Enter event of the first parallel region
  // of the trace.

  // This variable is shared!
  // Used to provide fork cnode on master thread to worker threads
  static Callpath* fork_cnode = NULL;

  // Precompute callpath pointers and apply global time offset shifting
  timestamp_t offset  = defs.getGlobalOffset();
  Calltree*   ctree   = defs.get_calltree();
  Callpath*   current = NULL;
  Event       event   = trace.begin();
  while (event != trace.end()) {
    // Perform global time offset shift
    event->setTimestamp(event->getTimestamp() - offset);

    // OpenMP fork event, can only occur on master thread
    if (event->isOfType(OMP_FORK))
    {
      // Provide current cnode to worker threads
      fork_cnode = current;
    }

    // Enter event
    else if (event->isOfType(GROUP_ENTER))
    {
      Enter_rep& enter = event_cast<Enter_rep>(*event);

      // Begin of parallel region, immediately after OpenMP fork on master
      // ==> get parent callpath from master
      if (is_omp_parallel(enter.getRegionEntered())) {
        // Make sure all threads have reached this point
        PEARL_Barrier();

        if (PEARL_GetThreadNumber() != 0) {
          // Get parent cnode from master thread
          current = fork_cnode;
        }

        // Synchronize again to avoid race conditions
        PEARL_Barrier();
      }

      // Update current callpath
      // Only read access, so no synchronization necessary
      current = ctree->getCallpath(enter.getRegionEntered(),
                                   enter.getCallsite(),
                                   current);

      // Set callpath pointer
      enter.setCallpath(current);
    }

    // Leave event
    else if (event->isOfType(LEAVE))
    {
      Leave_rep& leave = event_cast<Leave_rep>(*event);
      leave.setCallpath(current);

      // Update current callpath
      current = current->getParent();
    }

    ++event;
  }

  // Synchronize threads
  PEARL_Barrier();
}
