// Copyright (C)  2000 Intel Corporation.  All rights reserved.
//
// $Header: /usr/development/orp/orp/common/gc_v2/cheney_fetcher.cpp,v 1.12 2002/01/09 14:50:13 weldon Exp $
//


//Use _beginthreadex instead of CreateThread so libc knows what to do.

#include <stdio.h>
#include <stdlib.h>
#include "gc_header.h"
#include "descendents.h"
#include "gc_perf.h"


#ifdef ORP_NT
#include <process.h>    /* _beginthread, _endthread */
#include "gc_globals.h"

// Count of fetch units processed since the last report. It is bumped once per
// pass of the inner work loop in fetcher_thread and is printed and then reset
// to zero by stop_cheney_clock.
int units_fetched = 0;

#ifdef _DEBUG
// These are some accounting numbers that can be used to ensure that 
// real work is being done. If the monitor_waits are much less than the
// monitor_writes then the fetcher thread is being kept very busy and
// we should try to decrease the amount of work that it is doing.
// Waits should never be more than 1 greater than the writes.
// objects_fetched/units_fetched should give us a rough idea of how
// many objects are in a unit.
// All three are reported and cleared by stop_cheney_clock. They are plain
// ints touched from more than one thread, so treat the values as approximate.
int objects_fetched = 0;    // bumped by the fetch routines
int monitor_waits = 0;      // bumped on every jt_monitor_wait call
int monitor_writes = 0;     // bumped on every jt_monitor_write call
#endif 

// The software convention is straight forward. The fetching thread
// can only change the state from stopping to stopped and dieing to dead.
// The cheney thread must do all the other changes.
// Life cycle of the fetcher thread, as recorded in fetcher_thread_state.
enum fetcher_thread_states {
    unborn,     // initial value, before init_fetcher_thread has run
    working,    // set by init_fetcher_thread and start_fetcher_thread
    stopping,   // set by stop_fetcher_thread to request a pause
    stopped,    // set by the fetcher (wait_for_more_work) to acknowledge stopping
    dieing,     // set by kill_fetcher_thread to request termination
    dead        // set by the fetcher (wait_for_more_work) just before it returns
};

// Current state of the fetcher thread; see fetcher_thread_states for who is
// allowed to make each transition.
volatile fetcher_thread_states fetcher_thread_state = unborn;

// The block the fetcher thread should be working on, or NULL when there is none.
// Written by start_fetcher_thread / stop_fetcher_thread / kill_fetcher_thread and
// polled by fetcher_thread.
// NOTE(review): as declared, the pointed-to block_info is volatile but the pointer
// variable itself is not, so the compiler is not obliged to re-read fetch_block on
// every poll. Confirm whether "block_info * volatile" was intended.
volatile block_info *fetch_block = NULL;

// Win32 event used by jt_monitor_write / jt_monitor_wait. It is zero until
// jt_monitor_wait_init creates it; init_fetcher_thread relies on that to detect
// a second initialization.
HANDLE jt_monitor_wait_event = (HANDLE)0;

// Use to dump out information about the trace for debugging. Turn off
// when running the actual tests.
//

// Debug trace hook for the fetcher protocol. Flip the "#if 0" below to 1 to have
// each call write its tag string to orp_cout; as shipped the call does nothing.
inline void gc_trace_fetcher (const char *string_x)
{
#if 0
    orp_cout << string_x;
#else
    // Tracing disabled: deliberately ignore the tag.
    (void)string_x;
#endif
}
// These two routines work as simple communication primitives between threads.
// The simplest way to look at this is that jt_monitor writes value into location
// and informs threads executing a jt_wait that the write has taken place.
// All threads executing jt_monitor_wait will be in a wait state and it 
// will leave the wait state returning the value in the location specified.

// The goal here is to define these routines so that we can map them into single 
// instructions. 

// Here the jt_monitor_wait and jt_monitor_write do not have to be 
// reading and writing to the same location. 
// Is this what we want or do we want to enforce that the wait location 
// matches the write location?

// If location is in WB memory then all writes prior to the 
// jt_monitor_write will be visible to the thread doing the wait. Should
// loads be allowed to pass the jt_monitor_write? I think so.

// On the wait side should we allow reads or writes
// to pass the wait. As far as reads and writes between the two
// threads is concerned I think they need to be ordered w.r.t. 
// the jt_monitor_wait. I need help expressing this in terms
// compatible with the Pentium constructs such as lfence, sfence, 
// and mfence.

// And we should punt if *location is in WC by stating the *location
// must be in WB if one expects memory access ordering to be defined.
// Wake whoever is currently blocked in jt_monitor_wait.
// Takes no arguments and returns nothing; in _DEBUG builds it also counts the call.
void jt_monitor_write ()
{
#ifdef _DEBUG
    monitor_writes++;
#endif
    gc_trace_fetcher ("^");

    // The event is pulsed rather than set, so a pulse that arrives while nobody
    // is waiting is simply lost. stop_fetcher_thread and kill_fetcher_thread
    // cope with that by pulsing repeatedly until the state they want shows up.
    // NOTE(review): Microsoft documents PulseEvent as unreliable (a waiter that
    // is momentarily out of its wait can miss the pulse) - confirm that every
    // caller tolerates a missed wake-up before depending on a single pulse.
    ////    SetEvent(jt_monitor_wait_event);
    PulseEvent(jt_monitor_wait_event);
}

void jt_monitor_wait ()
{
#ifdef _DEBUG
    monitor_waits++;
#endif
    gc_trace_fetcher ("v");

    WaitForSingleObject(jt_monitor_wait_event, INFINITE);
}

// This routine must be called prior to the above routines 
// jt_monitor_write and jt_monitor_wait being used.
void jt_monitor_wait_init ()
{
    // The cheney scan uses this event to tell the fetcher there is work to do.
    // Arguments: default security, manual reset (TRUE) because the writer pulses
    // it, initially not signaled (FALSE), and no name.
    jt_monitor_wait_event = CreateEvent(NULL, TRUE, FALSE, NULL);
    if (jt_monitor_wait_event) {
        return; // event created, nothing more to do
    }

    // Creation failed: complain, trap in debug builds, then bail out hard.
    orp_cout << "jt monitor wait event is bogus." << endl;
    assert(0);
    orp_exit(911);
}

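// Fetch into the cache the lines needed to scan this object.
// Input - obj - the object that is about to be scanned by the other scanner thread.
// Note - the fetch routines below (fetch_object, fetch_array_object) return nothing;
//        the fetcher loop finds the object following this one by adding
//        get_object_size_bytes to the current position.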

// Assume that the cache line is 64 bytes. Oddly enough, if the real line is larger
// these routines will run just about as fast, since we will just be fetching the
// same line multiple times, which should not be too expensive.

#define CACHE_LINE_SIZE 64
//
// This takes a slot in the heap in an array or an object, if the target is in a c area
// we move the target and update the slot. We also check to see if the slot
// is interesting and if it is we mark the related card.
// NOTE(review): the paragraph above reads like a description of slot processing
// rather than of the declaration that follows - confirm where it belongs.
//
// Forward declaration: fetch_object hands arrays over to fetch_array_object, which
// is defined further down in this file.
void fetch_array_object(Java_java_lang_Object *p_object);
//
// fetch_slot - Given a slot in an object (or array) scan it, possible causing the
//             object to be fetceh in preperation for allocating a version in "TO" space. 
//             If the object is in "TO" space then update the slot to point to the correct version.
// return    - true if this results in an interesting pointer being installed that 
//             requires that the corresponding card needs to be marked.
//
boolean fetch_slot(Java_java_lang_Object **pp_target_object, block_info *slot_block_info) 
{
    boolean mark_card = false;
    register POINTER_SIZE_INT prefetch_reg = 0;
    // ignore NULL
    if (*pp_target_object != NULL) {
        //
        // Race condition, **pp_target_object could be forwarded but the mover thread updates the slot
        // *pp_target_object before we have a chance.
        //
        // Grab the object once, if the slot (*pp_target_object) is updated then we just end up
        // doing it twice.
        Java_java_lang_Object *p_the_object = *pp_target_object;
        block_info *target_block_info = GC_BLOCK_INFO(p_the_object);
        assert (!slot_block_info->in_nursery_p); // We never scan or fetech nursery items.
        // We have a slot holding an object in a c area. 
        // If the target c area is a nursery, move it to the step.
        // If the target c area is a step, move it to the youngest car in the youngest train.
        
        if (target_block_info->c_area_p) {
            if (is_object_forwarded(p_the_object)) {
                // This object has already been copied into "to" space so update the slot.
                // OK update the slot. mover thread might have beat us to this point but that is OK.
                *pp_target_object = p_get_already_forwarded_object(p_the_object);
                // Mover thread needs this in the cache also.
                target_block_info = GC_BLOCK_INFO(*pp_target_object);
                // target object is in a nursery so we move the object into a step.
            } else {
                // fetch the object in "from" space into the cache so
                // that the allocater thread has it available.           
                unsigned int lines = 1 +(get_object_size_bytes(p_the_object) / CACHE_LINE_SIZE);
                POINTER_SIZE_INT *cache_line = (POINTER_SIZE_INT *)p_the_object;
                while (lines) { // THIS SHOULD BE A PREFETCH
                    // How do I do this for Pete sake
//
// The prefetch I want is a prefetch into level 2 cache it has the opcode  0f 18 /2   PREFETCHT2 m8
// is it "_emit 0fh _emit 18h emit -2h cache_line" or what I need a clue here.
//
//                    __asm {
//                        prefetcht2 cache_line
//                    }
//
// __asm nop
// __asm mov  edx,[cache_line]
                   prefetch_reg = *cache_line; // fetch the line into the cache.

                    cache_line = (POINTER_SIZE_INT *) ((POINTER_SIZE_INT)cache_line + CACHE_LINE_SIZE);
                    lines--;
                }
            }
        }
        // Only in MOS are interesting.
        // YOS and LOS objects are considered born at time 0 so are always older.
        // Slots in trains younger than targets are interesting. This includes all MOS->YOS
        // Slots and targets in the same train where target is in younger car is interesting.
        // Since we are comparing birthdays > means younger. Someone born in 1980 is younger (>) than 1950 
        if (slot_block_info->train_birthday != 0)  {
            // Slot is in MOS.
            if (slot_block_info->train_birthday > target_block_info->train_birthday) {
                // Slot is in younger train so remember this.
                mark_card = true;
            } else {
                if (slot_block_info->train_birthday == target_block_info->train_birthday) {
                    // We are in the same train.
                    if (slot_block_info->car_birthday > target_block_info->car_birthday) {
                        // We are in the same train but slot car is younger than target car
                        mark_card = true;
                    }
                }
            }
        }
    }
    return mark_card;
}
//
// fetch_object - Run fetch_slot over every reference slot of p_object and, if any
//                slot turned out to be interesting, mark the card for p_object in
//                its block's card table. Arrays are handed to fetch_array_object.
// p_object - the object whose slots are to be visited.
//
void fetch_object(Java_java_lang_Object *p_object)
{
    gc_trace (p_object, "In fetch_object scanning this object");

    if (is_array(p_object)) {
        // fetch_array_object does its own accounting and card marking.
        fetch_array_object (p_object);
        return;
    }

#ifdef _DEBUG
    // Count non-array objects here. Arrays are counted inside fetch_array_object,
    // so between the two every fetched object is counted exactly once and
    // objects_fetched/units_fetched reflects all objects rather than arrays only.
    objects_fetched++;
#endif

    boolean mark_card = false; // set to true if we find a slot that is interesting.
    block_info *slot_block_info = GC_BLOCK_INFO(p_object);
    unsigned int *offset_scanner = init_object_scanner (p_object);
    Java_java_lang_Object **pp_target_object;
    while ((pp_target_object = p_get_ref(offset_scanner, p_object)) != NULL) {
        // Move the scanner to the next reference.
        offset_scanner = p_next_ref (offset_scanner);
        if (fetch_slot (pp_target_object, slot_block_info)) {
            mark_card = true;
        }
    }

    if (mark_card) {
        // Mark the card so it will be scanned next time around.
        slot_block_info->card_table[GC_CARD_INDEX(p_object)] = true;
    }
}

//
// fetch_array_object - Array flavour of fetch_object: run fetch_slot over every
//                      element slot of the array and mark the array's card if any
//                      slot turned out to be interesting.
// p_object - the array to visit; arrays of primitives are skipped.
//
void fetch_array_object(Java_java_lang_Object *p_object)
{
#ifdef _DEBUG
    objects_fetched++;
#endif

    bool needs_card = false;
    block_info *owner_block = GC_BLOCK_INFO(p_object);

    // Nothing to look at when the elements are primitives.
    if (is_array_of_primitives(p_object)) {
        return;
    }

    // The array scanner hands the elements back from the last one to the first.
    unsigned int cursor = init_array_scanner (p_object);

    // Visit each element slot until the scanner runs dry.
    Java_java_lang_Object **pp_slot = p_get_array_ref(p_object, cursor);
    while (pp_slot != NULL) {
        cursor = next_array_ref (cursor);
        if (fetch_slot (pp_slot, owner_block)) {
            needs_card = true;
        }
        pp_slot = p_get_array_ref(p_object, cursor);
    }

    // Mark the card so it will be scanned next time around.
    if (needs_card) {
        owner_block->card_table[GC_CARD_INDEX(p_object)] = true;
    }
}

//
// When we need to wait for the cheney worker thread to catch up we come here.
// If we return true it means we might have more work. If we return false
// there is no more work for this thread to do and the thread should exit.
//
boolean wait_for_more_work () {
    // There is a window here: fetch_block can change and the matching
    // jt_monitor_write can fire before we start waiting. The pulse is then
    // missed and we lose a chance to prefetch, nothing worse.
    jt_monitor_wait();

    // A stop request is acknowledged and then we park until the next wake-up.
    if (fetcher_thread_state == stopping) {
        fetcher_thread_state = stopped;
        jt_monitor_wait();
    }

    // Anything other than a request to die means there may be more work.
    if (fetcher_thread_state != dieing) {
        return true;
    }

    gc_trace_fetcher ("D");

    // Acknowledge the request. The caller simply returns from the thread body;
    // this happens at system bring down time.
    fetcher_thread_state = dead;
    return false;
}

//
// fetcher_thread - Body of the fetcher thread. It repeatedly picks up fetch_block
//                  and runs fetch_object over the objects between the block's scan
//                  pointer and a point fetcher_gap bytes ahead (never past the
//                  block's free pointer), waiting in wait_for_more_work between
//                  bursts.
// lpParameter_ignored - unused.
// Returns only once wait_for_more_work reports that the thread was told to die.
//
void fetcher_thread(LPVOID lpParameter_ignored)
{
    // fetch_block starts out NULL, so the first trip round the loop falls through
    // to the wait at the bottom until start_fetcher_thread supplies a block.
    volatile block_info *block_being_fetched = NULL;
    while (true) {
        
        gc_trace_fetcher ("t");
        block_being_fetched = fetch_block; // Get the work to do.

        if  (block_being_fetched) {
            
            // We have work to do.
            // Start fetching at the scan pointer.
            void *fetch_ptr = block_being_fetched->scan;
            void *gap_ptr = (char *)fetch_ptr + fetcher_gap;
            void *free_ptr = NULL;

            // Keep going while we are still assigned this block and have not
            // reached its free pointer.
            while ((block_being_fetched == fetch_block) && (fetch_ptr < block_being_fetched->free)) {
                
                free_ptr = block_being_fetched->free;
                // Don't ever go past the (current) free ptr.
                if (free_ptr < gap_ptr) {
                    gap_ptr = free_ptr;
                }

                units_fetched++;

                // free pointer can move but if it does we don't need to chase it since the
                // normal thread will have moved the objects into the cache, lets go find some different
                // work to do.
                // do the work
                while(fetch_ptr < gap_ptr) {
                    fetch_object(P_OBJ_FROM_START(fetch_ptr));
                    // Step to the next object using byte-wise pointer arithmetic.
                    // Casting the pointer to unsigned int here would chop off the
                    // upper half of the address on a 64-bit target.
                    fetch_ptr = (void *)((char *)fetch_ptr + get_object_size_bytes(P_OBJ_FROM_START(fetch_ptr)));
                }
                
                if (fetch_ptr < block_being_fetched->scan) {
                    // It might be that the cheney worker thread has caught up so we need
                    // to jump ahead to the scan pointer and go back to work.
                    fetch_ptr = block_being_fetched->scan;
                }
                
                if (((char *)(block_being_fetched->scan) + fetcher_gap) >= fetch_ptr) {
                    // We are no more than fetcher_gap ahead of the scanner: do the wait.
                    if (!wait_for_more_work ()) {
                        return;
                    }
                }
                // The scan pointer has moved on so lets do some more scanning.
                // Notice we never scan beyond 2 times the fetcher_gap.
                gap_ptr = (char *)fetch_ptr + fetcher_gap;
            }
            
            // wait until we have more work to do.
            gc_trace_fetcher ("+");
        }
        if (fetch_block) {
            if ((fetch_block != block_being_fetched)) {
                
                gc_trace_fetcher ("c");
                
                continue; // loop back we have more work to do.
            }
        }
        
        if (!wait_for_more_work()) {
            return;
        }
        gc_trace_fetcher ("b");
    }
}

//
// Initializes the cheney fetcher thread. 
// Aborts hard if it fails
//

void init_fetcher_thread()
{
    // Make sure we have not been here before.
    assert (!jt_monitor_wait_event);
    
    jt_monitor_wait_init();
    // Create the cheney_fetcher thread.
    // First 2 params ignored, 
    // Third param is thread body
    // Fourth is param to thread body that is ignored.
    // fifth is 0 which means start up the thread immediately.
    // sixth says don't pass back a thread id.
    
    assert(fetcher_thread_state == unborn);
    fetcher_thread_state = working;
    //	 cheney_fetcher_thread = CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)fetcher_thread,NULL,0,NULL);
    
    
    unsigned long cheney_fetcher_thread = _beginthreadex( 0, (64*1024), ( unsigned int (__stdcall *)(void *) )fetcher_thread,NULL,0,NULL);
    
    
    if (!cheney_fetcher_thread) {
        orp_cout << "Cheney fetcher thread is bogus." << endl;
        orp_exit(911);
    }
    // Here is where affinity information goes but W2K doesn't have an interface yet so I'm ignoring it for now.
}

// Hand the fetcher thread a block to work on and wake it up.
// block_to_fetch - the block that is published through fetch_block.
// Must not be called while a stop request is still pending (state == stopping).
void start_fetcher_thread(block_info *block_to_fetch)
{
    assert(fetcher_thread_state != stopping);

    gc_trace_fetcher ("s");

    // Order matters: publish the state and the block first, then pulse the
    // monitor so that a waiting fetcher wakes up and sees both.
    fetcher_thread_state = working; 
    fetch_block = block_to_fetch;
    jt_monitor_write();
}

// Take the current block away from the fetcher thread and do not return until
// the fetcher has acknowledged by moving the state to stopped.
void stop_fetcher_thread()
{
    assert(fetcher_thread_state != stopping);

    // Withdraw the work first, then post the request.
    fetch_block = NULL;
    fetcher_thread_state = stopping;

    gc_trace_fetcher ("x");

    // A single pulse can be missed if the fetcher is not waiting at that
    // instant, so keep pulsing until the acknowledgement shows up.
    for (;;) {
        if (fetcher_thread_state == stopped) {
            break;
        }

        gc_trace_fetcher ("-");

        jt_monitor_write();
    }
    assert(fetcher_thread_state == stopped);
}

// Ask the fetcher thread to terminate and do not return until it has
// acknowledged by moving the state to dead.
void kill_fetcher_thread()
{
    fetcher_thread_state = dieing;

    gc_trace_fetcher ("d");

    fetch_block = NULL;
    // A single pulse can be missed if the fetcher is not waiting at that
    // instant, so keep pulsing until the acknowledgement shows up.
    while (fetcher_thread_state != dead) { 

        gc_trace_fetcher ("d");

        jt_monitor_write();
    }
    assert(fetcher_thread_state == dead);
}
// These are called at the start and at the end of each cheney scan. They
// can be used for collecting interesting information.
// Performance-counter reading taken by start_cheney_clock; stop_cheney_clock
// measures the elapsed time from it.
LARGE_INTEGER cheney_fetch_start_time;
// Running sum of the per-scan times computed in stop_cheney_clock.
unsigned long int cheney_fetch_total_time = 0;

// Record the performance counter at the start of a cheney scan.
void start_cheney_clock()
{
    QueryPerformanceCounter(&cheney_fetch_start_time);
}

int cheney_count = 0;

void stop_cheney_clock()
{
    cheney_count++;
    LARGE_INTEGER cheney_fetch_stop_time;
    QueryPerformanceCounter(&cheney_fetch_stop_time);
    unsigned long this_fetch_time = get_time_microseconds(cheney_fetch_start_time,
        cheney_fetch_stop_time);
    cheney_fetch_total_time += this_fetch_time;
    orp_cout << "<--Cheney fetch " << cheney_count << " latency (microsec) = " << this_fetch_time << " total time = " << cheney_fetch_total_time << " >" << endl;
    orp_cout << "<--Cheney units fetched = " <<  units_fetched  << " > " << endl;;
    units_fetched = 0;
  
#ifdef _DEBUG
     orp_cout << "<--Cheney fetch objects = " << objects_fetched << ", waits = " << monitor_waits << " writes = " << monitor_writes << " > " << endl;;
     objects_fetched = 0;
     monitor_waits = 0;
     monitor_writes = 0;
#endif
}

#else

//
// Builds without ORP_NT have no fetcher thread. Every entry point below is a
// stub that only reports that fact on orp_cout.
//

// Single place for the message shared by all the stubs.
static void report_fetcher_unsupported()
{
    orp_cout << "-gc fetcher_thread Not supported for Linux." << endl;
}

// Stub: no thread is created.
void init_fetcher_thread()
{
    report_fetcher_unsupported();
}

// Stub: block_to_fetch is ignored.
void start_fetcher_thread(block_info *block_to_fetch)
{
    report_fetcher_unsupported();
}

// Stub: there is nothing to stop.
void stop_fetcher_thread()
{
    report_fetcher_unsupported();
}

// Stub: no timing is recorded.
void start_cheney_clock()
{
    report_fetcher_unsupported();
}

// Stub: no timing is reported.
void stop_cheney_clock()
{
    report_fetcher_unsupported();
}

#endif
