/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                         Copyright (c) 1996                            */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission to use, copy, modify, distribute this software and its    */
/*  documentation for research, educational and individual use only, is  */
/*  hereby granted without fee, subject to the following conditions:     */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*  This software may not be used for commercial purposes without        */
/*  specific prior written permission from the authors.                  */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                         Author :  Alan W Black                        */
/*                         Date   :  July 1996                           */
/*-----------------------------------------------------------------------*/
/*                                                                       */
/* Simple statistics (for discrete probability distributions)            */
/*                                                                       */
/*=======================================================================*/

#include <iostream.h>
#include <fstream.h>
#include <stdlib.h>
#include <string.h>
#include "EST_String.h"
#include "EST_KV.h"
#include "EST_simplestats.h"
#include "EST_multistats.h"
#include "EST_Token.h"

// Shared empty string: most_probable() returns a reference to it when no
// item in the distribution has a count above zero.
const EST_String nullString("");   /* used in probability distributions */
// Deleter handed to nametrie.clear(): the values stored in the trie are
// ints allocated with new in EST_Discrete::init(), passed here as void *.
static void Discrete_val_delete_funct(void *d)
{
    int *stored_index = (int *)d;
    delete stored_index;
}

EST_Discrete::~EST_Discrete()
{
    // Empty the name trie, passing the deleter so that each stored
    // int value is released along with its entry.
    nametrie.clear(Discrete_val_delete_funct);
}

// Build a discrete vocabulary from a list of names.
// If init() rejects the list, a warning is written to cerr and the
// object is left empty (trie cleared, name vector resized to 0).
EST_Discrete::EST_Discrete(const EST_StrList &vocab)
{
    const bool vocab_ok = init(vocab);
    if (vocab_ok)
        return;

    cerr << "WARNING from EST_Discrete ctor : invalid vocab list !";
    nametrie.clear(Discrete_val_delete_funct);
    namevector.resize(0);
}

bool EST_Discrete::init(const EST_StrList &vocab)
{

    // initialize a new EST_Discrete to given set of names
    EST_TBI *w;
    int i,*tmp;

    p_def_val = -1;
    namevector.resize(vocab.length());

    for (i=0,w=vocab.head(); w != 0; i++,w=next(w)){
	namevector(i) = vocab.item_C(w);
	tmp = new int;
	*tmp = i;

	// check for repeated items - just not allowed
	if(nametrie.lookup(vocab.item_C(w)) != NULL)
	{
	    cerr << "EST_Discrete : found repeated item '";
	    cerr << vocab.item_C(w) << "' in vocab list !" << endl;
	    return false;
	}

	nametrie.add(vocab.item_C(w),tmp);
    }
    return true;
}


// Two vocabularies are considered equal when their name vectors are
// equal; the tries are not compared (they are assumed to agree).
bool EST_Discrete::operator ==(const EST_Discrete &d)
{
    const bool same_names = (bool)(namevector == d.namevector);
    return same_names;
}

// Inequality is decided on the name vectors alone, mirroring operator ==.
bool EST_Discrete::operator !=(const EST_Discrete &d)
{
    const bool names_differ = (bool)(namevector != d.namevector);
    return names_differ;
}

// Construct a discrete-typed distribution over vocabulary d.
//   d         - vocabulary; the pointer is stored, not copied
//   n_samples - initial total sample count
//   counts    - array of at least d->size() counts; the values are
//               COPIED into a newly allocated array
EST_DiscreteProbDistribution::EST_DiscreteProbDistribution(const EST_Discrete *d,const double n_samples, double *counts)
{
    type = tprob_discrete;
    discrete = d;
    num_samples = n_samples;
    size = d->size();

    icounts = new double[size];

    int k = 0;
    while (k < size)
    {
        icounts[k] = counts[k];
        k++;
    }
}

// Copy constructor.
//
// The count array is deep-copied; the discrete vocabulary pointer is
// shared with b.  The string-keyed counts (scounts) are copied as well,
// so that a string-typed distribution keeps its counts when copied,
// in agreement with operator=.
EST_DiscreteProbDistribution::EST_DiscreteProbDistribution(const EST_DiscreteProbDistribution &b)
{
    int i;

    type = b.type;
    num_samples = b.num_samples;
    discrete = b.discrete;
    size = b.size;

    icounts = new double[size];
    for (i=0; i<size; i++)
        icounts[i] = b.icounts[i];

    // counts of a string-typed distribution live here, not in icounts
    scounts = b.scounts;
}

// Release the count array of a discrete-typed distribution.
// Nothing is done for other types.  The discrete vocabulary pointer is
// left alone: it is not deleted here.
void EST_DiscreteProbDistribution::clear(void)
{
    if (type != tprob_discrete)
        return;

    if (size > 0)
        delete [] icounts;
    icounts = 0;
    size = 0;
}

// Reset to an empty string-typed distribution: no vocabulary, no count
// array, zero samples.  Existing storage is not freed by this call.
void EST_DiscreteProbDistribution::init(void)
{
    type = tprob_string;
    discrete = 0;
    icounts = 0;
    size = 0;
    num_samples = 0;
}

// Re-initialise as a discrete-typed distribution over a vocabulary that
// is built here from the given list of names; all counts start at zero.
//
// Always returns true: the EST_Discrete constructor only prints a
// warning for an invalid list, and the result cannot be checked through
// the const pointer kept in `discrete`.  Any pointer previously held in
// `discrete` is simply overwritten.
bool EST_DiscreteProbDistribution::init(const EST_StrList &vocab)
{
    clear();

    type = tprob_discrete;
    num_samples = 0;
    discrete = new EST_Discrete(vocab);

    size = vocab.length();
    icounts = new double[size];

    int k = 0;
    while (k < size)
    {
        icounts[k] = 0;
        k++;
    }

    return true;
}

// Re-initialise as a discrete-typed distribution over an existing
// vocabulary.  The pointer d is stored (not copied) and a zero-filled
// count array of d->size() entries is allocated.
void EST_DiscreteProbDistribution::init(const EST_Discrete *d)
{
    clear();

    type = tprob_discrete;
    num_samples = 0;
    discrete = d;
    size = d->size();

    icounts = new double[size];

    int k = 0;
    while (k < size)
    {
        icounts[k] = 0;
        k++;
    }
}

// Add `count` to the entry at index i and to the running sample total.
// No range or type check is made on i.
void EST_DiscreteProbDistribution::cumulate(const int i,double count)
{
    icounts[i] = icounts[i] + count;
    num_samples = num_samples + count;
}

void EST_DiscreteProbDistribution::cumulate(const EST_String &s,double count)
{
    EST_TBI *p;

    if (type == tprob_discrete)
    {
	int idx = discrete->index(s);
	icounts[idx]+=count;

    }
    else // its a (slow) string type 
    {

	for (p=scounts.list.head(); p != 0; p=next(p))
	{
	    if (scounts.list(p).k == s)
	    {
		scounts.list(p).v+=count; 
		break;
	    }
	}
	if (p == 0) // first occurence
	    scounts.add_item(s,count,1);  // add without search
    }
    num_samples+=count;

}

// Return the name of the item with the largest count.
//
// If prob is non-NULL it receives that item's probability.  When no item
// has a count greater than zero, nullString is returned and *prob (if
// requested) is set to 0.0.  On ties the first item encountered wins,
// since only a strictly larger count replaces the current best.
const EST_String &EST_DiscreteProbDistribution::most_probable(double *prob) const
{
    double best_count = 0;

    if (type == tprob_discrete)
    {
        int best_idx = -1;
        int k;

        for (k = 0; k < size; k++)
        {
            if (icounts[k] > best_count)
            {
                best_idx = k;
                best_count = icounts[k];
            }
        }

        if (best_count == 0)
        {
            if (prob != NULL)
                *prob = 0.0;
            return nullString;
        }

        if (prob != NULL)
            *prob = probability(best_idx);
        return discrete->name(best_idx);
    }

    // string type: scan the key/value list
    EST_TBI *best_item = 0;
    EST_TBI *cursor;

    for (cursor = scounts.list.head(); cursor != 0; cursor = next(cursor))
    {
        if (scounts.list(cursor).v > best_count)
        {
            best_item = cursor;
            best_count = scounts.list(cursor).v;
        }
    }

    if (best_count == 0)
    {
        if (prob != NULL)
            *prob = 0.0;
        return nullString;
    }

    if (prob != NULL)
        *prob = (double)best_count/num_samples;
    return scounts.list(best_item).k;
}

// Probability of the item named s: its frequency divided by the total
// sample count.  A zero frequency yields 0.0 without dividing.
double EST_DiscreteProbDistribution::probability(const EST_String &s) const
{
    const double freq = frequency(s);

    if (freq == 0.0)
        return 0.0;
    return (double)freq/num_samples;
}

// Probability of the item at index i: its frequency divided by the total
// sample count.  A zero frequency yields 0.0 without dividing.
double EST_DiscreteProbDistribution::probability(const int i) const
{
    const double freq = frequency(i);

    if (freq == 0.0)
        return 0.0;
    return (double)freq/num_samples;
}

// Count recorded for the item named s.
// Discrete type: looked up through the vocabulary index.
// String type: looked up in the key/value list, with 0 as the default.
double EST_DiscreteProbDistribution::frequency(const EST_String &s) const
{
    if (type != tprob_discrete)
        return  scounts.val_def(s,0);

    return icounts[discrete->index(s)];
}

// Count recorded for the item at index i.
// Only meaningful for the discrete type; for a string-typed distribution
// an error is written to cerr and 0 is returned.
double EST_DiscreteProbDistribution::frequency(const int i) const
{
    if (type != tprob_discrete)
    {
        cerr << "ProbDistribution: can't access string type pd with int\n";
        return 0;
    }

    return icounts[i];
}

// Set the count of the item named s to c, keeping the sample total in
// step: the old count is subtracted and the new one added.
void EST_DiscreteProbDistribution::set_frequency(const EST_String &s,double c)
{
    if (type == tprob_discrete)
    {
        const int slot = discrete->index(s);

        num_samples -= icounts[slot];
        num_samples += c;
        icounts[slot] = c;
        return;
    }

    // string type: a missing key counts as a previous value of 0
    num_samples -= scounts.val_def(s,0);
    num_samples += c;
    scounts.add_item(s,c);
}

// Set the count of the item at index i to c, keeping the sample total in
// step.  For a string-typed distribution nothing is changed and an error
// is written to cerr.
void EST_DiscreteProbDistribution::set_frequency(int i,double c)
{
    if (type != tprob_discrete)
    {
        cerr << "ProbDistribution: can't access string type pd with int\n";
        return;
    }

    num_samples -= icounts[i];
    num_samples += c;
    icounts[i] = c;
}


// Overwrite the count of the item named s with c.
// Unlike set_frequency(), the sample total is NOT adjusted.
void EST_DiscreteProbDistribution::override_frequency(const EST_String &s,double c)
{
    if (type != tprob_discrete)
    {
        scounts.add_item(s,c);
        return;
    }

    icounts[discrete->index(s)] = c;
}

// Overwrite the count of the item at index i with c.
// Unlike set_frequency(), the sample total is NOT adjusted.  For a
// string-typed distribution only an error is written to cerr.
void EST_DiscreteProbDistribution::override_frequency(int i,double c)
{
    if (type != tprob_discrete)
    {
        cerr << "ProbDistribution: can't access string type pd with int\n";
        return;
    }

    icounts[i] = c;
}

double EST_DiscreteProbDistribution::entropy() const
{
    // Returns the entropy of the current distribution
    double e=0.0;
    EST_TBI *p;
    int i;

    if (type == tprob_discrete)
    {
	for (i=0; i < size; i++)
	{
	    double prob = icounts[i]/num_samples;
	    if (prob != 0.0)
		e += prob * log(prob);  /* log10(prob)/log10(2) */
	}
    }
    else
    {
	for (p=scounts.list.head(); p != 0; p=next(p))
	{
	    double prob = scounts.list(p).v/num_samples;
	    if (prob != 0.0)
		e += prob * log(prob);  /* log10(prob)/log10(2) */
	}
    }

    return -e;

}

//  For iterating through members of a probability distribution
// Iteration support: return the handle of the first item.
// Discrete type: index 0.  String type: the head list pointer cast to int.
// NOTE(review): the pointer is carried in an int - verify that this is
// lossless on every platform this is built for.
int EST_DiscreteProbDistribution::item_start(void) const
{
    return (type == tprob_discrete) ? 0 : (int)scounts.list.head();
}

// Iteration support: non-zero when idx is past the last item.
// Discrete type: idx has reached size.  String type: idx, read back as a
// list pointer, is null.
int EST_DiscreteProbDistribution::item_end(int idx) const
{
    if (type != tprob_discrete)
        return ((EST_TBI *)idx == 0);

    return (idx >= size);
}

// Iteration support: return the handle following idx.
// Discrete type: the next index.  String type: the next list pointer,
// cast to int.
int EST_DiscreteProbDistribution::item_next(int idx) const
{
    if (type != tprob_discrete)
        return (int)next((EST_TBI *)idx);

    return idx + 1;
}

// Iteration support: name of the item identified by handle idx
// (a vocabulary index for the discrete type, a list pointer carried in
// an int for the string type).
const EST_String &EST_DiscreteProbDistribution::item_name(int idx) const
{
    if (type != tprob_discrete)
        return scounts.list((EST_TBI *)idx).k;

    return discrete->name(idx);
}

void EST_DiscreteProbDistribution::item_freq(int idx,EST_String &s,double &freq) const
{
    if (type == tprob_discrete)
    {
	s = discrete->name(idx);
	freq = icounts[idx];
    }
    else
    {
	s = scounts.list((EST_TBI *)idx).k;
	freq = scounts.list((EST_TBI *)idx).v;
    }
}

void EST_DiscreteProbDistribution::item_prob(int idx,EST_String &s,double &prob) const
{
    if (type == tprob_discrete)
    {
	prob = probability(idx);
	s = discrete->name(idx);

    }
    else
    {
	s = scounts.list((EST_TBI *)idx).k;
	prob = (double)scounts.list((EST_TBI *)idx).v/num_samples;
    }
}

// Print a distribution as
//   ((name=prob) (name=prob) ... best=NAME samples=N sum=S)
// where S is the sum of the printed probabilities.
ostream & operator<<(ostream &s, const EST_DiscreteProbDistribution &pd)
{
    EST_String item_label;
    double item_p;
    double total_p = 0;
    int handle;

    s << "(";

    handle = pd.item_start();
    while (!pd.item_end(handle))
    {
        pd.item_prob(handle,item_label,item_p);
        s << "(" << item_label << "=" << item_p << ") ";
        total_p += item_p;
        handle = pd.item_next(handle);
    }

    s << "best=" << pd.most_probable(&item_p) << " samples="
      << pd.samples() << " sum=" << total_p << ")";
    return s;
}

// Copy assignment.
//
// Deep-copies the count array and the string-keyed counts of `a`; the
// discrete vocabulary pointer is shared with `a`.
//
// Differences from a plain member-wise overwrite:
//   - self-assignment is a no-op (previously the counts were replaced by
//     a fresh, uninitialised array copied onto itself);
//   - the count array previously held by this object is released through
//     clear() instead of being leaked;
//   - no array is allocated when `a` has no counts, matching the
//     `size > 0` condition clear() uses when freeing.
EST_DiscreteProbDistribution &EST_DiscreteProbDistribution::operator=(const EST_DiscreteProbDistribution &a)
{
    int i;

    if (this == &a)
        return *this;

    // build the replacement array first, while our own state is intact
    double *fresh_counts = 0;
    if (a.size > 0)
    {
        fresh_counts = new double[a.size];
        for (i=0; i<a.size; i++)
            fresh_counts[i] = a.icounts[i];
    }

    // frees the old icounts (clear() only acts on the discrete type)
    clear();

    type = a.type;
    num_samples = a.num_samples;
    discrete = a.discrete;
    size = a.size;
    icounts = fresh_counts;
    scounts = a.scounts;
    return *this;
}

// Print every name in the vocabulary, in index order, each followed by a
// single space.
ostream& operator <<(ostream& s, const EST_Discrete &d)
{
    int k = 0;
    while (k < d.size())
    {
        s << d.name(k) << " ";
        k++;
    }
    return s;
}
