/* nadaraya_watson_mc.c
 *
 * Copyright (C) 2007 Stephane Germain
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

/**
   \file
   \brief Functions to estimate the option response functions by Nadaraya-Watson.
   (Kernel smoothing, Testgraf).

   \author Stephane Germain <germste@gmail.com>
*/

#include "libirt.h"

#include <stdio.h>
#include <math.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_sort_vector_double.h>
#include <gsl/gsl_cdf.h>
#include <gsl/gsl_randist.h>

/**
   \brief Estimate the response functions by Nadaraya-Watson kernel smoothing (Testgraf).

   @param[in] bandwidth The smoothing parameter, will be multiply by the number
   of distinct scores to the power of -0.2 to give the bandwidth.
   @param[in] patterns A matrix(patterns x options) of binary response.
   @param[in] items_pos A vector(items) with the position of the first option of each item
   in patterns (and probs).
   @param[in] nbr_options A vector(items) with the number of option of each item
   in patterns (and probs).
   @param[in] options_weights A vector(options) of weights to compute the score.
   @param[in] quad_points A vector(classes) with the middle points of each quadrature class.
   @param[in] quad_weights A vector(classes) with the prior weights of each quadrature class.
   @param[out] probs A matrix(options x classes) with the estimated response functions.
   @param[out] probs_stddev matrix(options x classes) with the standard error.

   \warning Do not group the patterns before.

   \warning The memory for the response functions should be allocated before.
*/
void
nadaraya_watson_mc (double bandwidth, gsl_matrix_int * patterns,
		    gsl_vector_int * items_pos, gsl_vector_int * nbr_options,
		    gsl_vector * options_weights,
		    gsl_vector * quad_points, gsl_vector * quad_weights,
		    gsl_matrix * probs, gsl_matrix * probs_stddev)
{
  int nbr_quad = quad_points->size;
  int nbr_subject = patterns->size1;
  int nbr_option_tot = patterns->size2;
  int nbr_item = items_pos->size;
  int i, j, k, g, resp, k_max, o, pos, nbr_option;
  double score_tmp, first_weight, step_size, num_con, den_con, weight, resp_d;
  gsl_vector * scores = gsl_vector_alloc (nbr_subject);
  gsl_vector * abil_discr = gsl_vector_alloc (nbr_quad);
  gsl_matrix * resp_discr = gsl_matrix_alloc (nbr_option_tot, nbr_quad);
  gsl_vector * kern_discr = gsl_vector_alloc (nbr_quad);
  gsl_permutation * perm = gsl_permutation_alloc(nbr_subject);

  /* compute the total scores */
  for(j = 0; j < nbr_subject; j++) {
    score_tmp = 0;
    for(i = 0; i < nbr_item; i++) {
      nbr_option = gsl_vector_int_get(nbr_options, i);
      pos = gsl_vector_int_get(items_pos, i);
      for (o = 0; o < nbr_option; o++)
	{
	  weight = options_weights?gsl_vector_get(options_weights, pos+o):1;
	  resp = gsl_matrix_int_get(patterns, j, pos+o);
	  if(resp==BLANK)
	    score_tmp += weight / nbr_option;
	  else
	    score_tmp += resp * weight;	
	}
    }
    gsl_vector_set(scores, j, score_tmp);
  }

  /* get the rank of each pattern */
  gsl_sort_vector_index (perm, scores);

  /* estimate the initial abilities */

  /* get the rank not taking into account duplicate scores */
  /* loop in ascending order using perm */
  for(g = 0; g < nbr_subject; g++) {
    j = gsl_permutation_get(perm,g);
    gsl_vector_set(scores, j, g);
  }

  /* use the inverse gausian cdf */
  for(j = 0; j < nbr_subject; j++) {
    gsl_vector_set(scores, j,
		   gsl_cdf_ugaussian_Pinv((gsl_vector_get(scores, j)+1.0)
					  / (nbr_subject+1.0)));
  }  

  /* discretize the data */

  gsl_vector_set_all (abil_discr, 0);
  gsl_matrix_set_all (resp_discr, 0);

  k = 0; /* the nearest classes are k-1 and k */

  for(g = 0; g < nbr_subject; g++) { /* loop in ascending order using perm */

    j = gsl_permutation_get(perm,g);
    score_tmp = gsl_vector_get(scores, j);

    /* locate the nearest classes */
    while(k < nbr_quad && gsl_vector_get(quad_points, k) < score_tmp) k++;

    if(k == 0) { /* all in the first class */
      gsl_vector_set(abil_discr, 0,
		     gsl_vector_get(abil_discr, 0));

      for(i = 0; i < nbr_item; i++) {
	nbr_option = gsl_vector_int_get(nbr_options, i);
	pos = gsl_vector_int_get(items_pos, i);
	for (o = 0; o < nbr_option; o++)
	  {
	    resp_d = gsl_matrix_int_get(patterns, j, pos+o);
	    if ( ((int)resp_d) == BLANK ) resp_d = 1.0/nbr_option;
	    gsl_matrix_set(resp_discr, pos+o, 0,
			   gsl_matrix_get(resp_discr, pos+o, 0) + resp_d);
	  }
      }
    } else if(k == nbr_quad) { /* all in the last class */
      gsl_vector_set(abil_discr, k-1,
		     gsl_vector_get(abil_discr, k-1));

      for(i = 0; i < nbr_item; i++) {
	nbr_option = gsl_vector_int_get(nbr_options, i);
	pos = gsl_vector_int_get(items_pos, i);
	for (o = 0; o < nbr_option; o++)
	  {
	    resp_d = gsl_matrix_int_get(patterns, j, pos+o);
	    if ( ((int)resp_d) == BLANK ) resp_d = 1.0/nbr_option;
	    gsl_matrix_set(resp_discr, pos+o, k-1,
			   gsl_matrix_get(resp_discr, pos+o, k-1) + resp_d);
	  }
	}
    } else { /* distribute linearly between the two nearest classes */
      first_weight = (gsl_vector_get(quad_points, k) - score_tmp)
	/ (gsl_vector_get(quad_points, k) - gsl_vector_get(quad_points, k-1));

      gsl_vector_set(abil_discr, k-1,
		    gsl_vector_get(abil_discr, k-1)
		     + first_weight);
      gsl_vector_set(abil_discr, k,
		     gsl_vector_get(abil_discr, k)
		     + (1-first_weight));

      for(i = 0; i < nbr_item; i++) {
	nbr_option = gsl_vector_int_get(nbr_options, i);
	pos = gsl_vector_int_get(items_pos, i);
	for (o = 0; o < nbr_option; o++)
	  {
	    resp_d = gsl_matrix_int_get(patterns, j, pos+o);
	    if ( ((int)resp_d) == BLANK ) resp_d = 1.0/nbr_option;
	    gsl_matrix_set(resp_discr, pos+o, k-1,
			   gsl_matrix_get(resp_discr, pos+o, k-1)
			   + resp_d * first_weight);
	    gsl_matrix_set(resp_discr, pos+o, k,
			   gsl_matrix_get(resp_discr, pos+o, k)
			   + resp_d * (1-first_weight));
	  }
      }
    }
  }

  /* discretize the kernel */
  step_size = (gsl_vector_get(quad_points, nbr_quad-1)
    - gsl_vector_get(quad_points, 0))
    / (nbr_quad-1); /* the distance between two classes */
  /* the last class where the kernel is not almost zero */
  k_max = 1 + (int)(4.0 * bandwidth / step_size);
  k_max = (k_max<nbr_quad)?k_max:(nbr_quad-1);
  for(k = 0; k <= k_max; k++)
    gsl_vector_set(kern_discr, k,
		   gsl_ran_gaussian_pdf(k * step_size, bandwidth));

  /* convolution */

  gsl_matrix_set_all (probs, 0);
  if(probs_stddev) gsl_matrix_set_all (probs_stddev, 0);

  for(i = 0; i < nbr_option_tot; i++) { /* for each options */
    for(g = 0; g < nbr_quad; g++) { /* for each class */
      num_con = 0; /* numerator convolution */
      den_con = 0; /* denominator convolution */
      for(k = -k_max; k <= k_max; k++)
	if(g-k >= 0 && g-k < nbr_quad) {
	  den_con += gsl_vector_get(abil_discr, g-k)
	    * gsl_vector_get(kern_discr, abs(k));
	  num_con += gsl_matrix_get(resp_discr, i, g-k)
	    * gsl_vector_get(kern_discr, abs(k));
	}
      if(den_con>0) num_con /= den_con;
      else num_con = 0;
      /* reset the probabilities inside the open interval (0,1) */
      if (num_con < VERY_SMALL_PROB) num_con = VERY_SMALL_PROB;
      if (num_con > 1 - VERY_SMALL_PROB) num_con = 1 - VERY_SMALL_PROB;
      gsl_matrix_set(probs, i, g, num_con);
    }

    /* compute the standard error */
    if(probs_stddev) {
      for(g = 0; g < nbr_quad; g++) { /* for each class */
	num_con = 0; /* numerator convolution */
	den_con = 0; /* denominator convolution */
	for(k = -k_max; k <= k_max; k++)
	  if(g-k >= 0 && g-k < nbr_quad) {
	    den_con += gsl_vector_get(abil_discr, g-k)
	      * gsl_vector_get(kern_discr, abs(k));
	    num_con += gsl_matrix_get(probs, i, g-k)
	      * (1-gsl_matrix_get(probs, i, g-k))
	      * gsl_vector_get(kern_discr, abs(k))
	      * gsl_vector_get(kern_discr, abs(k));
	  }	
	if(den_con>0)
	  gsl_matrix_set(probs_stddev, i, g, sqrt(num_con/(den_con * den_con)));
      }
    }
  }

  /* free the memory */
  gsl_vector_free(scores);
  gsl_vector_free(abil_discr);
  gsl_matrix_free(resp_discr);
  gsl_vector_free(kern_discr);
  gsl_permutation_free (perm);
}
