Here is the main kernel, the one that takes the most of the GPUs time, followed by the serial host code that verifies the device results. Sorry, I have a lot of code and can only post a portion of it.

```
__global__ void kernelKDE
(
const int n_samples, // # of Monte Carlo runs or samples
const int n_vars, // # of Monte Carlo condition/variables
const int n_failure_metrics, // # of failure metrics
const int d_monte_carlo_pitch_, // # of (padded) MC matrix columns
const int d_failure_pitch_, // # of (padded) failure matrix columns
const int metric, // index of failure metric to use
const int n_density_points_, // # of points in each probability density
const int n_density_blocks_, // size of z-axis of d_dens0_temp_, etc.
const int n_sets, // # of shared memory sets
const int n_first_elements_, // # of cross-initialized elem in sort array
const int n_input_vars_, // # of input variable indices
const int d_n_dens_per_sort_kde_, // length of d_cvar_sort_kde_ array
FLOAT_TYPE_KDE *d_monte_carlo_data_, // Monte Carlo data matrix
int *d_failure_matrix_, // failure metric matrix
int *d_var_list_kde_, // list of input variable indices
FLOAT_TYPE_KDE *d_min_, // minimum for each condition/variable
FLOAT_TYPE_KDE *d_max_, // maximum for each condition/variable
FLOAT_TYPE_KDE *d_min_exp_kde_, // expanded minimums for each variable
FLOAT_TYPE_KDE *d_max_exp_kde_, // expanded maximums for each variable
FLOAT_TYPE_KDE *d_h0_, // constants for the failed runs
FLOAT_TYPE_KDE *d_h1_, // constants for the sucessful runs
FLOAT_TYPE_KDE *d_dens0_temp_, // results for failed runs
FLOAT_TYPE_KDE *d_dens1_temp_, // results for successful runs
CompoundVarKDEC *d_cvar_sort_kde_ // sorting array
)
{
extern __shared__ FLOAT_TYPE_KDE shared_memory[];
FLOAT_TYPE_KDE *d_bin0_sh = &shared_memory[0];
FLOAT_TYPE_KDE *d_bin1_sh = &shared_memory[n_sets * n_density_points_];
FLOAT_TYPE_KDE *vmin_sh = &shared_memory[2 * n_sets * n_density_points_];
FLOAT_TYPE_KDE *vrange_sh = &shared_memory[2 * n_sets * n_density_points_ + 1];
FLOAT_TYPE_KDE *h0_sh = &shared_memory[2 * n_sets * n_density_points_ + 2];
FLOAT_TYPE_KDE *h1_sh = &shared_memory[2 * n_sets * n_density_points_ + 3];
FLOAT_TYPE_KDE *skip_sh = &shared_memory[2 * n_sets * n_density_points_ + 4];
int tx = threadIdx.x;
int bx = blockIdx.x;
int sampx = bx * blockDim.x + tx; // absolute sample index
int pf, i, ndx, v, vid;
int stride;
FLOAT_TYPE_KDE xs, xi, expand;
FLOAT_TYPE_KDE divisor = (FLOAT_TYPE_KDE) (n_density_points_ - 1);
CompoundVarKDEC *cvar;
// the shared memory arrays d_bin0_sh[] and d_bin1_sh[] are expanded into
// "n_sets" sets of n_density_points_ elements. This is so that there will
// be no conflicts in access to them by all of the threads in the block.
// Without this feature, I found that this kernel wouldn't produce correct
// results when compiled with the -G option. Each set contains the complete
// set of x-axis points of the density. Midway through this kernel, the sets
// will be reduced to one set. "set_base" is the first index in each array of
// the set associated with this thread.
int set_base = ((int) (tx / n_density_points_)) * n_density_points_;
// for each variable/condition...
for (v = 0; v < n_input_vars_; v++)
{
// retrofit to make us process only vars that are on the input_var_list.
// variables not on the list will be skipped below
__syncthreads();
vid = d_var_list_kde_[v];
// thread 0 loads this variable's minimum, maximum and range into shared
// memory, as well as the bandwidths h0 and h1 associated with it. If
// h0 or h1 is zero, it will result in density values == NaN
if (!tx)
{
expand = d_expansion_factor_kde * fabs(d_max_[vid] - d_min_[vid]);
d_min_exp_kde_[vid] = d_min_[vid] - expand;
d_max_exp_kde_[vid] = d_max_[vid] + expand;
*vmin_sh = d_min_exp_kde_[vid];
*vrange_sh = d_max_exp_kde_[vid] - *vmin_sh;
*h0_sh = d_h0_[vid];
*h1_sh = d_h1_[vid];
// NOTE: this needs to be figured out. Arbitrary minimum values for h0 and h1
FLOAT_TYPE_KDE temp = *vrange_sh / (divisor * 2.0);
if (*h1_sh < temp) *h1_sh = temp;
if (*h0_sh < temp) *h0_sh = temp; // TEMP
if (!bx) // NOTE: kem DANGER, what if block 0 doesn't go first?
{
// initialize the CompoundVarKDEC objects sorting array, just the variable
// indices and operation. The objective values will be put here in a later
// kernel. We start at element "n_first_elements_", which will be non-zero if
// some CompoundVarKDEC objects from another KernelGroup* were placed in
// this sort array. NOTE: this is being done in EVERY BLOCK!
cvar = d_cvar_sort_kde_ + n_first_elements_ + vid;
cvar->v1_ = vid;
cvar->v2_ = -1;
cvar->op_ = -1;
cvar->index_ = vid;
cvar->on_host_ = false;
cvar->J_ = 0.0f;
// skip this variable if it is not dispersed. If a non-dispersed variable
// slips through the cracks here, it may result in density values == NaN.
// Note that variables are skipped by default, so that only those that
// are not on the input list will be skipped. That's because all skips_
// have been initialized to true in kdata_->d_loadKDE()
if (fabs(*vrange_sh) >= d_eps_skip_kde) cvar->skip_ = false;
}
//*skip_sh = (cvar->skip_ ? 1.0 : 0.0);
*skip_sh = ((fabs(*vrange_sh) < d_eps_skip_kde) ? 1.0f : 0.0f); // kem NEW
}
__syncthreads();
// if this variable was not dispersed, skip it
if (*skip_sh > 0.5f)
{
// the continue statment must be executed by ALL threads, not just the
// ones for which tx < n_density_points_
if (tx < n_density_points_)
{
// accessing the 3-dim matrices, with n_vars rows (x-axis),
// n_density_points columns (y-axis) and n_blocks layers (z-axis)
ndx = bx * (n_density_points_ * n_vars) + (vid * n_density_points_) + tx;
d_dens0_temp_[ndx] = 0.0f;
d_dens1_temp_[ndx] = 0.0f;
}
continue;
}
// there must be at least as many threads per block as there are points in
// each distribution (n_density_points_) so that each element of these two
// shared memory arrays are initialized to zero by the lower n_density_points_
// threads
if (tx < n_density_points_)
{
// do this for each set of n_density_points_ elements
for (stride = 0; stride < n_sets * n_density_points_;
stride += n_density_points_)
{
d_bin0_sh[tx + stride] = d_bin1_sh[tx + stride] = 0.0;
}
}
// the last block may have < n_density_points_ usable threads. There are no
// samples to read in for the unusable threads.
if (sampx < n_samples)
{
// each thread read one Monte Carlo sample from global memory, along with
// the associated pass/fail metric value for that run
xs = (FLOAT_TYPE_KDE) d_monte_carlo_data_[vid * d_monte_carlo_pitch_ + sampx];
pf = d_failure_matrix_[metric * d_failure_pitch_ + sampx];
} else
{
// we give the unusable threads data that will contribute nothing to the
// densities
xs = 1.0e+25f;
pf = 1;
}
// for each xi in the interval [vmin_sh, vmax_sh], where xi is a point where
// a density is computed...
for (i = 0; i < n_density_points_; i++)
{
// sync the threads so that all accesses to shared memory are in lock-step,
// keeping them interleaved.
__syncthreads();
// interleave the shared memory accesses by shifting the index ndx forward
// for each successive thread. Every thread visits all xi but at different
// times. "ndx" is relative to the base of this thread's set of xi points.
ndx = (i + (tx % n_density_points_)) % n_density_points_;
xi = *vmin_sh + ((FLOAT_TYPE_KDE) ndx / divisor) * (*vrange_sh);
// distinguish between runs or samples that were from successful and
// failed runs
if (pf)
{
d_bin1_sh[ndx + set_base] += gaussian((xs - xi) / (*h1_sh));
} else
{
d_bin0_sh[ndx + set_base] += gaussian((xs - xi) / (*h0_sh));
}
}
// if there is more than one set in shared memory, reduce them all to a single
// set. This loop won't execute if there is only one set
for (stride = n_density_points_; stride < n_sets * n_density_points_;
stride += n_density_points_)
{
__syncthreads();
if (tx < n_density_points_) // && tx + stride < n_sets * n_density_points_)
{
d_bin1_sh[tx] += d_bin1_sh[tx + stride];
d_bin0_sh[tx] += d_bin0_sh[tx + stride];
}
}
// all xi for the partial density are now finished. Write the results for
// it to global memory. There must be >= n_density_points_ threads for this
// to work. By the way, the unused threads in the last block do work here --
// each writing an element of the partial density.
__syncthreads();
if (tx < n_density_points_)
{
// accessing the 3-dim matrices, with n_vars rows (x-axis),
// n_density_points columns (y-axis) and n_blocks layers (z-axis)
ndx = bx * (n_density_points_ * n_vars) + (vid * n_density_points_) + tx;
// the densities need to be normalized by d_C_sqrt_2pi/(n * h). We do the h here,
// and do the d_C_sqrt_2pi and n in kernelKDEreduce()
d_dens0_temp_[ndx] = d_bin0_sh[tx] / (*h0_sh);
d_dens1_temp_[ndx] = d_bin1_sh[tx] / (*h1_sh);
}
}
}
// *********************************************************************************************
// SERIAL HOST CODE
// *********************************************************************************************
void KernelGroupKDE::verify2(void)
{
int v_ndx, x_ndx, samp_ndx, size, dens_ndx;
int metric_ndx, n_success, n_failure;
FLOAT_TYPE_KDE delta, samp, h0, h1, xi, temp, expand;
FLOAT_TYPE_KDE divisor = (FLOAT_TYPE_KDE) (kdata_->n_density_points_ - 1);
FLOAT_TYPE_KDE success_diff2, fail_diff2;
// use either the special failure vector computed by the client (usually from
// a combination of metrics) or the regular failure matrix with a single
// metric index
int *fmv_failure_matrix;
int fmv_metric;
if (!kdata_->use_failure_vector_kde_)
{
// use the regular failure matrix with a single metric index
fmv_failure_matrix = kdata_->failure_matrix_;
fmv_metric = kdata_->failure_metric_;
} else
{
// use the failure vector, which is 1-dimensional
fmv_failure_matrix = kdata_->failure_vector_kde_;
fmv_metric = 0;
}
metric_ndx = fmv_metric; //kdata_->failure_metric_;
// allocate the arrays that will hold the densities
size = kdata_->n_density_variables_ * kdata_->n_density_points_;
if (v_dens_size_ < size)
{
if (v_dens0_) delete [] v_dens0_;
if (v_dens1_) delete [] v_dens1_;
if (v_device_dens0_) delete [] v_device_dens0_;
if (v_device_dens1_) delete [] v_device_dens1_;
v_dens0_ = new FLOAT_TYPE_KDE;
v_dens1_ = new FLOAT_TYPE_KDE;
v_device_dens0_ = new FLOAT_TYPE_KDE;
v_device_dens1_ = new FLOAT_TYPE_KDE;
v_dens_size_ = size;
}
// allocate the array that will hold the differences between the densities
// computed on the GPUs and those computed here on the host
size = kdata_->n_density_variables_;
if (v_dens_diffs_size_ < size)
{
if (v_dens0_diffs_) delete [] v_dens0_diffs_;
if (v_dens1_diffs_) delete [] v_dens1_diffs_;
v_dens0_diffs_ = new FLOAT_TYPE_KDE;
v_dens1_diffs_ = new FLOAT_TYPE_KDE;
v_dens_diffs_size_ = size;
}
// array of failure metric flags
size = kdata_->n_failure_samples_;
if (v_pf_array_size_ < size)
{
if (v_pf_array_) delete [] v_pf_array_;
v_pf_array_ = new int;
v_pf_array_size_ = size;
}
// unload all of the densities for each Monte Carlo variable from the device.
// These densities are in the same order as the Monte Carlo variables. We can't
// use those in density_success_ and density_fail_ because they have been
// reordered by SaveKDEC::saveBestDensities().
kdata_->d_unloadVerifyKDE3(v_device_dens1_, v_device_dens0_, v_dens_size_);
timer_.time(HOST_TIMER_OUTER_, ElapsedTime::START_OP_, "KDE outer host 1");
// count the number of successes and failures, and store the failure flag
// for each run in an array
int nmcs = kdata_->n_monte_carlo_samples_;
int nfs = kdata_->n_failure_samples_;
int x = metric_ndx * nfs;
n_success = n_failure = 0.0f;
for (int f = 0; f < nfs; f++)
{
v_pf_array_[f] = fmv_failure_matrix[x + f];
if (v_pf_array_[f]) n_success++;
else n_failure++;
}
// Compute the densities: for each Monte Carlo condition/variable...
int n_vars = kdata_->n_input_vars_kde_;
for (int v = 0; v < n_vars; v++)
{
v_ndx = kdata_->var_list_kde_[v];
float mn = kdata_->min_[v_ndx];
float mx = kdata_->max_[v_ndx];
float d = mx - mn;
// if the variable hasn't been dispersed, skip it
if (d < eps_skip_) continue;
// starting point on the densities' x-axis
expand = expansion_factor_kde_ * d;
xi = mn - expand;
delta = (d + (2.0f * expand)) / divisor;
// get the bandwidths for this variable, one for successful runs and one for
// failed runs
h1 = kdata_->h1_[v_ndx];
h0 = kdata_->h0_[v_ndx];
// NOTE: this needs to be figured out. Arbitrary minimums for h0 and h1
temp = delta / 2.0;
if (h1 < temp) h1 = temp; // TEMP
if (h0 < temp) h0 = temp;
// for each point on the x-axis of this variable's density...
int ndp = kdata_->n_density_points_;
for (x_ndx = 0; x_ndx < ndp; x_ndx++)
{
// initialize the density at this point on the x-axis
dens_ndx = v_ndx * ndp + x_ndx;
float vd0 = 0.0f, vd1 = 0.0f;
// for each sample or run for this variable...
for (samp_ndx = 0; samp_ndx < nmcs; samp_ndx++)
{
samp = kdata_->monte_carlo_data_[v_ndx * nmcs + samp_ndx];
// distinguish between failed and successful runs
if (v_pf_array_[samp_ndx])
{
temp = (xi - samp)/h1;
vd1 += EXP(-0.5f * temp * temp);
} else
{
temp = (xi - samp)/h0;
vd0 += EXP(-0.5f * temp * temp);
}
}
// normalize to complete the computation of the densities at this point xi
v_dens1_[dens_ndx] = vd1 * C_sqrt_2pi_ / ((FLOAT_TYPE_KDE) n_success * h1);
v_dens0_[dens_ndx] = vd0 * C_sqrt_2pi_ / ((FLOAT_TYPE_KDE) n_failure * h0);
// move to the next point on the densities' x-axis
xi += delta;
}
} // variable index
timer_.time(HOST_TIMER_OUTER_, ElapsedTime::END_OP_, "KDE outer host 2");
// compute the relative errors between the CPU and GPU results
relativeErrors(&success_diff2, &fail_diff2);
oss_.str("");
oss_ << "KernelGroupKDE::verify2: max rel diff density_success_, density_fail_ = " << success_diff2 << " " << fail_diff2 << endl;
log_.log(oss_.str());
}
```