Here is the kernel that is having the problems, all 475 lines of it. It isn’t self-contained and can’t be compiled and run, but maybe there is enough info for you to spot some error that is causing a thread to race ahead and modify my shared variable. That variable is s_skip1 in kernelOuterDistKNN(), and is marked with the text “<=== THIS SHARED VARIABLE IS SPURIOUSLY MODIFIED”. I’m sorry, I know this isn’t ideal.

```
// -------------------------------------------------------------------------------------
// kernelOuterDistKNN
//
// Layout is as follows -- 2-dimensional CUDA grid, where every thread computes a single
// distance between one sample vector and one data grid point. The rows of the CUDA
// grid (the Y-axis) correspond to the sample vectors (the Monte Carlo runs), and the
// columns of the CUDA grid (the X-axis) correspond to the uniform data grid points.
// -------------------------------------------------------------------------------------
__global__ void kernelOuterDistKNN
(
const int n_samples, // # of Monte Carlo samples
const int n_cvars_, // # of cvars in outer set
const int cvar_ndx1, // compound variable index 1
const int cvar_ndx2, // compound variable index 2
const int n_to_generate, // # of graphs to generate
const int start_graph_ndx, // NOTE
const int base_graph_ndx, // index of first graph for this
// subpass, in the sequence of
// graphs generated during a pass
const int n_graphs_per_subpass_, // # of graphs generated on
// each iteration (max)
const int n_points_per_axis_, // # of points per graph axis
const int n_data_grid_points_, // # total of data grid points
const int n_sample_blocks_, // # of data grid blocks on
// y-axis of CUDA grid
const int metric_ndx, // which failure metric
const int d_failure_pitch_, // pitch of failure matrxi
const int d_buf_pitch_, // pitch of d_buf_
const int *d_failure_matrix_, // pass or fail for each sample
int *d_n_graphs_generated_, // take skips into account
int *d_n_succ_per_block1_, // partial counts of successes
int *d_n_succ_per_block2_, // partial counts of successes
int *d_n_succ_per_block3_, // partial counts of successes
int *d_n_fail_per_block1_, // partial counts of failures
int *d_n_fail_per_block2_, // partial counts of failures
int *d_n_fail_per_block3_, // partial counts of failures
GraphVarsKNN *d_gvars_, // info about each graph created
const float *d_outer_buf_, // buffer of cvars
const float *d_min_, // minimums for variables in d_buf_
const float *d_max_, // maximums for variables in d_buf_
const CompoundVarKDEC *d_cvars_, // info for cvars in d_buf_
const char *dv_fine_counts_,
int *dv_mismatch_count_,
float *dv_var_bands_,
int *d_scratch_,
float *d_scratch_f_
)
{
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
int sampy = by * blockDim.y + ty;
int gridx = bx * blockDim.x + tx;
int cvid1, cvid2, n_graphs;
bool thread_0_0, block_0_0, sample_less;
float cxs[2];
__shared__ float s_min[2];
__shared__ float s_range[2];
__shared__ float s_delta[2];
__shared__ int s_pf[KernelGroupKNN::BLOCK_SIZE_2D];
__shared__ int s_skip1;
__shared__ int s_skip2;
n_graphs = 0;
cvid1 = cvar_ndx1;
cvid2 = cvar_ndx2;
thread_0_0 = (!tx && !ty);
block_0_0 = (!bx && !by);
sample_less = (sampy < n_samples);
if (!sample_less || gridx >= n_data_grid_points_) return;
// whether the sample was a success (1) or a failure (0). Each row of the
// block (each data grid point) shares this single Monte Carlo failure metric
// value
if (!tx)
{
s_pf[ty] = (sample_less ? d_failure_matrix_[metric_ndx * d_failure_pitch_ + sampy] : 1);
}
while (cvid1 < n_cvars_ - 1)
{
// if this cvar is not dispersed, skip it and do not generate a graph
// with it
__syncthreads();
if (thread_0_0) s_skip1 = (int) d_cvars_[cvid1].skip_;
__syncthreads();
if (s_skip1) // <=== THIS SHARED VARIABLE IS SPURIOUSLY MODIFIED
{
cvid1++;
cvid2 = cvid1+1;
continue;
}
// each thread read in one sample of cvar 1
cxs[0] = (sample_less ? d_outer_buf_[cvid1 * d_buf_pitch_ + sampy] : 0.0);
if (thread_0_0)
{
// only one min, max, range, delta for each variable, so only thread
// zero reads them into shared memory
s_min[0] = d_min_[cvid1];
s_range[0] = fabs(d_max_[cvid1] - s_min[0]);
s_delta[0] = s_range[0]/(n_points_per_axis_ - 1);
}
__syncthreads();
while (cvid2 < n_cvars_)
{
// if this cvar is not dispersed, skip it and do not generate a graph
// with it
__syncthreads();
if (thread_0_0) s_skip2 = (int) d_cvars_[cvid2].skip_;
__syncthreads();
if (s_skip2)
{
cvid2++;
continue;
}
// each thread read in one sample of cvar 2
cxs[1] = (sample_less ? d_outer_buf_[cvid2 * d_buf_pitch_ + sampy] : 0.0);
if (thread_0_0)
{
s_min[1] = d_min_[cvid2];
s_range[1] = fabs(d_max_[cvid2] - s_min[1]);
s_delta[1] = s_range[1]/(n_points_per_axis_ - 1);
}
__syncthreads();
if (thread_0_0 && block_0_0)
{
// store the X- and Y-axis parameters for this graph. this is
// is only done once per gvar for the entire kernel launch. See
// the comment regarding the last argument "n_graphs" in the
// function itself.
kernelSetGvarParams(0, d_gvars_ + base_graph_ndx + n_graphs,
d_cvars_ + cvid1, s_min[0], s_min[0] + s_range[0],
s_delta[0], base_graph_ndx + n_graphs);
kernelSetGvarParams(1, d_gvars_ + base_graph_ndx + n_graphs,
d_cvars_ + cvid2, s_min[1], s_min[1] + s_range[1],
s_delta[1], base_graph_ndx + n_graphs);
}
__syncthreads();
kernelCountNearestNeighbors(
n_samples,
n_data_grid_points_,
n_points_per_axis_,
n_graphs_per_subpass_,
n_sample_blocks_,
sampy,
gridx,
n_graphs ,
start_graph_ndx + n_graphs,
s_pf[ty],
cxs,
s_min,
s_range,
s_delta,
d_n_succ_per_block1_,
d_n_succ_per_block2_,
d_n_succ_per_block3_,
d_n_fail_per_block1_,
d_n_fail_per_block2_,
d_n_fail_per_block3_,
dv_fine_counts_,
dv_mismatch_count_,
NULL,
dv_var_bands_,
d_outer_buf_,
d_buf_pitch_,
cvid1,
cvid2,
NULL,
&s_skip2,
d_scratch_, d_scratch_f_);
if (++n_graphs >= n_to_generate) break;
cvid2++;
}
if (n_graphs >= n_to_generate) break;
cvid1++;
cvid2 = cvid1+1;
}
// this count of graphs generated is updated only once for the entire launch
if (thread_0_0 && block_0_0) *d_n_graphs_generated_ = n_graphs;
}
// -------------------------------------------------------------------------------------
// kernelSetGvarParams
//
// When a graph is evaluated, store all of its parameters in a GraphVarsKNN object.
// With an "x_or_y" argument of 0, the X-axis parameters of the graph are updated,
// otherwise the Y-axis parameters are updated
// -------------------------------------------------------------------------------------
__device__ void kernelSetGvarParams(const int x_or_y, GraphVarsKNN *gvar,
const CompoundVarKDEC *cvar, const float min, const float max,
const float delta, const int index)
{
if (!x_or_y)
{
// this is done only once, although this function is called twice for each
// graph, once for the X-axis and once for the Y-axis. gvar->index_ is a
// little tricky. Since the gvar array is used by knndata_->d_unloadGraphKNN()
// in SaveKNN to unload the "best" colorings from the device. The
// knndata_->d_colorings does not include the previous winners, so we set
// index here to a number ranging from 0 to the number of graphs generated
// on this pass, previous winners ignored.
gvar->index_ = index;
gvar->on_host_ = false;
gvar->J_ = 0.0f;
// update the X-axis parameters of the Graph
gvar->minx_ = min;
gvar->maxx_ = max;
gvar->deltax_ = delta;
gvar->v1x_ = cvar->v1_;
gvar->v2x_ = cvar->v2_;
if (cvar->op_ == CompoundVarKDEC::OP_SUBTRACT_)
gvar->opx_ = GraphVarsKNN::OP_SUBTRACT_;
else if (cvar->op_ == CompoundVarKDEC::OP_DIVIDE_)
gvar->opx_ = GraphVarsKNN::OP_DIVIDE_;
else gvar->opx_ = GraphVarsKNN::OP_NONE_;
} else
{
// update the Y-axis parameters of the Graph
gvar->miny_ = min;
gvar->maxy_ = max;
gvar->deltay_ = delta;
gvar->v1y_ = cvar->v1_;
gvar->v2y_ = cvar->v2_;
if (cvar->op_ == CompoundVarKDEC::OP_SUBTRACT_)
gvar->opy_ = GraphVarsKNN::OP_SUBTRACT_;
else if (cvar->op_ == CompoundVarKDEC::OP_DIVIDE_)
gvar->opy_ = GraphVarsKNN::OP_DIVIDE_;
else gvar->opy_ = GraphVarsKNN::OP_NONE_;
}
}
// -------------------------------------------------------------------------------------
// kernelCountNearestNeighbors
//
// Generate a uniform grid over the space covered by the two compound variables, and
// compute the distance of every grid point from every graph point (the graph points
// are formed from the ordered pairs of the compound variables (cvid1, cvid2)). For
// each CUDA block, determine the numbers of successes and failures in the neighborhood
// of each grid point.
// -------------------------------------------------------------------------------------
__device__ void kernelCountNearestNeighbors
(
const int n_samples, // # of Monte Carlo samples
const int n_data_grid_points_, // # total number of grid points
const int n_points_per_axis_, // # of points per graph axis
const int n_graphs_per_subpass_,// # of graphs to be generated
const int n_sample_blocks_, // # of blocks on y-axis of CUDA grid
const int sampy, // Monte Carlo sample/run index
const int gridx, // grid point index
const int graph_ndx, // index of graph to be created, out
// of n_graphs_per_subpass_
const int graph_ndx_abs, // NOTE
const int pf, // success or failure
const float cxs[2], // compound variable values
const float min[2], // minimum for each compound variable
const float range[2], // range for each compound variable
const float delta[2], // division of the axis for each
// compound variable
int *d_n_succ_per_block1_, // arrays of partial counts of
int *d_n_succ_per_block2_, // success samples near each
int *d_n_succ_per_block3_, // grid point, on a per-block
// basis
int *d_n_fail_per_block1_, // arrays of partial counts of
int *d_n_fail_per_block2_, // failure samples near each
int *d_n_fail_per_block3_, // grid point, on a per-block
// basis
const char *dv_fine_counts_,
int *dv_mismatch_count_,
int *s_flag,
float *dv_var_bands_,
const float *d_outer_buf_,
const int d_buf_pitch_,
const int cvid1,
const int cvid2,
const float cv_old[2],
int *s_skip2,
int *d_scratch_,
float *d_scratch_f_
)
{
const int BLOCK_SIZE_2D = KernelGroupKNN::BLOCK_SIZE_2D;
// s_A: one column for each 2-dimensional Monte Carlo sample
// s_B: one column for each 2-dimensional uniform data grid point
// s_n_succ: one count for each thread (each distance) in the CUDA block
// s_n_fail: one count for each thread (each distance) in the CUDA block
__shared__ float s_A[2][BLOCK_SIZE_2D];
__shared__ float s_B[2][BLOCK_SIZE_2D];
__shared__ int s_n_succ1[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
__shared__ int s_n_succ2[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
__shared__ int s_n_succ3[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
__shared__ int s_n_fail1[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
__shared__ int s_n_fail2[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
__shared__ int s_n_fail3[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
int tx = threadIdx.x;
int ty = threadIdx.y;
int bx = blockIdx.x;
int by = blockIdx.y;
// designed to create the data grid in row-major order, starting from the
// lowest X-axis value and working upward, row by row.
int grid_row = gridx / n_points_per_axis_;
int grid_col = gridx % n_points_per_axis_;
int begin_A = BLOCK_SIZE_2D * by;
int begin_B = BLOCK_SIZE_2D * bx;
int ndx;
unsigned int stride;
// 1.0001 keeps points on boundary from being missed
float grid_delta1 = (1.0001f / (2.0f * (float) (n_points_per_axis_ - 1)));
float grid_delta2 = (float) d_radius2_knn * grid_delta1;
float grid_delta3 = (float) d_radius3_knn * grid_delta1;
grid_delta1 *= (float) d_radius1_knn;
float L_inf_dist = 0.0f, temp;
bool cond1 = (begin_B + tx < n_data_grid_points_);
bool cond2 = (begin_A + ty < n_samples);
// read a 2-dimensional sample vector into column "ty" of s_A. We
// do it only for tx == 0 so that each vector is read only once
if (!tx)
{
if (cond2)
{
s_A[0][ty] = cxs[0];
s_A[1][ty] = cxs[1];
} else
{
s_A[0][ty] = 0.0f;
s_A[1][ty] = 0.0f;
}
}
__syncthreads();
// create a 2-dimensional data grid point in column "tx" of s_B. We
// do it only for ty == 0 so that each grid point is created only once
if (!ty)
{
if (cond1)
{
s_B[0][tx] = min[0] + delta[0] * grid_col;
s_B[1][tx] = min[1] + delta[1] * grid_row;
} else
{
s_B[0][tx] = 0.0f;
s_B[1][tx] = 0.0f;
}
}
__syncthreads();
if (cond2 && cond1)
{
// L-infinity norm distance -- each component is weighted by the inverse
// of the corresponding compound variable's range. Compute the distance
// between a sample "ty" and a grid point "tx".
L_inf_dist = fabs((s_A[0][ty] - s_B[0][tx]) / range[0]);
temp = fabs((s_A[1][ty] - s_B[1][tx]) / range[1]);
if (temp > L_inf_dist) L_inf_dist = temp;
}
__syncthreads();
s_n_succ1[ty][tx] = 0;
s_n_fail1[ty][tx] = 0;
s_n_succ2[ty][tx] = 0;
s_n_fail2[ty][tx] = 0;
s_n_succ3[ty][tx] = 0;
s_n_fail3[ty][tx] = 0;
__syncthreads();
if (cond2 && cond1) // NOTE maybe cond2 and cond1 doesn't hold for some members of shared memory
{
if (pf)
{
// innermost, middle and outermost "radii"
if (L_inf_dist <= grid_delta1) s_n_succ1[ty][tx] = 1;
else if (L_inf_dist <= grid_delta2) s_n_succ2[ty][tx] = 1;
else if (L_inf_dist <= grid_delta3) s_n_succ3[ty][tx] = 1;
} else
{
// innermost, middle and outermost "radii"
if (L_inf_dist <= grid_delta1) s_n_fail1[ty][tx] = 1;
else if (L_inf_dist <= grid_delta2) s_n_fail2[ty][tx] = 1;
else if (L_inf_dist <= grid_delta3) s_n_fail3[ty][tx] = 1;
}
}
// NOTE: remove me
__syncthreads();
// compute the stride for the reduction below. stride will be the largest power
// of two that is less than the block size.
for (stride = 1; stride < BLOCK_SIZE_2D; stride <<= 1);
if (stride >= BLOCK_SIZE_2D) stride >>= 1;
// perform a reduction within this block -- add up all the # of success and failure
// samples near each grid point tx. stride is designed for the situation
// when the block size is/is not a power of two or is odd. In those cases, one
// the first iteration, the lowest numbered threads in each block will be combined
// with those numbered "stride" or higher. So for an block size of 11, stride
// would be 8 and on the first iteration, threads 0, 1 and 2 (relative to the base
// of each set) would reduce with threads 8, 9 and 10 relative to the base of each
// set. After that, the reduction proceeds normally, reducing thread divergence by
// making blocks of adjacent threads perform the same instructions.
for ( ; stride > 0; stride >>= 1)
{
__syncthreads();
if (ty < stride && cond2 && cond1 && begin_A + ty + stride < n_samples && ty + stride < BLOCK_SIZE_2D) // NOTE: added third and 4th conditions, may not be necessary
{
s_n_succ1[ty][tx] += s_n_succ1[ty + stride][tx];
s_n_fail1[ty][tx] += s_n_fail1[ty + stride][tx];
s_n_succ2[ty][tx] += s_n_succ2[ty + stride][tx];
s_n_fail2[ty][tx] += s_n_fail2[ty + stride][tx];
s_n_succ3[ty][tx] += s_n_succ3[ty + stride][tx];
s_n_fail3[ty][tx] += s_n_fail3[ty + stride][tx];
}
}
// store the count, for this CUDA block, of success samples near the grid point
// that is indexed "gridx". Do the same for failure samples. The arrays
// d_n_succ_per_block_[] and d_n_fail_per_block_[] are 3-dimensional, with
// n_graphs_per_subpass_ rows (y-axis), n_sample_blocks_ columns (x-axis) and
// a z-axis depth of n_points_per_axis_^2.
if (!ty && gridx < n_data_grid_points_)
{
ndx = gridx * (n_sample_blocks_ * n_graphs_per_subpass_) +
graph_ndx * n_sample_blocks_ + by;
d_n_succ_per_block1_[ndx] = s_n_succ1[0][tx];
d_n_fail_per_block1_[ndx] = s_n_fail1[0][tx];
d_n_succ_per_block2_[ndx] = s_n_succ2[0][tx];
d_n_fail_per_block2_[ndx] = s_n_fail2[0][tx];
d_n_succ_per_block3_[ndx] = s_n_succ3[0][tx];
d_n_fail_per_block3_[ndx] = s_n_fail3[0][tx];
}
}
```