// -------------------------------------------------------------------------------------
// kernelOuterDistKNN
//
// Layout is as follows -- 2-dimensional CUDA grid, where every thread computes a single
// distance between one sample vector and one data grid point. The rows of the CUDA
// grid (the Y-axis) correspond to the sample vectors (the Monte Carlo runs), and the
// columns of the CUDA grid (the X-axis) correspond to the uniform data grid points.
// -------------------------------------------------------------------------------------
__global__ void kernelOuterDistKNN
   (
    const int    n_samples,             // # of Monte Carlo samples
    const int    n_cvars_,              // # of cvars in outer set
    const int    cvar_ndx1,             // compound variable index 1
    const int    cvar_ndx2,             // compound variable index 2
    const int    n_to_generate,         // # of graphs to generate
    const int    start_graph_ndx,       // offset added to the per-subpass graph
                                        //   index to form the absolute graph index
                                        //   passed to kernelCountNearestNeighbors
    const int    base_graph_ndx,        // index of first graph for this
                                        //   subpass, in the sequence of
                                        //   graphs generated during a pass
    const int    n_graphs_per_subpass_, // # of graphs generated on
                                        //   each iteration (max)
    const int    n_points_per_axis_,    // # of points per graph axis
    const int    n_data_grid_points_,   // total # of data grid points
    const int    n_sample_blocks_,      // # of sample blocks on the
                                        //   y-axis of the CUDA grid
    const int    metric_ndx,            // which failure metric
    const int    d_failure_pitch_,      // pitch of failure matrix
    const int    d_buf_pitch_,          // pitch of d_buf_
    const int   *d_failure_matrix_,     // pass or fail for each sample
    int         *d_n_graphs_generated_, // # of graphs actually generated,
                                        //   skips taken into account
    int         *d_n_succ_per_block1_,  // partial counts of successes
    int         *d_n_succ_per_block2_,  // partial counts of successes
    int         *d_n_succ_per_block3_,  // partial counts of successes
    int         *d_n_fail_per_block1_,  // partial counts of failures
    int         *d_n_fail_per_block2_,  // partial counts of failures
    int         *d_n_fail_per_block3_,  // partial counts of failures
    GraphVarsKNN *d_gvars_,             // info about each graph created
    const float *d_outer_buf_,          // buffer of cvars
    const float *d_min_,                // minimums for variables in d_buf_
    const float *d_max_,                // maximums for variables in d_buf_
    const CompoundVarKDEC *d_cvars_,    // info for cvars in d_buf_
    const char  *dv_fine_counts_,
    int         *dv_mismatch_count_,
    float       *dv_var_bands_,
    int         *d_scratch_,
    float       *d_scratch_f_
   )
{
   int   tx = threadIdx.x;
   int   ty = threadIdx.y;
   int   bx = blockIdx.x;
   int   by = blockIdx.y;
   int   sampy = by * blockDim.y + ty;
   int   gridx = bx * blockDim.x + tx;
   int   cvid1, cvid2, n_graphs;
   bool  thread_0_0, block_0_0, sample_less;
   float cxs[2];

   __shared__ float s_min[2];
   __shared__ float s_range[2];
   __shared__ float s_delta[2];
   __shared__ int   s_pf[KernelGroupKNN::BLOCK_SIZE_2D];
   __shared__ int   s_skip1;
   __shared__ int   s_skip2;

   n_graphs    = 0;
   cvid1       = cvar_ndx1;
   cvid2       = cvar_ndx2;
   thread_0_0  = (!tx && !ty);
   block_0_0   = (!bx && !by);
   sample_less = (sampy < n_samples);

   if (!sample_less || gridx >= n_data_grid_points_)
      return;

   // whether the sample was a success (1) or a failure (0). All threads in row "ty"
   // of the block (one thread per data grid point) share this single Monte Carlo
   // failure metric value for their sample
   if (!tx)
   {
      s_pf[ty] = (sample_less ? d_failure_matrix_[metric_ndx * d_failure_pitch_ + sampy] : 1);
   }

   while (cvid1 < n_cvars_ - 1)
   {
      // if this cvar is not dispersed, skip it and do not generate a graph
      // with it
      __syncthreads();
      if (thread_0_0)
         s_skip1 = (int) d_cvars_[cvid1].skip_;
      __syncthreads();

      if (s_skip1)   // NOTE: this shared variable has been observed to be spuriously modified
      {
         cvid1++;
         cvid2 = cvid1 + 1;
         continue;
      }

      // each thread reads in one sample of cvar 1
      cxs[0] = (sample_less ?
                d_outer_buf_[cvid1 * d_buf_pitch_ + sampy] : 0.0);

      if (thread_0_0)
      {
         // only one min, max, range, delta for each variable, so only thread
         // zero reads them into shared memory
         s_min[0]   = d_min_[cvid1];
         s_range[0] = fabs(d_max_[cvid1] - s_min[0]);
         s_delta[0] = s_range[0] / (n_points_per_axis_ - 1);
      }
      __syncthreads();

      while (cvid2 < n_cvars_)
      {
         // if this cvar is not dispersed, skip it and do not generate a graph
         // with it
         __syncthreads();
         if (thread_0_0)
            s_skip2 = (int) d_cvars_[cvid2].skip_;
         __syncthreads();

         if (s_skip2)
         {
            cvid2++;
            continue;
         }

         // each thread reads in one sample of cvar 2
         cxs[1] = (sample_less ? d_outer_buf_[cvid2 * d_buf_pitch_ + sampy] : 0.0);

         if (thread_0_0)
         {
            s_min[1]   = d_min_[cvid2];
            s_range[1] = fabs(d_max_[cvid2] - s_min[1]);
            s_delta[1] = s_range[1] / (n_points_per_axis_ - 1);
         }
         __syncthreads();

         if (thread_0_0 && block_0_0)
         {
            // store the X- and Y-axis parameters for this graph. This is only done
            // once per gvar for the entire kernel launch. See the comment regarding
            // gvar->index_ in kernelSetGvarParams itself.
            kernelSetGvarParams(0, d_gvars_ + base_graph_ndx + n_graphs,
                                d_cvars_ + cvid1, s_min[0], s_min[0] + s_range[0],
                                s_delta[0], base_graph_ndx + n_graphs);
            kernelSetGvarParams(1, d_gvars_ + base_graph_ndx + n_graphs,
                                d_cvars_ + cvid2, s_min[1], s_min[1] + s_range[1],
                                s_delta[1], base_graph_ndx + n_graphs);
         }
         __syncthreads();

         kernelCountNearestNeighbors(n_samples, n_data_grid_points_, n_points_per_axis_,
                                     n_graphs_per_subpass_, n_sample_blocks_, sampy,
                                     gridx, n_graphs, start_graph_ndx + n_graphs,
                                     s_pf[ty], cxs, s_min, s_range, s_delta,
                                     d_n_succ_per_block1_, d_n_succ_per_block2_,
                                     d_n_succ_per_block3_, d_n_fail_per_block1_,
                                     d_n_fail_per_block2_, d_n_fail_per_block3_,
                                     dv_fine_counts_, dv_mismatch_count_, NULL,
                                     dv_var_bands_, d_outer_buf_, d_buf_pitch_,
                                     cvid1, cvid2, NULL, &s_skip2,
                                     d_scratch_, d_scratch_f_);

         if (++n_graphs >= n_to_generate)
            break;

         cvid2++;
      }

      if (n_graphs >= n_to_generate)
         break;

      cvid1++;
      cvid2 = cvid1 + 1;
   }

   // this count of graphs generated is updated only once for the entire launch
   if (thread_0_0 && block_0_0)
      *d_n_graphs_generated_ = n_graphs;
}
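
// -------------------------------------------------------------------------------------
// Illustrative sketch only (not compiled): one possible host-side launch configuration
// for kernelOuterDistKNN, matching the layout described above -- the X-axis of the CUDA
// grid covers the uniform data grid points and the Y-axis covers the Monte Carlo
// samples, with square blocks of KernelGroupKNN::BLOCK_SIZE_2D threads per side. The
// local names (block_edge, n_grid_blocks, n_sample_blocks, grid_dim, block_dim) are
// hypothetical, and the trailing kernel arguments are elided.
// -------------------------------------------------------------------------------------
#if 0
   const int block_edge      = KernelGroupKNN::BLOCK_SIZE_2D;
   const int n_grid_blocks   = (n_data_grid_points + block_edge - 1) / block_edge;
   const int n_sample_blocks = (n_samples          + block_edge - 1) / block_edge;

   dim3 block_dim(block_edge, block_edge);         // threads: X = grid points, Y = samples
   dim3 grid_dim(n_grid_blocks, n_sample_blocks);  // blocks:  X = grid points, Y = samples

   kernelOuterDistKNN<<<grid_dim, block_dim>>>(n_samples, n_cvars, cvar_ndx1, cvar_ndx2,
                                               n_to_generate, start_graph_ndx,
                                               base_graph_ndx, n_graphs_per_subpass,
                                               n_points_per_axis, n_data_grid_points,
                                               n_sample_blocks, metric_ndx,
                                               /* ... remaining pitches and device pointers ... */);
#endif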

// -------------------------------------------------------------------------------------
// kernelSetGvarParams
//
// When a graph is evaluated, store all of its parameters in a GraphVarsKNN object.
// With an "x_or_y" argument of 0, the X-axis parameters of the graph are updated,
// otherwise the Y-axis parameters are updated.
// -------------------------------------------------------------------------------------
__device__ void kernelSetGvarParams(const int x_or_y, GraphVarsKNN *gvar,
                                    const CompoundVarKDEC *cvar, const float min,
                                    const float max, const float delta, const int index)
{
   if (!x_or_y)
   {
      // this is done only once, although this function is called twice for each
      // graph, once for the X-axis and once for the Y-axis. gvar->index_ is a
      // little tricky: the gvar array is used by knndata_->d_unloadGraphKNN() in
      // SaveKNN to unload the "best" colorings from the device, and
      // knndata_->d_colorings does not include the previous winners, so index_ is
      // set here to a number ranging from 0 to the number of graphs generated on
      // this pass, previous winners ignored.
      gvar->index_   = index;
      gvar->on_host_ = false;
      gvar->J_       = 0.0f;

      // update the X-axis parameters of the Graph
      gvar->minx_   = min;
      gvar->maxx_   = max;
      gvar->deltax_ = delta;
      gvar->v1x_    = cvar->v1_;
      gvar->v2x_    = cvar->v2_;

      if (cvar->op_ == CompoundVarKDEC::OP_SUBTRACT_)
         gvar->opx_ = GraphVarsKNN::OP_SUBTRACT_;
      else if (cvar->op_ == CompoundVarKDEC::OP_DIVIDE_)
         gvar->opx_ = GraphVarsKNN::OP_DIVIDE_;
      else
         gvar->opx_ = GraphVarsKNN::OP_NONE_;
   }
   else
   {
      // update the Y-axis parameters of the Graph
      gvar->miny_   = min;
      gvar->maxy_   = max;
      gvar->deltay_ = delta;
      gvar->v1y_    = cvar->v1_;
      gvar->v2y_    = cvar->v2_;

      if (cvar->op_ == CompoundVarKDEC::OP_SUBTRACT_)
         gvar->opy_ = GraphVarsKNN::OP_SUBTRACT_;
      else if (cvar->op_ == CompoundVarKDEC::OP_DIVIDE_)
         gvar->opy_ = GraphVarsKNN::OP_DIVIDE_;
      else
         gvar->opy_ = GraphVarsKNN::OP_NONE_;
   }
}
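
// -------------------------------------------------------------------------------------
// Illustrative sketch only (not compiled): a scalar reference for the per-thread
// neighborhood test performed in kernelCountNearestNeighbors below -- the range-weighted
// L-infinity distance between one sample and one grid point, classified against the
// innermost, middle and outermost radii. The function name and the radius parameters
// are hypothetical; the 1.0001 boundary guard and the radius scaling mirror the kernel.
// -------------------------------------------------------------------------------------
#if 0
static int classifyNeighborExample(const float sample[2],  // compound variable pair (cxs)
                                   const float grid_pt[2], // data grid point coordinates
                                   const float range[2],   // range of each compound variable
                                   int n_points_per_axis,
                                   float radius1, float radius2, float radius3)
{
   // 1.0001 keeps points on the boundary from being missed
   const float base = 1.0001f / (2.0f * (float) (n_points_per_axis - 1));

   // L-infinity norm distance, each component weighted by the inverse of the
   // corresponding compound variable's range
   float dist = fabsf((sample[0] - grid_pt[0]) / range[0]);
   float temp = fabsf((sample[1] - grid_pt[1]) / range[1]);
   if (temp > dist)
      dist = temp;

   if (dist <= radius1 * base) return 1;   // innermost radius
   if (dist <= radius2 * base) return 2;   // middle radius
   if (dist <= radius3 * base) return 3;   // outermost radius
   return 0;                               // outside all three neighborhoods
}
#endif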

// -------------------------------------------------------------------------------------
// kernelCountNearestNeighbors
//
// Generate a uniform grid over the space covered by the two compound variables, and
// compute the distance of every grid point from every graph point (the graph points
// are formed from the ordered pairs of the compound variables (cvid1, cvid2)). For
// each CUDA block, determine the numbers of successes and failures in the neighborhood
// of each grid point.
// -------------------------------------------------------------------------------------
__device__ void kernelCountNearestNeighbors
   (
    const int    n_samples,             // # of Monte Carlo samples
    const int    n_data_grid_points_,   // total # of grid points
    const int    n_points_per_axis_,    // # of points per graph axis
    const int    n_graphs_per_subpass_, // # of graphs to be generated
    const int    n_sample_blocks_,      // # of blocks on y-axis of CUDA grid
    const int    sampy,                 // Monte Carlo sample/run index
    const int    gridx,                 // grid point index
    const int    graph_ndx,             // index of graph to be created, out
                                        //   of n_graphs_per_subpass_
    const int    graph_ndx_abs,         // absolute index of the graph to be created
                                        //   (start_graph_ndx + graph_ndx at the call site)
    const int    pf,                    // success or failure
    const float  cxs[2],                // compound variable values
    const float  min[2],                // minimum for each compound variable
    const float  range[2],              // range for each compound variable
    const float  delta[2],              // division of the axis for each
                                        //   compound variable
    int         *d_n_succ_per_block1_,  // arrays of partial counts of
    int         *d_n_succ_per_block2_,  //   success samples near each
    int         *d_n_succ_per_block3_,  //   grid point, on a per-block
                                        //   basis
    int         *d_n_fail_per_block1_,  // arrays of partial counts of
    int         *d_n_fail_per_block2_,  //   failure samples near each
    int         *d_n_fail_per_block3_,  //   grid point, on a per-block
                                        //   basis
    const char  *dv_fine_counts_,
    int         *dv_mismatch_count_,
    int         *s_flag,
    float       *dv_var_bands_,
    const float *d_outer_buf_,
    const int    d_buf_pitch_,
    const int    cvid1,
    const int    cvid2,
    const float  cv_old[2],
    int         *s_skip2,
    int         *d_scratch_,
    float       *d_scratch_f_
   )
{
   const int BLOCK_SIZE_2D = KernelGroupKNN::BLOCK_SIZE_2D;

   // s_A:      one column for each 2-dimensional Monte Carlo sample
   // s_B:      one column for each 2-dimensional uniform data grid point
   // s_n_succ: one count for each thread (each distance) in the CUDA block
   // s_n_fail: one count for each thread (each distance) in the CUDA block
   __shared__ float s_A[2][BLOCK_SIZE_2D];
   __shared__ float s_B[2][BLOCK_SIZE_2D];
   __shared__ int   s_n_succ1[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
   __shared__ int   s_n_succ2[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
   __shared__ int   s_n_succ3[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
   __shared__ int   s_n_fail1[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
   __shared__ int   s_n_fail2[BLOCK_SIZE_2D][BLOCK_SIZE_2D];
   __shared__ int   s_n_fail3[BLOCK_SIZE_2D][BLOCK_SIZE_2D];

   int tx = threadIdx.x;
   int ty = threadIdx.y;
   int bx = blockIdx.x;
   int by = blockIdx.y;

   // designed to create the data grid in row-major order, starting from the
   // lowest X-axis value and working upward, row by row.
   int grid_row = gridx / n_points_per_axis_;
   int grid_col = gridx % n_points_per_axis_;

   int begin_A = BLOCK_SIZE_2D * by;
   int begin_B = BLOCK_SIZE_2D * bx;
   int ndx;
   unsigned int stride;

   // 1.0001 keeps points on the boundary from being missed
   float grid_delta1 = (1.0001f / (2.0f * (float) (n_points_per_axis_ - 1)));
   float grid_delta2 = (float) d_radius2_knn * grid_delta1;
   float grid_delta3 = (float) d_radius3_knn * grid_delta1;
   grid_delta1 *= (float) d_radius1_knn;

   float L_inf_dist = 0.0f, temp;
   bool  cond1 = (begin_B + tx < n_data_grid_points_);
   bool  cond2 = (begin_A + ty < n_samples);

   // read a 2-dimensional sample vector into column "ty" of s_A. We
   // do it only for tx == 0 so that each vector is read only once
   if (!tx)
   {
      if (cond2)
      {
         s_A[0][ty] = cxs[0];
         s_A[1][ty] = cxs[1];
      }
      else
      {
         s_A[0][ty] = 0.0f;
         s_A[1][ty] = 0.0f;
      }
   }
   __syncthreads();

   // create a 2-dimensional data grid point in column "tx" of s_B. We
   // do it only for ty == 0 so that each grid point is created only once
   if (!ty)
   {
      if (cond1)
      {
         s_B[0][tx] = min[0] + delta[0] * grid_col;
         s_B[1][tx] = min[1] + delta[1] * grid_row;
      }
      else
      {
         s_B[0][tx] = 0.0f;
         s_B[1][tx] = 0.0f;
      }
   }
   __syncthreads();

   if (cond2 && cond1)
   {
      // L-infinity norm distance -- each component is weighted by the inverse
      // of the corresponding compound variable's range. Compute the distance
      // between a sample "ty" and a grid point "tx".
      L_inf_dist = fabs((s_A[0][ty] - s_B[0][tx]) / range[0]);
      temp       = fabs((s_A[1][ty] - s_B[1][tx]) / range[1]);
      if (temp > L_inf_dist)
         L_inf_dist = temp;
   }
   __syncthreads();

   s_n_succ1[ty][tx] = 0;
   s_n_fail1[ty][tx] = 0;
   s_n_succ2[ty][tx] = 0;
   s_n_fail2[ty][tx] = 0;
   s_n_succ3[ty][tx] = 0;
   s_n_fail3[ty][tx] = 0;
   __syncthreads();

   // NOTE: cond2/cond1 may not hold for some threads -- their shared-memory entries
   //       keep the zeros set above
   if (cond2 && cond1)
   {
      if (pf)
      {
         // innermost, middle and outermost "radii"
         if (L_inf_dist <= grid_delta1)
            s_n_succ1[ty][tx] = 1;
         else if (L_inf_dist <= grid_delta2)
            s_n_succ2[ty][tx] = 1;
         else if (L_inf_dist <= grid_delta3)
            s_n_succ3[ty][tx] = 1;
      }
      else
      {
         // innermost, middle and outermost "radii"
         if (L_inf_dist <= grid_delta1)
            s_n_fail1[ty][tx] = 1;
         else if (L_inf_dist <= grid_delta2)
            s_n_fail2[ty][tx] = 1;
         else if (L_inf_dist <= grid_delta3)
            s_n_fail3[ty][tx] = 1;
      }
   }

   // NOTE: this barrier may be redundant -- the reduction loop below begins with
   //       __syncthreads()
   __syncthreads();

   // compute the stride for the reduction below. stride will be the largest power
   // of two that is less than the block size.
   for (stride = 1; stride < BLOCK_SIZE_2D; stride <<= 1)
      ;
   if (stride >= BLOCK_SIZE_2D)
      stride >>= 1;

   // perform a reduction within this block -- add up the numbers of success and
   // failure samples near each grid point tx. The stride is chosen so that the
   // reduction also works when the block size is not a power of two (including odd
   // sizes). In those cases, on the first iteration the lowest numbered threads in
   // each block are combined with those numbered "stride" or higher. So for a block
   // size of 11, stride would be 8 and on the first iteration, threads 0, 1 and 2
   // (relative to the base of each set) would reduce with threads 8, 9 and 10 relative
   // to the base of each set. After that, the reduction proceeds normally, reducing
   // thread divergence by making blocks of adjacent threads perform the same
   // instructions.
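   //
   // A concrete trace of the reduction, under the assumption (for illustration only)
   // that BLOCK_SIZE_2D were 11: the stride computation above leaves stride == 16 and
   // the final shift brings it down to 8, the largest power of two below 11. The
   // reduction loop below then proceeds as
   //    stride 8: threads ty = 0..2 add rows  8..10 (rows 11..15 do not exist)
   //    stride 4: threads ty = 0..3 add rows  4..7
   //    stride 2: threads ty = 0..1 add rows  2..3
   //    stride 1: thread  ty = 0    adds row  1
   // leaving the per-column totals in row 0 of each shared array.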
   for ( ; stride > 0; stride >>= 1)
   {
      __syncthreads();

      // NOTE: the third and fourth conditions were added -- the n_samples guard only
      //       skips rows that are already zero, while the "ty + stride" guard prevents
      //       an out-of-range read when BLOCK_SIZE_2D is not a power of two
      if (ty < stride && cond2 && cond1 &&
          begin_A + ty + stride < n_samples && ty + stride < BLOCK_SIZE_2D)
      {
         s_n_succ1[ty][tx] += s_n_succ1[ty + stride][tx];
         s_n_fail1[ty][tx] += s_n_fail1[ty + stride][tx];
         s_n_succ2[ty][tx] += s_n_succ2[ty + stride][tx];
         s_n_fail2[ty][tx] += s_n_fail2[ty + stride][tx];
         s_n_succ3[ty][tx] += s_n_succ3[ty + stride][tx];
         s_n_fail3[ty][tx] += s_n_fail3[ty + stride][tx];
      }
   }

   // store the count, for this CUDA block, of success samples near the grid point
   // indexed "gridx". Do the same for failure samples. The arrays
   // d_n_succ_per_block_[] and d_n_fail_per_block_[] are 3-dimensional, with
   // n_graphs_per_subpass_ rows (y-axis), n_sample_blocks_ columns (x-axis) and
   // a z-axis depth of n_points_per_axis_^2.
   if (!ty && gridx < n_data_grid_points_)
   {
      ndx = gridx * (n_sample_blocks_ * n_graphs_per_subpass_) +
            graph_ndx * n_sample_blocks_ + by;

      d_n_succ_per_block1_[ndx] = s_n_succ1[0][tx];
      d_n_fail_per_block1_[ndx] = s_n_fail1[0][tx];
      d_n_succ_per_block2_[ndx] = s_n_succ2[0][tx];
      d_n_fail_per_block2_[ndx] = s_n_fail2[0][tx];
      d_n_succ_per_block3_[ndx] = s_n_succ3[0][tx];
      d_n_fail_per_block3_[ndx] = s_n_fail3[0][tx];
   }
}
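
// -------------------------------------------------------------------------------------
// Illustrative sketch only (not compiled): how the per-block partial counts written
// above could be folded into a total for one (graph, grid point) pair. The actual
// accumulation is performed elsewhere and is not shown in this file; the function and
// parameter names here are hypothetical -- only the index layout (grid point, then
// graph, then sample block) is taken from the store at the end of
// kernelCountNearestNeighbors.
// -------------------------------------------------------------------------------------
#if 0
static int sumPartialCountsExample(const int *per_block_counts,  // one of the d_n_*_per_block arrays
                                   int gridx,                    // data grid point index
                                   int graph_ndx,                // graph index within the subpass
                                   int n_graphs_per_subpass,
                                   int n_sample_blocks)
{
   int total = 0;

   for (int by = 0; by < n_sample_blocks; by++)
   {
      int ndx = gridx * (n_sample_blocks * n_graphs_per_subpass) +
                graph_ndx * n_sample_blocks + by;
      total += per_block_counts[ndx];
   }
   return total;
}
#endif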