i am still pulling apart and debugging. at first i thought i wrote the kernel too complex. if i write a simple block, it works fine.
please see if you can reproduce the issue, if you dont mind. below the code. i tried to make the kernel as simple as possble.
you will/ should note that the first
for (l_int[0] = threadIdx.x; l_int[0] < s_int[0]; l_int[0] += blockDim.x)
works fine. threads outside/ over/ above the threshold (s_int[0]) jump the loop, as they should.
the 2nd for loop does not work as it should. the debugger shows threads greater than the threshold changing their threadIdx.x to the threshold, and they then enter the for loop.
you can change:
__global__ void x_in_y_prob(int* d_int_data, int** d_int_ptr, double** d_dbl_ptr)
to
__global__ void x_in_y_prob(double* d_dbl_ptr)
and
d_dbl_ptr[0][l_int[1]] = l_dbl[0];
to
d_dbl_ptr[l_int[1]] = l_dbl[0];
then you only need to create and pass a double array of 256
kernel dimensions:
dim3 dGx(1, 1, 1);
dim3 dBx(256, 1, 1);
__global__ void x_in_y_prob(int* d_int_data, int** d_int_ptr, double** d_dbl_ptr)
{
__shared__ int s_int[2];
if (threadIdx.x == 0)
{
s_int[0] = 36;
s_int[1] = 0;
}
__syncthreads();
{
int l_int[6];
double l_dbl[1];
for (l_int[0] = threadIdx.x; l_int[0] < s_int[0]; l_int[0] += blockDim.x)
{
l_int[1] = l_int[0] * 2;
l_int[2] = l_int[1] * 5;
l_int[3] = l_int[2] * 4;
l_int[4] = l_int[3] * 6;
l_int[5] = l_int[4] * 7;
l_dbl[0] = l_int[5] / 2.4342;
d_dbl_ptr[0][l_int[1]] = l_dbl[0];
}
}
__syncthreads();
{
int l_int[6];
double l_dbl[1];
for (l_int[0] = threadIdx.x; l_int[0] < s_int[0]; l_int[0] += blockDim.x)
{
l_int[1] = s_int[1] + l_int[0];
l_int[2] = threadIdx.x % 4;
l_int[3] = threadIdx.x / 4;
if (l_int[2] == l_int[3])
{
l_int[4] = 1;
l_int[5] = 1;
}
else if (l_int[2] == 1)
{
l_int[4] = l_int[3];
l_int[5] = 1;
}
else
{
l_int[4] = 1;
l_int[5] = 1;
/*
l_int[4] = get_x1s_in_ybits(l_int[2], l_int[3]);
l_int[2]--;
l_int[3]--;
l_int[5] = get_x1s_in_ybits(l_int[2], l_int[3]);
*/
}
l_dbl[0] = l_int[4];
l_dbl[0] = l_int[5] / l_dbl[0];
d_dbl_ptr[0][l_int[1]] = l_dbl[0];
}
}
}
__device__ int get_x1s_in_ybits(int x1s, int ybits)
{
int l_int[2];
int res, ptr, count;
int res_pnt[max_x_in_y_cnt];
bool range_cnt[max_x_in_y_cnt];
if ((x1s >= ybits) || ((x1s < 1) || (ybits < 1)))
{
l_int[0] = 0;
if ((x1s == ybits) && (x1s > 0))
{
l_int[0] = 1;
}
return l_int[0];
}
range_cnt[0] = 1;
res_pnt[0] = x1s;
res = x1s;
ptr = 0;
count = 0;
while (1)
{
ptr++;
l_int[0] = ybits - ptr - res;
if (l_int[0] < 1)
{
count++;
l_int[0] = 1;
}
else
{
res_pnt[ptr] = res;
range_cnt[ptr] = 1;
l_int[0] = 0;
}
if (l_int[0] > 0)
{
l_int[0] = ptr - 1;
while (l_int[0] >= 0)
{
l_int[1] = range_cnt[l_int[0]];
if (l_int[1] > 0)
{
break;
}
l_int[0]--;
}
if (l_int[0] < 0)
{
break;
}
ptr = l_int[0];
l_int[1] = res_pnt[ptr] - 1;
res_pnt[ptr] = l_int[1];
res = l_int[1];
range_cnt[ptr] = 0;
}
}
return count;
}