Just to check if someone maybe knows the answer.
I have a kernel with a 2D blocksize. In the first part each thread calculates some values (dependent on text2D(texref, threadIdx.x, threadIdx.y)). These values are stored to some shared arrays and in the end I need to calculate the sum, so I have to do a reduction. But I don’t know how to use the if threadIdx.x < 32 trick from the reduction example.
I think the following should work, but I am not sure how threads from a 2D block are divided into warps. The important line is unsigned int tid = threadIdx.x + __umul24(threadIdx.y, blockDim.x);, I assume tid = [0 31] are the first warp.
mykernel<<<gridsize, dim3(16,8,1)>>>(out1, out2, out3, out4, offsetx, offsety);
__global__ void mykernel(float *out1, float *out2, float *out3, float *out4, int offset1, int offset2) {
__shared__ float arr1[16*8];
__shared__ float arr2[16*8];
__shared__ float arr3[16*8];
__shared__ float arr4[16*8];
unsigned int tid = threadIdx.x + __umul24(threadIdx.y, blockDim.x);
float dum = tex2D(texref, threadIdx.x+offset1, threadIdx.y+offset2);
arr1[tid] = func1(dum);
arr2[tid] = func2(dum);
arr3[tid] = func3(dum);
arr4[tid] = func4(dum);
// now I need to do a reduction
if (tid < 64) {
arr1[tid] += arr1[tid+64];
arr2[tid] += arr2[tid+64];
arr3[tid] += arr3[tid+64];
arr4[tid] += arr4[tid+64];
}
__syncthreads();
if (tid < 32) {
arr1[tid] += arr1[tid+32];
arr2[tid] += arr2[tid+32];
arr3[tid] += arr3[tid+32];
arr4[tid] += arr4[tid+32];
arr1[tid] += arr1[tid+16];
arr2[tid] += arr2[tid+16];
arr3[tid] += arr3[tid+16];
arr4[tid] += arr4[tid+16];
arr1[tid] += arr1[tid+8];
arr2[tid] += arr2[tid+8];
arr3[tid] += arr3[tid+8];
arr4[tid] += arr4[tid+8];
arr1[tid] += arr1[tid+4];
arr2[tid] += arr2[tid+4];
arr3[tid] += arr3[tid+4];
arr4[tid] += arr4[tid+4];
arr1[tid] += arr1[tid+2];
arr2[tid] += arr2[tid+2];
arr3[tid] += arr3[tid+2];
arr4[tid] += arr4[tid+2];
arr1[tid] += arr1[tid+1];
arr2[tid] += arr2[tid+1];
arr3[tid] += arr3[tid+1];
arr4[tid] += arr4[tid+1];
}
if (tid==0) {
out1[blockIdx.x] += arr1[0];
out2[blockIdx.x] += arr2[0];
out3[blockIdx.x] += arr3[0];
out4[blockIdx.x] += arr4[0];
}
}
Does anybody have any experience with how threads are split up in warps for 2D blocksize??