Hello,
I am writing a program to find the maximum value and the its index in a large set of data. The data size is very large, 4194304 float values.
I am not very professional in CUDA. Based on what I have read, I think every CUDA function can only find the maximum value in one block, and the maximum num. of threads per block is 1024. So I think I have to run the similar function multiple times to get the final one maximum value and its index. Is it right?
Now in my code, I create two shared memories, one to store value, one to store its index.
If my code is as follows:
__global__ void cuMaxCplxPerBlocknIndx(float *in, float *out, int *out_Indx)
{
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x+threadIdx.x;
extern __shared__ float shared_1[];
__shared__ int shared_2[1024];
shared_1[tid] = in[i*2]*in[i*2]+in[i*2+1]*in[i*2+1];
shared_2[tid] = i%4194304;
__syncthreads();
for(unsigned int s=1; s < blockDim.x; s *= 2)
{
if (tid % (2*s) == 0)
{
if (shared_1[tid] < shared_1[tid+s])
{
shared_1[tid] = shared_1[tid+s];
shared_2[tid] = shared_2[tid+s];
}
}
__syncthreads();
}
if (tid == 0)
{
out[blockIdx.x] = shared_1[0];
out_Indx[blockIdx.x] = shared_2[0];
}
}
And I call this function as:
cuMaxCplxPerBlocknIndx<<<4194304/1024,1024,1024*sizeof(float)*2>>>(tmp1, tmp2, tmp2Indx);
The result is correct,
BUT, if the kernel function is as follows:
__global__ void cuMaxCplxPerBlocknIndx(float *in, float *out, int *out_Indx)
{
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x+threadIdx.x;
extern __shared__ float shared_1[];
__shared__ int* shared_2;
if (threadIdx.x ==0)
shared_2 = new int[blockDim.x];
__syncthreads();
shared_1[tid] = in[i*2]*in[i*2]+in[i*2+1]*in[i*2+1];;
shared_2[tid] = i%4194304;
__syncthreads();
for(unsigned int s=1; s < blockDim.x; s *= 2)
{
if (tid % (2*s) == 0)
{
if (shared_1[tid] < shared_1[tid+s])
{
shared_1[tid] = shared_1[tid+s];
shared_2[tid] = shared_2[tid+s];
}
}
__syncthreads();
}
if (tid == 0)
{
out[blockIdx.x] = shared_1[0];
out_Indx[blockIdx.x] = shared_2[0];
}
}
And I use the same way to call the kernel function, as:
cuMaxCplxPerBlocknIndx<<<4194304/1024,1024,1024*sizeof(float)*2>>>(tmp1, tmp2, tmp2Indx);
I will get error message:
“unspecified launch failure”
I guess the error is because of the shared memory size limit. But I dont understand why if I hard code the size of “shared_2”, it will work fine.
Can anyone give me any advice?
Thank you.