One question regarding shared memory

Hello,

I am writing a program to find the maximum value and its index in a large set of data. The data set is very large: 4194304 float values.

I am not very experienced with CUDA. Based on what I have read, I think a kernel using shared memory can only find the maximum value within one block, and the maximum number of threads per block is 1024. So I think I have to run a similar kernel multiple times to get the final maximum value and its index. Is that right?

Now in my code, I create two shared-memory arrays: one to store the values and one to store their indices.

If my code is as follows:

__global__ void cuMaxCplxPerBlocknIndx(float *in, float *out, int *out_Indx)
{
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

    extern __shared__ float shared_1[];   // values (dynamically sized)
    __shared__ int shared_2[1024];        // indices (statically sized)

    // squared magnitude of the i-th complex sample (interleaved re/im)
    shared_1[tid] = in[i*2]*in[i*2] + in[i*2+1]*in[i*2+1];
    shared_2[tid] = i % 4194304;
    __syncthreads();

    // tree reduction: keep the larger value and carry its index along
    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        if (tid % (2*s) == 0)
        {
            if (shared_1[tid] < shared_1[tid+s])
            {
                shared_1[tid] = shared_1[tid+s];
                shared_2[tid] = shared_2[tid+s];
            }
        }
        __syncthreads();
    }

    // thread 0 writes this block's maximum and its index
    if (tid == 0)
    {
        out[blockIdx.x] = shared_1[0];
        out_Indx[blockIdx.x] = shared_2[0];
    }
}

And I call this function as:

cuMaxCplxPerBlocknIndx<<<4194304/1024,1024,1024*sizeof(float)*2>>>(tmp1, tmp2, tmp2Indx);

The result is correct.

BUT, if the kernel function is as follows:

__global__ void cuMaxCplxPerBlocknIndx(float *in, float *out, int *out_Indx)
{
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;

    extern __shared__ float shared_1[];
    __shared__ int* shared_2;
    if (threadIdx.x == 0)
        shared_2 = new int[blockDim.x];   // allocate the index array at run time
    __syncthreads();

    shared_1[tid] = in[i*2]*in[i*2] + in[i*2+1]*in[i*2+1];
    shared_2[tid] = i % 4194304;
    __syncthreads();

    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        if (tid % (2*s) == 0)
        {
            if (shared_1[tid] < shared_1[tid+s])
            {
                shared_1[tid] = shared_1[tid+s];
                shared_2[tid] = shared_2[tid+s];
            }
        }
        __syncthreads();
    }

    if (tid == 0)
    {
        out[blockIdx.x] = shared_1[0];
        out_Indx[blockIdx.x] = shared_2[0];
    }
}

And I launch the kernel the same way:

cuMaxCplxPerBlocknIndx<<<4194304/1024,1024,1024*sizeof(float)*2>>>(tmp1, tmp2, tmp2Indx);

I get the error message:
“unspecified launch failure”

I guess the error is because of a shared memory size limit, but I don't understand why it works fine when I hard-code the size of “shared_2”.
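Ideally I would like both arrays to be sized at launch time. Would partitioning a single extern __shared__ buffer be the right approach? A sketch of what I mean (the partitioning is my guess, not tested):

__global__ void cuMaxCplxPerBlocknIndx(float *in, float *out, int *out_Indx)
{
    // one dynamic shared buffer, carved into a float part and an int part
    extern __shared__ unsigned char smem[];
    float *shared_1 = (float*)smem;
    int   *shared_2 = (int*)(smem + blockDim.x * sizeof(float));
    // ... same reduction body as in my first kernel ...
}

// launched with enough bytes for both arrays, which happens to be the
// 1024*sizeof(float)*2 I already pass:
// cuMaxCplxPerBlocknIndx<<<4194304/1024, 1024,
//         1024*(sizeof(float)+sizeof(int))>>>(tmp1, tmp2, tmp2Indx);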

Can anyone give me any advice?

Thank you.

Your code probably performs an out-of-bounds memory access somewhere. Run it under cuda-memcheck to find it. Follow the link in my signature for more info.
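For example, assuming your executable is named app:

cuda-memcheck ./app

It will report the kernel, thread, and address of any invalid access it finds.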

If it is an out-of-bounds issue, is there a better way to do this job?

The task you want to accomplish is a specific example of a reduction, which is a common building block for parallel algorithms. There should be an example of the use of shared memory to speed up reductions in the collection of sample apps that comes with CUDA, but I don’t have a pointer ready.

If your data happens to be all positive, you could use the CUBLAS function cublasIsamax() to find the index of the array element with the maximum value.
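A minimal sketch of that approach (untested; the function name is just for illustration), assuming the data is already in device memory. Note that cublasIsamax() finds the element with the largest absolute value, which coincides with the maximum only for non-negative data. Link with -lcublas.

#include <cublas_v2.h>

// returns the 0-based index of the element of dev_x with the largest
// absolute value
int indexOfMax(const float *dev_x, int n)
{
    cublasHandle_t handle;
    cublasCreate(&handle);
    int idx = 0;
    cublasIsamax(handle, n, dev_x, 1, &idx);   // note: idx is 1-based
    cublasDestroy(handle);
    return idx - 1;                            // convert to 0-based
}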

That code is not optimal for reductions. Either use thrust::max_element with thrust::device_ptr to get a pointer to the maximum element in the array, or read this:

[url]https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf[/url]

which has 7 different versions of reduction code.
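A minimal sketch of the Thrust approach (assuming dev_x points to n floats in device memory; the function name is just for illustration):

#include <thrust/device_ptr.h>
#include <thrust/extrema.h>

void maxAndIndex(const float *dev_x, int n, float *max_val, int *max_idx)
{
    thrust::device_ptr<const float> p = thrust::device_pointer_cast(dev_x);
    thrust::device_ptr<const float> m = thrust::max_element(p, p + n);
    *max_idx = (int)(m - p);  // position of the maximum element
    *max_val = *m;            // dereferencing copies the value to the host
}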

Something like this; you can just replace the min operation with sum (or, for your case, max).

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define tbp 512

__global__ void kernel_min(int *a, int *d, int nn)
{
    __shared__ int sdata[tbp];   // "static" shared memory

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = 100000000;      // pad with a value larger than any input
    if (i < nn)
    {
        sdata[tid] = a[i];
    }
    __syncthreads();

    // tree reduction: halve the number of active threads each step
    for (unsigned int s = tbp/2; s >= 1; s = s/2)
    {
        if (tid < s)
        {
            if (sdata[tid] > sdata[tid + s])
            {
                sdata[tid] = sdata[tid + s];
            }
        }
        __syncthreads();
    }

    // thread 0 writes this block's minimum
    if (tid == 0)
    {
        d[blockIdx.x] = sdata[0];
    }
}

int main()
{
    const int N = 3000;
    const int nblocks = (N + tbp - 1)/tbp;
    srand(time(NULL));

    int *a = (int*)malloc(N * sizeof(int));
    int *d = (int*)malloc(nblocks * sizeof(int));

    int *dev_a, *dev_d;
    cudaMalloc((void **)&dev_a, N * sizeof(int));
    cudaMalloc((void **)&dev_d, nblocks * sizeof(int));

    // fill the input and track the minimum on the CPU for checking
    int mmm = 100;
    for (int i = 0; i < N; i++)
    {
        a[i] = rand() % 100 + 5;
        if (mmm > a[i]) mmm = a[i];
    }

    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);

    // first pass: one minimum per block; second pass: reduce those
    kernel_min<<<nblocks, tbp>>>(dev_a, dev_d, N);
    kernel_min<<<1, tbp>>>(dev_d, dev_d, nblocks);

    cudaMemcpy(d, dev_d, nblocks * sizeof(int), cudaMemcpyDeviceToHost);

    printf("cpu min %d, gpu_min = %d\n", mmm, d[0]);

    cudaFree(dev_a);
    cudaFree(dev_d);
    free(a);
    free(d);

    return 0;
}
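In case it helps, I build and run it with something like this (assuming the file is saved as min_reduce.cu):

nvcc -o min_reduce min_reduce.cu
./min_reduce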