Hi,
I’ve finally taken the trouble to create a minimal example showing the problem with constructors in shared memory (using the cusp library, available from the Google Code Archive):
#include <stdio.h>
#include <stdlib.h>

#include <cusp/complex.h>
// Kernel with an otherwise empty body: its only statement declares a
// block-shared array of 128 cusp::complex<float> elements. Nothing is ever
// read from or written to the array by this kernel's own code.
// NOTE(review): any per-element work here would come from the element type's
// constructor — confirm against the cusp::complex definition.
__global__ void empty1()
{
    __shared__ cusp::complex<float> cache[128];
}
// Baseline kernel with an otherwise empty body: its only statement declares a
// block-shared array of 128 cuComplex elements. Nothing is ever read from or
// written to the array by this kernel's own code.
__global__ void empty2()
{
    __shared__ cuComplex cache[128];
}
// Evaluates a CUDA runtime call; on failure prints the error string with the
// source location to stderr and terminates the process. Kernel launches do not
// return a status, so launch errors are collected with cudaGetLastError().
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Launches empty1 and then empty2 with the same grid/block configuration,
// times each launch with a pair of CUDA events on stream 0, and prints the
// elapsed time of each in milliseconds.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main(){
    const int n = 1000000;
    const int nthreads = 128;
    const int nblocks = (n + nthreads - 1) / nthreads;  // ceil(n / nthreads)

    // Create each event exactly once and reuse it for both measurements;
    // calling cudaEventCreate again on the same variable would overwrite the
    // handle and leak the first event.
    cudaEvent_t start;
    cudaEvent_t end;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&end));

    float ms = 0.0f;

    // --- empty1 ---
    CUDA_CHECK(cudaEventRecord(start, 0));
    empty1<<<nblocks, nthreads>>>();
    CUDA_CHECK(cudaEventRecord(end, 0));
    // Checked after recording `end` so the check sits outside the timed span.
    CUDA_CHECK(cudaGetLastError());
    // Block until `end` has been reached, i.e. the kernel has finished.
    CUDA_CHECK(cudaEventSynchronize(end));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, end));
    printf("empty1 time: %fms\n", ms);

    // --- empty2 ---
    CUDA_CHECK(cudaEventRecord(start, 0));
    empty2<<<nblocks, nthreads>>>();
    CUDA_CHECK(cudaEventRecord(end, 0));
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(end));
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, end));
    printf("empty2 time: %fms\n", ms);

    // Release the event handles.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(end));
    return 0;
}
My running times from the example are:
empty1 time: 118.835747ms
empty2 time: 0.169600ms
It looks like each thread is trying to call the constructor for all the elements in the shared array. I think this is the case because I made the blocks larger while keeping the total number of threads constant, and the runtime didn’t change (if the initialization were spread out among the threads in the block, I would expect the runtime to decrease). Is this the expected behavior? What’s the rule for constructors in shared memory?
(I used CUDA 3.2 on Mac OS X for this example.)
Cheers,
Filipe