My use of shared memory seems to be causing a problem in a program I am writing. It works in device-emulation mode, but not when run on the actual device. On my Linux machine (CUDA 2.0, GTX 280) the program crashes X. On my Windows machine (CUDA 2.0, 8800 GTS 512) I don’t get a crash, but I do get a memory-transfer error when copying from the device to the host. On a laptop (CUDA 2.1, 9400M) the program runs fine.
Instead of copying the actual program, I’ve produced a small test program that contains only the necessary parts. If I comment out the part of the kernel that reads into shared memory, the program runs fine. Otherwise, I encounter the errors mentioned above.
If anyone has any idea what is causing the crashes, I would be grateful. I note that it seems to work under CUDA 2.1, so if this was resolved by a bug fix, does anyone know what the bug was in the first place?
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
/*
 * Plain-data record used by the shared-memory test.
 *
 * Instances are copied as whole objects: between host and device with
 * cudaMemcpy, and from global into shared memory inside test().
 * Member order is part of the layout, so it must not be rearranged.
 *
 * NOTE(review): int4 presumably carries a stricter alignment than the other
 * members, which would introduce padding -- confirm with sizeof(myStruct).
 */
struct myStruct {
    int    a;
    int4   b;
    float3 c, d;
    float  e;
    float3 f, g, h, i;
    float  j;
    int    k;
};
/*
 * Minimal reproduction kernel: each thread copies one myStruct from global
 * memory into a per-block shared-memory slot. Nothing is written back to
 * `data`.
 *
 * Launch requirements:
 *   - 1-D grid and 1-D blocks (only the .x components are used);
 *   - dynamic shared memory of at least blockDim.x * sizeof(myStruct) bytes,
 *     because the shared slot is indexed by threadIdx.x.
 *
 * data: device array; elements with global index below 300 are read.
 */
__global__ void test(myStruct *data)
{
    // Dynamically sized shared buffer; its size is supplied by the third
    // launch-configuration argument.
    extern __shared__ myStruct tile[];

    const int lane      = threadIdx.x;
    const int globalIdx = blockDim.x * blockIdx.x + lane;

    // Threads beyond the 300-element bound skip the copy but still reach
    // the barrier below, so every thread in the block arrives at it.
    if (globalIdx < 300)
        tile[lane] = data[globalIdx];

    __syncthreads();
}
/* Prints `what` plus the CUDA error text and exits with status 1 when `status` is not cudaSuccess. */
static void checkCudaOrExit(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        printf("%s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/*
 * Host-side driver for the test kernel.
 *
 * Fills a host array with identical myStruct records, uploads it, runs
 * test() over it, downloads the array again and releases both buffers.
 * Any failure prints a message (including the CUDA error string where one
 * is available) and terminates the process with exit status 1.
 */
void algorithm()
{
    const int count = 300;            // element count; test() hard-codes the same bound
    const int threadsPerBlock = 64;
    const int blocks = (count + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div: 5 blocks
    const size_t size = sizeof(myStruct);
    const size_t bytes = count * size;

    myStruct *h_data = (myStruct *) malloc(bytes);
    if (h_data == NULL) { printf("Error allocating memory on host!\n"); exit(1); }

    // Template record replicated into every element of the host array.
    myStruct bob;
    bob.a = 1;
    bob.b = make_int4(1,1,1,1);
    bob.c = make_float3(1,1,1);
    bob.d = make_float3(1,1,1);
    bob.e = 1;
    bob.f = make_float3(1,1,1);
    bob.g = make_float3(1,1,1);
    bob.h = make_float3(1,1,1);
    bob.i = make_float3(1,1,1);
    bob.j = 1;
    bob.k = 1;
    for (int i = 0; i < count; i++)
        h_data[i] = bob;

    myStruct *d_data = NULL;
    checkCudaOrExit(cudaMalloc((void **) &d_data, bytes), "Error allocating memory on device!");
    checkCudaOrExit(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice), "Error copying host->device");

    // One shared-memory slot per thread in the block.
    test <<< blocks, threadsPerBlock, threadsPerBlock * size >>> (d_data);

    // A launch returns no status itself: bad launch configurations are
    // reported through cudaGetLastError(), and faults inside the kernel
    // surface at the next synchronizing call. Checking both here keeps a
    // kernel failure from being misreported as a device->host copy error.
    checkCudaOrExit(cudaGetLastError(), "Error launching kernel");
    checkCudaOrExit(cudaDeviceSynchronize(), "Error executing kernel");

    checkCudaOrExit(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost), "Error copying device->host");

    // Release both buffers; the original leaked them.
    checkCudaOrExit(cudaFree(d_data), "Error freeing device memory");
    free(h_data);
}
/* Program entry point: announces the run, executes algorithm(), then reports completion. */
int main(int argc, char **argv)
{
    // Command-line arguments are accepted but not used.
    (void) argc;
    (void) argv;

    fputs("Calling algorithm...\n", stdout);
    algorithm();
    fputs("Done!", stdout);

    return 0;
}