I am currently running CUDA on an 8800 GTS under SLAMD64 (a 64-bit Slackware Linux distribution) using the SUSE Linux Enterprise Desktop driver. I thought that the maximum grid dimension was 65535; however, when I went from CUDA 1.0 to 2.0, I got a seg fault in code that had previously run successfully. I narrowed down the critical section to the following code:
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#include <cuda.h>
#define BLOCK_SIZE_IDX 256
// Gather kernel: orderedArray[i] = inArray[indxArray[i]] for one element per thread.
//
// Launch layout: 1D grid of 1D blocks. The global element index is computed as
// BLOCK_SIZE_IDX * blockIdx.x + threadIdx.x, so the kernel must be launched with
// exactly BLOCK_SIZE_IDX threads per block.
//
// Preconditions (NOT checked here -- the kernel receives no element count):
//   * gridDim.x * BLOCK_SIZE_IDX must not exceed the length of orderedArray
//     and indxArray, otherwise the tail threads access out of bounds.
//   * every value in indxArray must be a valid index into inArray.
//
// Parameters:
//   orderedArray - device output buffer, written at the thread's global index
//   inArray      - device input buffer, read at the looked-up index
//   indxArray    - device buffer of source indices, read at the global index
__global__ void applyIndex(float *orderedArray, const float *inArray, const int *indxArray)
{
    // Global element handled by this thread.
    const int indx = BLOCK_SIZE_IDX * blockIdx.x + threadIdx.x;

    // Source position for this element; the stored int is reinterpreted as
    // unsigned, exactly as in the original code.
    const unsigned int index = indxArray[indx];

    orderedArray[indx] = inArray[index];
}
// Test driver for applyIndex.
//
// Usage: <program> M
//
// Builds an identity index table of P*M entries (P = 256*256), uploads it,
// allocates two float work buffers of the same length on the device, and
// launches applyIndex with BLOCK_SIZE_IDX threads per block and
// P*M / BLOCK_SIZE_IDX blocks.
//
// Returns 0 on success and 1 on a missing or out-of-range M. CUDA failures are
// handled by the CUDA_SAFE_CALL / CUT_CHECK_ERROR macros from cutil.h.
int main(int argc, char **argv)
{
    const int P = 256 * 256;

    // Largest x-dimension of a grid assumed by this program (the limit the
    // launch below must respect on the targeted hardware).
    const long MAX_GRID_DIM_X = 65535L;

    if (argc < 2) {
        fprintf(stderr, "usage: %s M\n", argc > 0 ? argv[0] : "applyIndex");
        return 1;
    }

    // Parse M strictly: reject non-numeric input, non-positive values, and
    // values whose grid (P*M / BLOCK_SIZE_IDX blocks) would exceed the limit.
    // Bounding M here also guarantees that P * M fits in an int.
    char *end = NULL;
    const long mArg = strtol(argv[1], &end, 10);
    const long maxM = MAX_GRID_DIM_X * BLOCK_SIZE_IDX / P;
    if (end == argv[1] || *end != '\0' || mArg < 1 || mArg > maxM) {
        fprintf(stderr, "M must be an integer in [1, %ld]\n", maxM);
        return 1;
    }

    const int M = (int) mArg;
    const int PxM = P * M;

    // Byte sizes are kept in size_t; the original stored sizeof(float) in a
    // float, which makes the allocation size a floating-point expression.
    const size_t szint = sizeof(int);
    const size_t szflt = sizeof(float);

    printf("M: %d P: %d\n", M, P);

    // Identity permutation, staged in pinned host memory.
    int *idx = NULL;
    CUDA_SAFE_CALL(cudaMallocHost((void**) &idx, PxM * szint));
    for (int i = 0; i < PxM; i++) idx[i] = i;

    int *d_p_idx = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_p_idx, PxM * szint));
    CUDA_SAFE_CALL(cudaMemcpy(d_p_idx, idx, PxM * szint, cudaMemcpyHostToDevice));

    // Free space
    CUDA_SAFE_CALL(cudaFreeHost(idx));

    // Work buffers. ws1_dev is never initialised: the kernel only copies
    // whatever values it holds into ws2_dev.
    float *ws1_dev = NULL;
    float *ws2_dev = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**) &ws1_dev, PxM * szflt));
    CUDA_SAFE_CALL(cudaMalloc((void**) &ws2_dev, PxM * szflt));

    // P is a multiple of BLOCK_SIZE_IDX, so the division is exact and every
    // launched thread maps to a valid element.
    const int gridDimX = PxM / BLOCK_SIZE_IDX;
    printf("Threads per block: %d Grid dimension: %d\n",
           BLOCK_SIZE_IDX, gridDimX);

    applyIndex<<<gridDimX, BLOCK_SIZE_IDX>>>(ws2_dev, ws1_dev, d_p_idx);
    CUT_CHECK_ERROR("Kernel execution failed");
    CUDA_SAFE_CALL( cudaThreadSynchronize() );

    // clean up memory
    CUDA_SAFE_CALL(cudaFree(ws1_dev));
    CUDA_SAFE_CALL(cudaFree(ws2_dev));
    CUDA_SAFE_CALL(cudaFree(d_p_idx));

    return 0;
}
When executed with M=128, I get: Threads per block: 256 Grid dimension: 32768
With M=129, I get: Threads per block: 256 Grid dimension: 33024, followed by an “unspecified launch failure”.
I receive no errors during the memory allocation or copy commands. The code runs successfully in emulation for both cases. Running under valgrind shows no errors. Any ideas what the problem might be?
Thanks.