I’ve read a few of these posts, and I realize that my “unspecified launch failure” probably equates to segmentation fault.
So, can someone help me look at the code and debug this?
I ran it through the emulator, and all is fine for all N. When I run it on the gpu, it works fine for smaller N, but fails for larger N.
I’ve been looking at the code all day … I’m sure some in this forum would be able to spot the problem fairly quickly.
thanks in advance.
Btw, I’m on OSX 10.5, cuda 2.0, and running the kernel on the secondary gpu (9600GT is display, 9400 is cuda kernel)
[codebox]
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cutil.h> // CUDA utilities CUDA_SAFE_CALL
#define _DEBUG
unsigned int N = 1024*1024;// the # of elems in the array (default: 1M)
float *A; // array A on the host
float *B; // array B on the host
float *A_d; // array A on the gpu
float *B_d; // array B on the gpu
int seed = 100; // random number generator seed
int MAX_BLOCKS_DIM1 = 0; // the maximum number of blocks per grid (1st D)
int MAX_BLOCKS_DIM2 = 0; // the maximum number of blocks per grid (2nd D)
int MAX_THREADS = 0; // the maximum number of threads per block
int total_blocks = 0; // the total number of blocks needed
int numblocks_dim1 = 0; // the number of blocks per grid (1st D)
int numblocks_dim2 = 0; // the number of blocks per grid (2nd D)
int numthreads = 0; // the number of threads per block
int amount = 8; // the number to increment by
struct cudaDeviceProp *dev;// the CUDA device properties struct
static struct option long_options =
{
{“num”, required_argument, 0, ‘n’},
{0, 0, 0, 0}
};
/*
-
CUDA Kernel to increment a value in an array
-
by some value.
-
@param *a pointer to the array in GPU memory
-
@param amt the value to increment by
-
@param N the number of elements in the array
*/
global
void increment(float *a, int amt, unsigned int n)
{
unsigned int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
unsigned int threadIndex = blockIndex * blockDim.x + threadIdx.x;
if (threadIndex < n) {
a[threadIndex] = a[threadIndex] + amt;
}
}
/*
-
usage
-
Print usage and then exit.
*/
void usage(int argc, char **argv)
{
fprintf(stderr, “usage: %s [options]\n”,argv[0]);
fprintf(stderr, “\n”);
fprintf(stderr, " -n,–num N the number of elements to process.\n");
fprintf(stderr, " default is %u.\n", N);
fprintf(stderr, " -h,–help this help screen.\n");
fprintf(stderr, “\n”);
exit(1);
}
/*
-
main program
-
@param argc the number of arguments passed to the program
-
@param **argv the arguments passed in
*/
int main(int argc, char **argv)
{
int count;
int active;
int i;
srand(seed);
// get the device properties so we know how to divide the work
dev = (struct cudaDeviceProp *)malloc(sizeof(struct cudaDeviceProp));
if (!dev) {
fprintf(stderr, "*** OOM: could not allocate cudaDeviceProp ***\n");
exit(1);
}
CUDA_SAFE_CALL(cudaGetDeviceCount(&count));
cudaGetDevice(&active);
printf(“total number of devices: %d\n”, count);
printf(“active device is %d\n”, active);
for (i = 0; i < count; i++) {
cudaGetDeviceProperties(dev, i);
printf("device %d: name: %s\n", i, dev->name);
}
active = 1;
printf(“setting active device to %d\n”, active);
cudaSetDevice(active);
cudaGetDevice(&active);
printf(“active device is %d\n”, active);
cudaGetDeviceProperties(dev, active);
MAX_THREADS = dev->maxThreadsPerBlock;
MAX_BLOCKS_DIM1 = dev->maxGridSize[0];
MAX_BLOCKS_DIM2 = dev->maxGridSize[1];
// get input (N)
int option_index = 0;
int c = 0;
while (1) {
c = getopt_long (argc, argv, "n:h", long_options, &option_index);
if (c == -1) break;
switch © {
case 'n':
N = (unsigned int)atof(optarg);
break;
case '?':
default:
usage(argc,argv);
}
}
// divide the work
// each element in N should be operated on by a thread
numthreads = MAX_THREADS;
total_blocks = N / numthreads + (N % numthreads == 0 ? 0 : 1);
numblocks_dim2 = 0;
numblocks_dim1 = 0;
// more than the maximum number of blocks in 1D, so go to 2D
if (total_blocks > MAX_BLOCKS_DIM1) {
numblocks_dim1 = MAX_BLOCKS_DIM1;
numblocks_dim2 = total_blocks / MAX_BLOCKS_DIM1 +
(total_blocks % MAX_BLOCKS_DIM1 == 0 ? 0 : 1);
} else {
numblocks_dim2 = 0;
numblocks_dim1 = total_blocks;
}
printf(“device name: %s\n”, dev->name);
printf(“total global memory: %u\n”, dev->totalGlobalMem);
printf(“N = %u\n”, N);
printf(“numthreads = %d; MAX = %d\n”, numthreads, MAX_THREADS);
printf(“total_blocks = %d; MAX = %d\n”, total_blocks, MAX_BLOCKS_DIM1);
printf(“numblocks_dim2 = %d\n”, numblocks_dim2);
printf(“numblocks_dim1 = %d\n”, numblocks_dim1);
unsigned int calculated_N, extra_blocks, extra_threads;
if (numblocks_dim2 > 0) {
calculated_N = numblocks_dim1 * numthreads * numblocks_dim2;
extra_blocks = numblocks_dim1 * numblocks_dim2 - total_blocks;
} else {
calculated_N = numblocks_dim1 * numthreads;
extra_blocks = 0;
}
extra_threads = calculated_N - N;
printf(“calculated_N = %u\n”, calculated_N);
printf(“extra_threads = %u\n”, extra_threads);
printf(“extra_blocks = %u\n”, extra_blocks);
// allocate host memory for the arrays
A = (float *)malloc(N * sizeof(float));
if (!A) {
fprintf(stderr, "*** OOM: could not allocate A ***\n");
exit(1);
}
B = (float *)malloc(N * sizeof(float));
if (!A) {
fprintf(stderr, "*** OOM: could not allocate B ***\n");
exit(1);
}
CUDA_SAFE_CALL(cudaMalloc((void **)&A_d, N*sizeof(float)));
// CUDA_SAFE_CALL(cudaMalloc((void **)&B_d, N*sizeof(float)));
// initialize A
printf(“initializing A with random floats\n”);
for (i = 0; i < N; i++) {
A[i] = (float) rand()/RAND_MAX;
B[i] = A[i];
}
printf(“copying %d elements from host->gpu\n”,N);
CUDA_SAFE_CALL(cudaMemcpy(A_d,A,N*sizeof(float),cudaMemcpyHo
stToDevice));
if (numblocks_dim2 == 0) numblocks_dim2 = 1; // dimension is 1-indexed?
dim3 grid_blocks(numblocks_dim1,numblocks_dim2);
dim3 block_threads(numthreads);
printf(“running cuda kernel with %dx%d blocks/grid and %d threads/block\n”,
numblocks_dim1,numblocks_dim2,numthreads);
increment<<<grid_blocks, block_threads>>>(A_d,amount,N);
printf(“blocking until gpu work is complete\n”);
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if( err != 0 ) {
printf( "** cuda error: %s ** \n", cudaGetErrorString(err));
}
CUT_CHECK_ERROR(“Kernel execution failed”);
// printf(“copying %d elements from gpu->gpu\n”,N);
// CUDA_SAFE_CALL(cudaMemcpy(B_d,A_d,N*sizeof(float),cudaMemcpy
DeviceToDevice));
// printf(“copying %d elements from gpu->host\n”,N);
// CUDA_SAFE_CALL(cudaMemcpy(B,B_d,N*sizeof(float),cudaMemcpyDe
viceToHost));
printf(“copying %d elements from gpu->host\n”,N);
CUDA_SAFE_CALL(cudaMemcpy(A,A_d,N*sizeof(float),cudaMemcpyDe
viceToHost));
unsigned int errors = 0;
for (i = 0; i < N; i++) {
if (A[i] != B[i] + amount) {
errors++;
}
}
printf(“number of errors: %u / %u\n”,errors,N);
if (errors == N) {
printf("everything failed! what's the problem?\n");
}
free(A);
free(B);
CUDA_SAFE_CALL(cudaFree(A_d));
CUDA_SAFE_CALL(cudaFree(B_d));
}
[/codebox]
my output is:
** GOOD RUN / SMALL N **
total number of devices: 2
active device is 0
device 0: name: GeForce 9600M GT
device 1: name: GeForce 9400M
setting active device to 1
active device is 1
device name: GeForce 9400M
total global memory: 266010624
N = 1048576
numthreads = 512; MAX = 512
total_blocks = 2048; MAX = 65535
numblocks_dim2 = 0
numblocks_dim1 = 2048
calculated_N = 1048576
extra_threads = 0
extra_blocks = 0
initializing A with random floats
copying 1048576 elements from host->gpu
running cuda kernel with 2048x1 blocks/grid and 512 threads/block
blocking until gpu work is complete
copying 1048576 elements from gpu->host
number of errors: 0 / 1048576
** BAD RUN / LARGER N **
total number of devices: 2
active device is 0
device 0: name: GeForce 9600M GT
device 1: name: GeForce 9400M
setting active device to 1
active device is 1
device name: GeForce 9400M
total global memory: 266010624
N = 266010624
numthreads = 512; MAX = 512
total_blocks = 519552; MAX = 65535
numblocks_dim2 = 8
numblocks_dim1 = 65535
calculated_N = 268431360
extra_threads = 2420736
extra_blocks = 4728
initializing A with random floats
copying 266010624 elements from host->gpu
running cuda kernel with 65535x8 blocks/grid and 512 threads/block
blocking until gpu work is complete
** cuda error: unspecified launch failure **
copying 266010624 elements from gpu->host
number of errors: 266010624 / 266010624
everything failed! what’s the problem?