unspecified launch failure...

Hello.

I am writing a CUDA program whose kernel runs a loop, as shown below.

From what I have read, my first guess was that this error is caused by an out-of-bounds memory access (the GPU equivalent of a segmentation fault).
But this program doesn’t seem to have such a bug…
Please point out any bugs you can find…

Thank you.

/////////////////////////////////////////////////////////////
#include<stdio.h>
#include<cutil.h>
#include<cutil_inline.h>

/* Launch geometry and buffer sizes. STRIDESIZE and FILESIZE are in BYTES. */
#define LOOP (20)               /* passes each thread makes over the data */
#define BLOCKNUM (60)           /* thread blocks in the grid */
#define THREAD_PER_BLOCK (256)  /* threads per block */
/* Bytes covered by the whole grid in one pass: 16 bytes per thread
 * (the kernel moves four unsigned ints per thread; assumes 4-byte unsigned int).
 * Fully parenthesized so that expressions such as FILESIZE / STRIDESIZE
 * or STRIDESIZE / 4 evaluate as intended. */
#define STRIDESIZE (THREAD_PER_BLOCK * 16 * BLOCKNUM)
/* Total bytes processed over all LOOP passes. */
#define FILESIZE (STRIDESIZE * LOOP)

/*
 * kernel: writes in[k] + 1 to out[k] for every element k it visits.
 *
 * Each thread handles four consecutive unsigned ints per pass, starting at
 * element 4 * (blockIdx.x * blockDim.x + threadIdx.x), and makes `loopnum`
 * passes. Every pass is shifted `stride` ELEMENTS (not bytes) further into
 * both buffers.
 *
 * Launch layout: only the .x components of the grid and block are used.
 *
 * Preconditions (not checked here -- there is no length parameter):
 *   - `in` and `out` are device pointers holding at least
 *     (loopnum - 1) * stride + 4 * gridDim.x * blockDim.x elements each.
 *   - For passes not to overlap, stride >= 4 * gridDim.x * blockDim.x.
 */
__global__ void kernel(
    unsigned int *in,
    unsigned int *out,
    unsigned int loopnum,
    unsigned int stride)
{
    /* First element owned by this thread within one pass. Computed in size_t
     * so that large grids cannot overflow a 32-bit int. */
    const size_t first = 4 * ((size_t)blockIdx.x * blockDim.x + threadIdx.x);

    /* Unsigned counter: matches the type of loopnum (no signed/unsigned mix). */
    for (unsigned int pass = 0; pass < loopnum; ++pass) {
        /* Index from the untouched base pointers instead of advancing the
         * pointer parameters on every pass. */
        const size_t base = (size_t)pass * stride + first;

        /* Load all four values first, then store, as the original did. */
        const unsigned int v0 = in[base + 0];
        const unsigned int v1 = in[base + 1];
        const unsigned int v2 = in[base + 2];
        const unsigned int v3 = in[base + 3];

        out[base + 0] = v0 + 1;
        out[base + 1] = v1 + 1;
        out[base + 2] = v2 + 1;
        out[base + 3] = v3 + 1;
    }
}

// ****************************
// MAIN
// ****************************
// Host driver: fills a pinned host buffer, copies it to the device, runs
// `kernel` over it, copies the result back and verifies it.
// Returns 0 on success, 1 if the kernel failed or the result is wrong.
int main(int argc, char **argv)
{
    // FILESIZE and STRIDESIZE are byte counts; the kernel works in elements.
    const size_t numWords = (FILESIZE) / sizeof(unsigned int);
    const size_t numBytes = numWords * sizeof(unsigned int);
    const unsigned int strideWords =
        (unsigned int)((STRIDESIZE) / sizeof(unsigned int));

    unsigned int *pt = 0;       // host input  (pinned)
    unsigned int *ct_buf2 = 0;  // host output (pinned)
    unsigned int *d_pt = 0;     // device input
    unsigned int *d_ct = 0;     // device output

    CUT_DEVICE_INIT(argc, argv);

    // malloc
    CUDA_SAFE_CALL( cudaMallocHost((void **)&ct_buf2, numBytes) );
    CUDA_SAFE_CALL( cudaMallocHost((void **)&pt, numBytes) );
    CUDA_SAFE_CALL( cudaMalloc((void **)&d_pt, numBytes) );
    CUDA_SAFE_CALL( cudaMalloc((void **)&d_ct, numBytes) );

    // Give the input defined contents; the original copied uninitialized memory.
    for (size_t i = 0; i < numWords; ++i) {
        pt[i] = (unsigned int)i;
    }

    // Copy the WHOLE input buffer. The original used sizeof(unsigned char)
    // here and so transferred only a quarter of it.
    CUDA_SAFE_CALL( cudaMemcpy(d_pt, pt, numBytes, cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL( cudaMemset(d_ct, 0, numBytes) );

    dim3 grid(BLOCKNUM, 1, 1), block(THREAD_PER_BLOCK, 1, 1);
    kernel<<<grid, block>>>(d_pt, d_ct, LOOP, strideWords);

    // A launch returns no status itself: query the launch error explicitly,
    // then synchronize to pick up errors raised while the kernel was running.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaThreadSynchronize();
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));
        // Later CUDA calls may fail as well after a kernel fault, so stop
        // here and let process exit release the allocations.
        return 1;
    }

    CUDA_SAFE_CALL( cudaMemcpy(ct_buf2, d_ct, numBytes, cudaMemcpyDeviceToHost) );

    // Verify: every output element must be the matching input plus one.
    size_t bad = 0;
    for (size_t i = 0; i < numWords; ++i) {
        if (ct_buf2[i] != pt[i] + 1) {
            ++bad;
        }
    }
    if (bad != 0) {
        fprintf(stderr, "%lu of %lu elements are wrong\n",
                (unsigned long)bad, (unsigned long)numWords);
    }

    CUDA_SAFE_CALL( cudaFreeHost(ct_buf2) );
    CUDA_SAFE_CALL( cudaFreeHost(pt) );
    CUDA_SAFE_CALL( cudaFree(d_pt) );
    CUDA_SAFE_CALL( cudaFree(d_ct) );

    return bad == 0 ? 0 : 1;
}

Your stride size is four times too large. Adding x to a pointer p in C will increment p by x*sizeof(*p) bytes (that is, by x elements), not by x bytes.

Thank you for your reply.

As you say — and I am aware of this — adding x to a pointer p in C advances it by x elements, i.e. by x*sizeof(*p) bytes.

I forgot to explain that the stride is already divided by 4 (“STRIDESIZE/4”) at the kernel call, so it is passed in elements rather than bytes.

If you have already found another cause, it would be very helpful if you could post the corrected code here…