1 kernel, N launches, 1 result ?

Hi,

I’m trying to launch one kernel multiple times and get a single result, as follows:

// cudaMalloc(d_result)

// fill d_result with '\0'

// Queue the kernel N times. A launch returns no status by itself, so a
// failure is only visible through the explicit checks that follow.
for( i=0; i<N; i++)
	mykernel <<< 512, 64 >>> (myArray, i, d_result );

// Reports a launch that could not be started (e.g. an invalid configuration).
if( cudaGetLastError() != cudaSuccess )
{
	printf("Error : kernel launch\n");
	exit(-1);
}

// Blocks until all queued launches have finished; an error raised while a
// kernel was running is returned here.
// (cudaThreadSynchronize() was replaced by cudaDeviceSynchronize() in CUDA 4.0.)
if( cudaThreadSynchronize() != cudaSuccess )
{
	printf("Error : kernel execution\n");
	exit(-1);
}

// NOTE(review): only 8 bytes are copied back; result presumably holds a 9th
// '\0' byte so that "%s" stays terminated - confirm against its allocation.
if( cudaMemcpy(result, d_result, 8 * sizeof(char), cudaMemcpyDeviceToHost) != cudaSuccess )
{
	printf("Error : memcpy result\n");
	exit(-1);
}

printf("%s\n", result);

In the above code, d_result is an array of 8 chars which is filled with ‘\0’ before launching the kernel. The kernel performs some computation according to myArray (constant) and i (counter). In this example, i goes from 0 to N-1 and only one value of i can modify the d_result array.

// Writes "ok" into the first two bytes of d_result when the launch counter i
// is 7; for any other i the kernel leaves d_result untouched.
// myArray is accepted but not read. No thread or block index is consulted,
// so every thread of the launch performs the same two stores.
__global__ void mykernel(int myArray, int i, char *d_result)
{
	// Launches with any other counter value have nothing to report.
	if (i != 7)
		return;

	d_result[0] = 'o';
	d_result[1] = 'k';
}

Because of the “cudaThreadSynchronize()” statement, I was thinking that once all kernels had returned, only i=7 would have modified the d_result array, but in fact the printf displays ‘nothing’ (\0\0\0…\0 - the init value of d_result).

Is this the normal behavior of such code? My goal is to launch 1 kernel N times. Of those N kernel launches, only one can return a “result” - so how should I do it if my approach is not correct? :o)

If I’m right, d_result is in global memory (cudaMalloc), so every block in the grid and every thread in each block has access to the same memory space.

Thanks a lot,

n0mad

mykernel <<< 512, 64 >>> (myArray, i, d_result );

What you are doing is passing a pointer (d_result) to the kernel. You created 8 chars on the host, d_result is of type char* say, and so d_result points to an address in host memory. In your kernel:

// Kernel quoted from the original question: stores 'o' and 'k' in
// d_result[0] and d_result[1], but only for the launch whose counter i is 7.
// myArray is unused, and no thread/block index is read, so all threads of
// that launch write the same two bytes.
__global__ void mykernel(int myArray, int i, char *d_result)
{
	const bool isReportingLaunch = (i == 7);

	if (!isReportingLaunch)
		return;

	char *out = d_result;
	out[0] = 'o';
	out[1] = 'k';
}

You are then writing to this address on the device, which in all likelihood points to some invalid area which on the host would possibly result in a segmentation fault.

What you can do is cudaMalloc space on the device to write your results to, and then after the kernel call do cudaMemcpy using deviceToHost to get access to the results.

er, he specifically said he cudaMalloc’d something

Hello,

In my example, the d_result pointer is a valid device pointer. I mean I allocated it using cudaMalloc (as explained in the comment lines).

char *d_result; // device array

// Allocate 9 bytes on the device for the result string.
if( cudaMalloc((void **) &d_result, 9 * sizeof(char)) != cudaSuccess )
{
	printf("Error : malloc d_result\n");
	exit(-1);
}

char *result = (char *)calloc(9, sizeof(char)); // host array init with '\0'

// calloc returns NULL on failure; stop before the copy below reads from it.
if( result == NULL )
{
	printf("Error : calloc result\n");
	exit(-1);
}

if( cudaMemcpy(d_result, result, 9 * sizeof(char), cudaMemcpyHostToDevice) != cudaSuccess ) // copy to device to init it to '\0'
{
	printf("Error : memcpy d_result\n");
	exit(-1);
}

// launch kernel here

So ? =)

Works fine for me (CUDA 2.1 on Linux run on a 9500GT).

Yes…this example also works for me… I assume that’s a tricky issue in my real code…

thanks anyway

No, I mean your original code works perfectly:

#include <stdio.h>

#include "cuda.h"

// Marks the result buffer with "ok" for the single launch whose counter i
// equals 7; every other launch returns without touching d_result.
// myArray is not read. The stores do not depend on threadIdx/blockIdx, so
// each thread of the launch writes the same values to the same two bytes.
__global__ void mykernel(int myArray, int i, char *d_result)
{
	if (i != 7) {
		return;
	}

	d_result[0] = 'o';
	d_result[1] = 'k';
}

#define RESULT_LEN (8)

#define N (21)

/* Prints the failing step and the CUDA error text to stderr.
 * Returns 1 when status is an error, 0 when it is cudaSuccess. */
static int cudaFailed(cudaError_t status, const char *what)
{
	if (status == cudaSuccess)
		return 0;

	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return 1;
}

/*
 * Launches mykernel N times on one zero-initialised device buffer and, after
 * each launch, copies the buffer back and prints "<i> <contents>".
 * Every CUDA call is checked; the first failure is reported on stderr and the
 * program exits with status 1 instead of printing stale data.
 * Returns 0 when every step succeeded.
 */
int main(int argc, char* argv[])
{
	int	 i;
	int	 myArray=0;
	int	 failed=0;
	/* One byte more than is ever copied back, so "%s" always finds a '\0'.
	 * The initialiser zeroes the whole array (no memset / <string.h> needed). */
	char	h_result[RESULT_LEN + 1] = {0};
	char	*d_result = NULL;
	size_t  tsize=RESULT_LEN * sizeof(char);

	(void) argc;
	(void) argv;

	if (cudaFailed(cudaMalloc((void**) &d_result, tsize), "cudaMalloc"))
		return 1;

	/* Start the device buffer as all '\0'. */
	failed = cudaFailed(cudaMemcpy(d_result, h_result, tsize, cudaMemcpyHostToDevice),
	                    "cudaMemcpy (host to device)");

	for( i=0; !failed && i<N; i++) {
		mykernel <<< 512, 64 >>> (myArray, i, d_result );

		/* A launch returns nothing itself; ask the runtime whether it was accepted. */
		failed = cudaFailed(cudaGetLastError(), "mykernel launch");
		if (failed)
			break;

		/* This blocking copy waits for the kernel, so it also reports errors
		 * raised while the kernel was running. */
		failed = cudaFailed(cudaMemcpy(h_result, d_result, tsize, cudaMemcpyDeviceToHost),
		                    "cudaMemcpy (device to host)");
		if (failed)
			break;

		fprintf(stdout,"%2d %s\n", i, h_result);
	}

	/* Release the device buffer on both the success and the failure path. */
	if (cudaFailed(cudaFree(d_result), "cudaFree"))
		failed = 1;

	return failed;
}

which produces

david@quadro:~/build/cudaok$ ./cudaok 

 0 

 1 

 2 

 3 

 4 

 5 

 6 

 7 ok

 8 ok

 9 ok

10 ok

11 ok

12 ok

13 ok

14 ok

15 ok

16 ok

17 ok

18 ok

19 ok

20 ok