sending parameters to kernel

How does it work with parameters that I send to a kernel function? Should I allocate memory with cudaMalloc for all of the parameters that I want to be available inside the kernel function, or does this concern only those parameters which are pointers into host memory? If I want to send an int, a float, or some other simple-type variable, should I allocate memory for it with cudaMalloc and send it with cudaMemcpy from host to device as well?

Thanks in Advance :)

Can we classify them somehow? For example: variables which you don't need back, which are just input to the kernel, are supposed to be sent one way, and other variables which you want the kernel to change and get back (after execution of the kernel function is finished) have to be sent another way?

Here is the example: is it necessary to allocate memory for dev_first and dev_last, if they are integers and I don't get them back after kernel execution is finished?

// Helper function that uses CUDA to search for primes in parallel.

// Computes the primes in [first, last] on the GPU with <<<nofb, noft>>> and
// writes them into the caller-supplied buffer `output`.
//
// NOTE(review) — preconditions to confirm against callers:
//   * `output` must point to at least numberOfPrimes(first, last) ints.
//     The original code overwrote the by-value `output` parameter with a
//     local malloc, so the results never reached the caller and the host
//     buffer leaked; this version writes into the caller's buffer instead.
//   * `first` and `last` are passed to the kernel by value, so they need no
//     cudaMalloc/cudaMemcpy of their own — only `output` needs a device
//     mirror (dev_output). The original's dev_first/dev_last allocations
//     were never used by the launch and have been removed (the dev_last
//     copy also had a syntax error: `sizeof(int)*,`).
//
// Returns cudaSuccess on success, or the first CUDA error encountered.
// An invalid [first, last] range or a NULL `output` now reports
// cudaErrorInvalidValue (the original fell through returning cudaSuccess).
cudaError_t PrimesWithCuda(int first, int last, int nofb, int noft, int* output)
{
    // Declared and initialized up front so every `goto Error` is legal in C++
    // (no jump over an initialized declaration) and cleanup is always safe.
    int* dev_output = NULL;  // device-side results buffer
    int  nop = 0;            // expected number of primes in [first, last]

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    nop = numberOfPrimes(first, last);
    if (nop <= 0) {
        fprintf(stderr, "Wrong parameters: first & last\n");
        cudaStatus = cudaErrorInvalidValue;  // original returned cudaSuccess here
        goto Error;
    }

    if (output == NULL) {
        fprintf(stderr, "output: NULL buffer\n");
        cudaStatus = cudaErrorInvalidValue;
        goto Error;
    }
    memset(output, 0, (size_t)nop * sizeof(int));

    cudaStatus = cudaMalloc((void**)&dev_output, (size_t)nop * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_output: cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Zero the device buffer directly instead of round-tripping host zeros
    // through cudaMemcpy (byte-wise memset of 0 is exact for ints).
    cudaStatus = cudaMemset(dev_output, 0, (size_t)nop * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_output: cudaMemset failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Launch a kernel on the GPU; `first` and `last` travel as plain value
    // arguments — no device allocation needed for scalars.
    PrimesKernel<<<nofb, noft>>>(first, last, dev_output);

    // Kernel launches don't return errors directly: catch bad launch
    // configurations here (the original skipped this check).
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "PrimesKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching PrimesKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the results back into the caller's buffer.
    cudaStatus = cudaMemcpy(output, dev_output, (size_t)nop * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_output: DtH cudaMemcpy failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

Error:
    // cudaFree(NULL) is a documented no-op, so this single cleanup point is
    // safe on every path — fixing the device-memory leak the original had on
    // its error paths (it only freed on the success path).
    cudaFree(dev_output);
    return cudaStatus;
}

All of the arguments to a kernel are passed by value from the host, so there is no need to allocate device memory for them. They are all input to the kernel.

However, some of them may be pointers. In that case, it is necessary to allocate the device memory they point to. The allocated memory can then be used as input, output, both, or neither (just for scratch use inside the kernel).