How to pass a single number to the device (basic programming)

Hi everyone,
I am learning CUDA and I have a problem with sending a single number to the device. If this question is too naive, please excuse me.

What this program does is add two vectors, each scaled by a number. It seems very simple; however, I get unexpected numbers.

Here is my program:
/************************************************************************************/
#include <stdio.h>

#define N 10

/*
 * Kernel: c[i] = d[0]*a[i] + e[0]*b[i] for every i in [0, N).
 *
 * a, b : input vectors holding at least N ints
 * c    : output vector holding at least N ints
 * d, e : pointers to the single-int scale factors for a and b
 * All five pointers are dereferenced by the kernel, so they must refer to
 * memory the device can access.
 *
 * Launch layout: any 1D grid/block combination providing at least N threads
 * in total (e.g. <<<N,1>>> or <<<1,N>>>); surplus threads do nothing.
 */
__global__ void add( int *a, int *b, int *c, int *d, int *e) {

    // Flat global index: identical to blockIdx.x for a <<<N,1>>> launch, but
    // also correct when a block contains more than one thread.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against threads beyond the end of the vectors.
    if (tid < N)
        c[tid] = d[0]*a[tid] + e[0]*b[tid];

}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk( cudaError_t err, const char *what ) {
    if (err != cudaSuccess) {
        fprintf( stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString( err ) );
        return false;
    }
    return true;
}

/*
 * Host driver: fills a and b on the CPU, copies them and the two scale
 * factors d and e to the GPU, runs the add kernel, copies the result back
 * and prints it.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main( void ) {

    int a[N], b[N], c[N];

    // Initialised to NULL so the cleanup at the end is safe even when an
    // allocation fails (cudaFree on a null pointer performs no operation).
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    int *dev_d = NULL, *dev_e = NULL;

    // One element each. The originally posted code declared these as
    // zero-length arrays (int d[0]; int e[0];), so storing to d[0] / e[0]
    // wrote past the end of the arrays -- which is why the printf below
    // showed unexpected numbers.
    int d[1];
    int e[1];

    d[0]=1;
    e[0]=1;
    printf( "%d and %d\n\n", d[0], e[0]);

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // allocate the memory on the GPU
    bool ok = cudaOk( cudaMalloc( (void**)&dev_a, N * sizeof(int) ), "cudaMalloc(dev_a)" )
           && cudaOk( cudaMalloc( (void**)&dev_b, N * sizeof(int) ), "cudaMalloc(dev_b)" )
           && cudaOk( cudaMalloc( (void**)&dev_c, N * sizeof(int) ), "cudaMalloc(dev_c)" )
           && cudaOk( cudaMalloc( (void**)&dev_d, sizeof(int) ), "cudaMalloc(dev_d)" )
           && cudaOk( cudaMalloc( (void**)&dev_e, sizeof(int) ), "cudaMalloc(dev_e)" );

    // copy the arrays 'a' and 'b' and the scale factors to the GPU
    ok = ok
      && cudaOk( cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice ), "cudaMemcpy(dev_a)" )
      && cudaOk( cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice ), "cudaMemcpy(dev_b)" )
      && cudaOk( cudaMemcpy( dev_e, e, sizeof(int), cudaMemcpyHostToDevice ), "cudaMemcpy(dev_e)" )
      && cudaOk( cudaMemcpy( dev_d, d, sizeof(int), cudaMemcpyHostToDevice ), "cudaMemcpy(dev_d)" );

    if (ok) {
        // N blocks of one thread each, one block per vector element.
        add<<<N,1>>>( dev_a, dev_b, dev_c , dev_d, dev_e);

        // A kernel launch returns nothing; a bad launch configuration is
        // only visible through cudaGetLastError().
        ok = cudaOk( cudaGetLastError(), "add kernel launch" );
    }

    // copy the array 'c' back from the GPU to the CPU
    // (this blocking copy also waits for the kernel to finish)
    ok = ok
      && cudaOk( cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost ), "cudaMemcpy(c)" );

    // display the results
    if (ok) {
        for (int i=0; i<N; i++) {
            printf( "%d + %d = %d\n", a[i], b[i], c[i] );
        }
    }

    // free the memory allocated on the GPU (including the two scalars,
    // which the original code leaked)
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    cudaFree( dev_d );
    cudaFree( dev_e );

    return ok ? 0 : 1;

}

Hi!

You are launching 10 blocks of 1 thread each; this is a bit strange :)

Furthermore, note that threadIdx.x is 0 for all ten of your threads, since each one is thread number 0 of its own block; if the kernel indexed with threadIdx.x instead of blockIdx.x, every thread would write to c[0].

See the vectorAdd sample in the SDK.

PS: you can pass ints and floats directly to your kernels; the arguments do not always have to be int* or float*.

Regards!

You can pass a number in by value, but to get one out you need pointers.

This is a small example which you might find useful:

#include <iostream>

/*
 * Kernel: stores the x-dimension block size of the current launch
 * (blockDim.x) into the unsigned integer that blockDim_x points to.
 *
 * Every launched thread performs the same store of the same value.
 * blockDim_x must point to memory that device code can write.
 */
__global__ void test(unsigned *blockDim_x)
{
	const unsigned threadsPerBlock = blockDim.x;

	blockDim_x[0] = threadsPerBlock;
}

/*
 * Launches the test kernel with 2048 blocks of 256 threads, copies the value
 * the kernel stored back to the host, and prints it together with the
 * numeric result of cudaGetLastError().
 *
 * argc and argv are not used. Always returns 0.
 */
int main( int argc, char ** argv )
{
	// Host copy of the result. It starts at 1111, a value different from the
	// launched block size -- presumably so a failed copy is noticeable.
	unsigned hostBlockSize = 1111;

	// Device-side slot the kernel writes into.
	unsigned *devBlockSize;
	cudaMalloc((void **)&devBlockSize, sizeof(unsigned));

	test<<< 2048, 256 >>>(devBlockSize);

	// Blocking device-to-host copy of the single unsigned value.
	cudaMemcpy(&hostBlockSize, devBlockSize, sizeof(unsigned), cudaMemcpyDeviceToHost);

	std::cout << "Blocksize: " << hostBlockSize;
	std::cout << "\nError state: " << cudaGetLastError();

	cudaFree(devBlockSize);

	return 0;
}

See the code below. I’ve edited it and inserted comments in it (marked as “//!!”).

/************************************************************************************/

#include <stdio.h>

#define N 10

/*
 * Kernel: c[i] = d*a[i] + e*b[i] for every i in [0, N).
 *
 * a, b : input vectors holding at least N ints (device-accessible)
 * c    : output vector holding at least N ints (device-accessible)
 * d, e : scale factors for a and b, passed by value
 *
 * Launch layout: any 1D grid/block combination providing at least N threads
 * in total (e.g. <<<1,N>>>); surplus threads do nothing.
 */
__global__ void add( int *a, int *b, int *c, int d, int e) {

    // Flat global index: equal to threadIdx.x for a single-block launch, and
    // still unique per thread when the work is spread over several blocks.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against threads beyond the end of the vectors.
    if (tid < N)
        c[tid] = d*a[tid] + e*b[tid];

}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk( cudaError_t err, const char *what ) {
    if (err != cudaSuccess) {
        fprintf( stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString( err ) );
        return false;
    }
    return true;
}

/*
 * Host driver: fills a and b on the CPU, copies them to the GPU, runs the
 * add kernel with the scale factors d and e passed by value, copies the
 * result back and prints it.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main( void ) {

    int a[N], b[N], c[N];

    // Initialised to NULL so the cleanup at the end is safe even when an
    // allocation fails (cudaFree on a null pointer performs no operation).
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    int d=1;

    int e=1;

    //!! int d[0] is an array of ZERO elements. You should have used int d[1], or simply int d

    printf( "%d and %d\n\n", d, e);

    // I dont know why it print other numbers? Can you explain?

    //!! Because you tried to write to a nonexistent element, see above.

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // allocate the memory on the GPU
    bool ok = cudaOk( cudaMalloc( (void**)&dev_a, N * sizeof(int) ), "cudaMalloc(dev_a)" )
           && cudaOk( cudaMalloc( (void**)&dev_b, N * sizeof(int) ), "cudaMalloc(dev_b)" )
           && cudaOk( cudaMalloc( (void**)&dev_c, N * sizeof(int) ), "cudaMalloc(dev_c)" );

    // copy the arrays 'a' and 'b' to the GPU
    ok = ok
      && cudaOk( cudaMemcpy( dev_a, a, N * sizeof(int),cudaMemcpyHostToDevice ), "cudaMemcpy(dev_a)" )
      && cudaOk( cudaMemcpy( dev_b, b, N * sizeof(int),cudaMemcpyHostToDevice ), "cudaMemcpy(dev_b)" );

    //!! cudaMemset(dev_c,0,N*sizeof(int));

    //!! memset is a good idea

    if (ok) {
        add<<<1,N>>>( dev_a, dev_b, dev_c , d, e);

        // A kernel launch returns nothing; a bad launch configuration is
        // only visible through cudaGetLastError().
        ok = cudaOk( cudaGetLastError(), "add kernel launch" );
    }

    //!! 1 block of N threads is better here than N blocks of 1 thread.

    //!! Block is better to contain a multiple of 32 threads

    // copy the array 'c' back from the GPU to the CPU
    // (this blocking copy also waits for the kernel to finish)
    ok = ok
      && cudaOk( cudaMemcpy(c,dev_c, N * sizeof(int), cudaMemcpyDeviceToHost ), "cudaMemcpy(c)" );

    // display the results
    if (ok) {
        for (int i=0; i<N; i++)
            printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    //you may skip brackets if they contain 1 line

    // free the memory allocated on the GPU
    cudaFree( dev_a );

    cudaFree( dev_b );

    cudaFree( dev_c );

    return ok ? 0 : 1;

}

Passing plain int/char/float/bool/etc. parameters by value is possible. As was said above, it’s impossible to get data back from a kernel without pointers.

Thanks Everyone;

The information provided was very helpful;

Thanks Everyone;

The information provided was very helpful;