# How to pass a single number to device basic programming

Hi everyone,
I am learning CUDA and I have a problem with sending a single number to the device. If this question is too naive, please forgive me.

What this program does is add two vectors, each scaled by a number. It seems very simple; however, I got unexpected numbers.

here is my program
/************************************************************************************/
#include <stdio.h>

#define N 10

// Kernel: computes c[i] = d[0]*a[i] + e[0]*b[i] for i in [0, N).
// Each block handles one element (the index is taken from blockIdx.x),
// so it is meant to be launched with N blocks, e.g. add<<<N,1>>>(...).
// d and e each point to a single int scale factor; all pointers are
// dereferenced on the device, so they must be device pointers.
__global__ void add( int *a, int *b, int *c, int *d, int *e) {

``````int tid = blockIdx.x;    // this thread handles the data at its thread id

if (tid < N)

c[tid] = d[0]*a[tid] + e[0]*b[tid];
``````

}

// Host driver for the scaled vector add: fills a and b on the CPU, copies
// them and the two scale factors d and e to the GPU, launches add, copies
// c back and prints one line per element.
int main( void ) {

``````int a[N], b[N], c[N];

int *dev_a, *dev_b, *dev_c;
int *dev_d,*dev_e;
// NOTE(review): d and e are declared with ZERO elements, so the d[0]/e[0]
// accesses below are out of bounds (undefined behavior). A one-element
// array (int d[1]) or a plain int is needed for these writes to be valid.
int d[0];
int e[0];

d[0]=1;
e[0]=1;
printf( "%d and %d\n\n", d[0], e[0]);
``````

// I dont know why it print other numbers? Can you explain?

``````// fill the arrays 'a' and 'b' on the CPU

for (int i=0; i<N; i++) {

a[i] = -i;

b[i] = i * i;

}

// allocate the memory on the GPU
// NOTE(review): none of the CUDA runtime return codes in this function are checked.

cudaMalloc( (void**)&dev_a, N * sizeof(int) );

cudaMalloc( (void**)&dev_b, N * sizeof(int) );

cudaMalloc( (void**)&dev_c, N * sizeof(int) );
// one int on the device for each of the two scale factors
cudaMalloc( (void**)&dev_d, sizeof(int) );
cudaMalloc( (void**)&dev_e, sizeof(int) );

// copy the arrays 'a' and 'b' to the GPU

cudaMemcpy( dev_a, a, N * sizeof(int),

cudaMemcpyHostToDevice ) ;

cudaMemcpy( dev_b, b, N * sizeof(int),

cudaMemcpyHostToDevice ) ;
// copy the scale factors; these read sizeof(int) bytes from the
// zero-length host arrays declared above
cudaMemcpy( dev_e, e, sizeof(int),

cudaMemcpyHostToDevice ) ;
cudaMemcpy( dev_d, d, sizeof(int),

cudaMemcpyHostToDevice ) ;

// launch N blocks of 1 thread each; the kernel indexes by blockIdx.x
add<<<N,1>>>( dev_a, dev_b, dev_c , dev_d, dev_e);

// copy the array 'c' back from the GPU to the CPU

cudaMemcpy( c, dev_c, N * sizeof(int),

cudaMemcpyDeviceToHost );

// display the results

for (int i=0; i<N; i++) {

printf( "%d + %d = %d\n", a[i], b[i], c[i] );

}

// free the memory allocated on the GPU
// NOTE(review): dev_d and dev_e are allocated above but never freed here.

cudaFree( dev_a );

cudaFree( dev_b );

cudaFree( dev_c );

return 0;
``````

}

Hi!

You are launching 10 blocks of 1 thread each; this is a bit strange :)

PS: you can pass ints and floats directly to your kernels by value; the parameters do not always have to be int* or float*.

You can pass a number in by value, but to get a result out you need pointers.

This is a small example which you might find useful:

``````#include <iostream>

// Kernel: stores the launch's block width (blockDim.x) into the single
// unsigned that blockDim_x points to. Every launched thread performs the
// same store of the same value, so any grid/block configuration works.
__global__ void test(unsigned *blockDim_x)
{
    const unsigned width = blockDim.x;
    blockDim_x[0] = width;
}

// Host driver: allocates one unsigned on the device, launches test with
// 2048 blocks of 256 threads, copies the stored value back and prints it
// together with the CUDA error state.
// Returns 0 on success and 1 if any CUDA call or the launch failed.
int main( int argc, char ** argv )
{
    (void)argc;   // command-line arguments are not used
    (void)argv;

    unsigned blockSize = 1111;   // sentinel; stays 1111 if nothing is copied back
    unsigned *bs = 0;            // null so cudaFree below is a no-op if allocation fails

    cudaError_t err = cudaMalloc((void **)&bs, sizeof(unsigned));

    if (err == cudaSuccess) {
        test<<< 2048, 256 >>>(bs);
        // A kernel launch returns nothing; configuration errors are
        // reported through cudaGetLastError().
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        // The copy's return code also reports errors raised while the kernel ran.
        err = cudaMemcpy(&blockSize, bs, sizeof(unsigned), cudaMemcpyDeviceToHost);
    }

    std::cout << "Blocksize: " << blockSize
              << "\nError state: " << static_cast<int>(err)
              << " (" << cudaGetErrorString(err) << ")" << std::endl;

    cudaFree(bs);

    return err == cudaSuccess ? 0 : 1;
}
``````

See the code below. I’ve edited it and inserted comments in it (marked as “//!!”).

``````/************************************************************************************/

#include <stdio.h>

#define N 10

// Kernel: computes c[i] = d*a[i] + e*b[i] for i in [0, N).
// The scale factors d and e are passed by value, so no device allocation
// or copy is needed for them. a, b and c are dereferenced on the device.
// Any 1D launch that supplies at least N threads in total is valid
// (e.g. add<<<1,N>>> or add<<<N,1>>>); surplus threads do nothing.
__global__ void add( int *a, int *b, int *c, int d, int e) {

    // Flat global thread index across the 1D grid.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < N)
        c[tid] = d*a[tid] + e*b[tid];

}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess so that calls can be chained with &&.
static bool cudaOk( cudaError_t err, const char *what ) {

    if (err == cudaSuccess)
        return true;

    fprintf( stderr, "%s failed: %s\n", what, cudaGetErrorString(err) );

    return false;
}

// Host driver for the scaled vector add: fills a and b on the CPU, copies
// them to the GPU, launches add with the scale factors d and e passed by
// value, copies c back and prints one line per element.
// Returns 0 on success and 1 if any CUDA call or the launch failed.
int main( void ) {

    int a[N], b[N], c[N];

    // Null-initialized so the cudaFree calls at the end are safe no-ops
    // for anything that was never allocated.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    int d=1;

    int e=1;

    //!! int d[0] is an array of ZERO elements. You should have used int d[1] or int d, which is the same

    printf( "%d and %d\n\n", d, e);

    // I dont know why it print other numbers? Can you explain?

    //!! Because you tried to write to unexisting element, see above.

    // fill the arrays 'a' and 'b' on the CPU

    for (int i=0; i<N; i++) {

        a[i] = -i;

        b[i] = i * i;

    }

    // allocate the memory on the GPU, then copy the arrays 'a' and 'b' to it;
    // && short-circuits, so the first failure stops the chain

    bool ok =
        cudaOk( cudaMalloc( (void**)&dev_a, N * sizeof(int) ), "cudaMalloc(dev_a)" ) &&
        cudaOk( cudaMalloc( (void**)&dev_b, N * sizeof(int) ), "cudaMalloc(dev_b)" ) &&
        cudaOk( cudaMalloc( (void**)&dev_c, N * sizeof(int) ), "cudaMalloc(dev_c)" ) &&
        cudaOk( cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice ), "cudaMemcpy(dev_a)" ) &&
        cudaOk( cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice ), "cudaMemcpy(dev_b)" );

    //!! cudaMemset(dev_c,0,N*sizeof(int));

    //!! memset is a good idea

    if (ok) {

        add<<<1,N>>>( dev_a, dev_b, dev_c , d, e);

        //!! 1 block of N threads is better here than N blocks of 1 thread.

        //!! Block is better to contain a multiple of 32 threads

        // A launch returns nothing, so configuration errors are picked up
        // with cudaGetLastError(); the copy back reports errors raised
        // while the kernel ran.

        // copy the array 'c' back from the GPU to the CPU

        ok = cudaOk( cudaGetLastError(), "add kernel launch" ) &&
             cudaOk( cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost ), "cudaMemcpy(c)" );

    }

    // display the results (only when c really holds the kernel output)

    if (ok) {

        for (int i=0; i<N; i++)

            printf( "%d + %d = %d\n", a[i], b[i], c[i] );

        //you may skip brackets if they contain 1 line

    }

    // free the memory allocated on the GPU

    cudaFree( dev_a );

    cudaFree( dev_b );

    cudaFree( dev_c );

    return ok ? 0 : 1;

}
``````

Passing plain int/char/float/bool/etc. parameters by value is possible. As was said above, it’s impossible to get data back from a kernel without pointers.

Thanks Everyone;

The information provided was very helpful;

Thanks Everyone;

The information provided was very helpful;