How to pass a single number to device basic programming

paulclizana · November 29, 2010, 2:41pm

Hi everyone,
I am learning CUDA and I have a problem with sending a single number to the device. If this question is too naive, please excuse me.

What this program does is add two vectors, each scaled by a number. It seems very simple; however, I get unexpected numbers.

here is my program
/************************************************************************************/
#include <stdio.h>

#define N 10

// Computes c[i] = d[0]*a[i] + e[0]*b[i] for the element selected by the block index.
// Launch layout: one thread per block and at least N blocks (e.g. add<<<N,1>>>);
// each block handles the element whose index equals blockIdx.x.
// a, b and c must point to device arrays of at least N ints; d and e must point
// to device memory holding one int each (the two scale factors).
__global__ void add( int *a, int *b, int *c, int *d, int *e) {

    int tid = blockIdx.x;    // this thread handles the data at its block index

    // blocks beyond the end of the arrays do nothing
    if (tid < N)
        c[tid] = d[0]*a[tid] + e[0]*b[tid];

}

// Host driver: scales two N-element vectors by d[0] and e[0], adds them on the
// GPU, and prints each result.
// Returns 0 on success, 1 if any CUDA call fails.
int main( void ) {

    int a[N], b[N], c[N];

    // NULL-initialised so the cudaFree calls at the end are safe after a failure.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    int *dev_d = NULL, *dev_e = NULL;

    // Q (original post): why did the printf below print other numbers?
    // A: d and e were declared as "int d[0]" / "int e[0]", i.e. arrays with ZERO
    //    elements, so writing d[0] and e[0] was out of bounds. They need one
    //    element each.
    int d[1];
    int e[1];

    d[0]=1;
    e[0]=1;
    printf( "%d and %d\n\n", d[0], e[0]);

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // allocate the memory on the GPU; each step runs only if all earlier ones succeeded
    cudaError_t err = cudaMalloc( (void**)&dev_a, N * sizeof(int) );
    if (err == cudaSuccess) err = cudaMalloc( (void**)&dev_b, N * sizeof(int) );
    if (err == cudaSuccess) err = cudaMalloc( (void**)&dev_c, N * sizeof(int) );
    if (err == cudaSuccess) err = cudaMalloc( (void**)&dev_d, sizeof(int) );
    if (err == cudaSuccess) err = cudaMalloc( (void**)&dev_e, sizeof(int) );

    // copy the arrays 'a' and 'b' and the two scale factors to the GPU
    if (err == cudaSuccess)
        err = cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice );
    if (err == cudaSuccess)
        err = cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice );
    if (err == cudaSuccess)
        err = cudaMemcpy( dev_e, e, sizeof(int), cudaMemcpyHostToDevice );
    if (err == cudaSuccess)
        err = cudaMemcpy( dev_d, d, sizeof(int), cudaMemcpyHostToDevice );

    if (err == cudaSuccess) {
        // N blocks of one thread each: the kernel indexes by blockIdx.x
        add<<<N,1>>>( dev_a, dev_b, dev_c , dev_d, dev_e);
        // a kernel launch returns nothing; launch errors are reported here
        err = cudaGetLastError();
    }

    // copy the array 'c' back from the GPU to the CPU
    // (blocking copy: it also reports errors raised while the kernel ran)
    if (err == cudaSuccess)
        err = cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost );

    if (err == cudaSuccess) {
        // display the results
        for (int i=0; i<N; i++) {
            printf( "%d + %d = %d\n", a[i], b[i], c[i] );
        }
    } else {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString(err) );
    }

    // free the memory allocated on the GPU (cudaFree(NULL) is a no-op)
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    cudaFree( dev_d );
    cudaFree( dev_e );

    return (err == cudaSuccess) ? 0 : 1;

}

insmvb00 · November 29, 2010, 3:03pm

Hi!

You are launching 10 blocks of 1 thread each; this is a bit strange :)

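Furthermore, you are always writing to c[0] because threadIdx.x = 0 for each of your ten threads; each one is always thread number 0 of its block. [Note: this would only be true if the kernel indexed by threadIdx.x. The posted kernel takes its index from blockIdx.x, which runs from 0 to 9 for a <<<N,1>>> launch, so each block writes a different element of c.]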

See the vectorAdd sample in the SDK.

P.S.: you can pass ints and floats to your kernels by value; the arguments do not always have to be int* or float*.

Regards!

DA_enan · November 29, 2010, 3:56pm

You can pass a number in by value, but to get a result out you need pointers.

This is a small example which you might find useful:

#include <iostream>

// Stores the block size of the current launch (threads per block along x)
// into the single unsigned that blockDim_x points to.
// blockDim_x must point to device memory with room for one unsigned.
// Every launched thread performs the same store of the same value.
__global__ void test(unsigned *blockDim_x)
{
	blockDim_x[0] = blockDim.x;
}

// Launches the test kernel and prints the block size it reported, followed by
// the CUDA error state (numeric code and its description).
// Returns 0 on success, 1 if any CUDA call failed.
int main( int argc, char ** argv )
{
	// blockSize starts at a sentinel (1111) so a failed round trip is visible;
	// bs is NULL-initialised so cudaFree below is safe if cudaMalloc fails.
	unsigned blockSize=1111, *bs=NULL;

	cudaError_t state = cudaMalloc((void **)&bs,sizeof(unsigned));

	if (state == cudaSuccess)
	{
		test<<< 2048, 256 >>>(bs);
		// a kernel launch returns nothing; launch errors are reported here
		state = cudaGetLastError();
	}

	// Blocking copy: it also reports errors raised while the kernel ran.
	if (state == cudaSuccess)
		state = cudaMemcpy(&blockSize, bs, sizeof(unsigned), cudaMemcpyDeviceToHost);

	std::cout<<"Blocksize: "<<blockSize<<"\nError state: "<<state
	         <<" ("<<cudaGetErrorString(state)<<")"<<std::endl;

	cudaFree(bs);

	return state == cudaSuccess ? 0 : 1;
}

umod.47 · November 29, 2010, 8:46pm

See the code below. I’ve edited it and inserted comments in it (marked as “//!!”).

/************************************************************************************/

#include <stdio.h>

#define N 10

// Computes c[i] = d*a[i] + e*b[i] for i in [0, N), one element per thread.
// The scale factors d and e are plain ints passed to the kernel by value.
// Works for any 1D launch that supplies at least N threads in total
// (e.g. add<<<1,N>>>); surplus threads are masked off by the bounds check.
// a, b and c must point to device arrays of at least N ints.
__global__ void add( int *a, int *b, int *c, int d, int e) {

    // Flat global index: equals threadIdx.x for a single-block launch, and gives
    // every thread its own element when more than one block is used.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < N)
        c[tid] = d*a[tid] + e*b[tid];

}

// Host driver: computes c = d*a + e*b on the GPU for two N-element vectors and
// prints each result.
// Returns 0 on success, 1 if any CUDA call fails.
int main( void ) {

    int a[N], b[N], c[N];

    // NULL-initialised so the cudaFree calls at the end are safe after a failure.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    int d=1;
    int e=1;

    //!! int d[0] is an array of ZERO elements. You should have used int d[1],
    //!! or a plain int d passed to the kernel by value, as done here.

    printf( "%d and %d\n\n", d, e);

    // I dont know why it print other numbers? Can you explain?
    //!! Because you tried to write to unexisting element, see above.

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // allocate the memory on the GPU; each step runs only if all earlier ones succeeded
    cudaError_t err = cudaMalloc( (void**)&dev_a, N * sizeof(int) );
    if (err == cudaSuccess) err = cudaMalloc( (void**)&dev_b, N * sizeof(int) );
    if (err == cudaSuccess) err = cudaMalloc( (void**)&dev_c, N * sizeof(int) );

    // copy the arrays 'a' and 'b' to the GPU
    if (err == cudaSuccess)
        err = cudaMemcpy( dev_a, a, N * sizeof(int),cudaMemcpyHostToDevice );
    if (err == cudaSuccess)
        err = cudaMemcpy( dev_b, b, N * sizeof(int),cudaMemcpyHostToDevice );

    //!! cudaMemset(dev_c,0,N*sizeof(int));
    //!! memset is a good idea

    if (err == cudaSuccess) {
        add<<<1,N>>>( dev_a, dev_b, dev_c , d, e);
        //!! 1 block of N threads is better here than N blocks of 1 thread.
        //!! Block is better to contain a multiple of 32 threads

        // a kernel launch returns nothing; launch errors are reported here
        err = cudaGetLastError();
    }

    // copy the array 'c' back from the GPU to the CPU
    // (blocking copy: it also reports errors raised while the kernel ran)
    if (err == cudaSuccess)
        err = cudaMemcpy(c,dev_c, N * sizeof(int), cudaMemcpyDeviceToHost );

    if (err == cudaSuccess) {
        // display the results
        for (int i=0; i<N; i++)
            printf( "%d + %d = %d\n", a[i], b[i], c[i] );
        //you may skip brackets if they contain 1 line
    } else {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString(err) );
    }

    // free the memory allocated on the GPU (cudaFree(NULL) is a no-op)
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );

    return (err == cudaSuccess) ? 0 : 1;

}

Passing plain int/char/float/bool/etc. parameters by value is possible. As was said above, it’s impossible to get data back from a kernel without pointers.

paulclizana · November 30, 2010, 3:04am

Thanks Everyone;

The information provided was very helpful;

See the code below. I’ve edited it and inserted comments in it (marked as “//!!”).

/************************************************************************************/

#include <stdio.h>

#define N 10

// Computes c[i] = d*a[i] + e*b[i] for i in [0, N), one element per thread.
// The scale factors d and e are plain ints passed to the kernel by value.
// Works for any 1D launch that supplies at least N threads in total
// (e.g. add<<<1,N>>>); surplus threads are masked off by the bounds check.
// a, b and c must point to device arrays of at least N ints.
__global__ void add( int *a, int *b, int *c, int d, int e) {

    // Flat global index: equals threadIdx.x for a single-block launch, and gives
    // every thread its own element when more than one block is used.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < N)
        c[tid] = d*a[tid] + e*b[tid];

}

// Host driver: computes c = d*a + e*b on the GPU for two N-element vectors and
// prints each result.
// Returns 0 on success, 1 if any CUDA call fails.
int main( void ) {

    int a[N], b[N], c[N];

    // NULL-initialised so the cudaFree calls at the end are safe after a failure.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    int d=1;
    int e=1;

    //!! int d[0] is an array of ZERO elements. You should have used int d[1],
    //!! or a plain int d passed to the kernel by value, as done here.

    printf( "%d and %d\n\n", d, e);

    // I dont know why it print other numbers? Can you explain?
    //!! Because you tried to write to unexisting element, see above.

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // allocate the memory on the GPU; each step runs only if all earlier ones succeeded
    cudaError_t err = cudaMalloc( (void**)&dev_a, N * sizeof(int) );
    if (err == cudaSuccess) err = cudaMalloc( (void**)&dev_b, N * sizeof(int) );
    if (err == cudaSuccess) err = cudaMalloc( (void**)&dev_c, N * sizeof(int) );

    // copy the arrays 'a' and 'b' to the GPU
    if (err == cudaSuccess)
        err = cudaMemcpy( dev_a, a, N * sizeof(int),cudaMemcpyHostToDevice );
    if (err == cudaSuccess)
        err = cudaMemcpy( dev_b, b, N * sizeof(int),cudaMemcpyHostToDevice );

    //!! cudaMemset(dev_c,0,N*sizeof(int));
    //!! memset is a good idea

    if (err == cudaSuccess) {
        add<<<1,N>>>( dev_a, dev_b, dev_c , d, e);
        //!! 1 block of N threads is better here than N blocks of 1 thread.
        //!! Block is better to contain a multiple of 32 threads

        // a kernel launch returns nothing; launch errors are reported here
        err = cudaGetLastError();
    }

    // copy the array 'c' back from the GPU to the CPU
    // (blocking copy: it also reports errors raised while the kernel ran)
    if (err == cudaSuccess)
        err = cudaMemcpy(c,dev_c, N * sizeof(int), cudaMemcpyDeviceToHost );

    if (err == cudaSuccess) {
        // display the results
        for (int i=0; i<N; i++)
            printf( "%d + %d = %d\n", a[i], b[i], c[i] );
        //you may skip brackets if they contain 1 line
    } else {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString(err) );
    }

    // free the memory allocated on the GPU (cudaFree(NULL) is a no-op)
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );

    return (err == cudaSuccess) ? 0 : 1;

}

Passing plain int/char/float/bool/etc. parameters by value is possible. As was said above, it’s impossible to get data back from a kernel without pointers.

paulclizana · November 30, 2010, 3:08am

Thanks Everyone;

The information provided was very helpful;

See the code below. I’ve edited it and inserted comments in it (marked as “//!!”).

/************************************************************************************/

#include <stdio.h>

#define N 10

// Computes c[i] = d*a[i] + e*b[i] for i in [0, N), one element per thread.
// The scale factors d and e are plain ints passed to the kernel by value.
// Works for any 1D launch that supplies at least N threads in total
// (e.g. add<<<1,N>>>); surplus threads are masked off by the bounds check.
// a, b and c must point to device arrays of at least N ints.
__global__ void add( int *a, int *b, int *c, int d, int e) {

    // Flat global index: equals threadIdx.x for a single-block launch, and gives
    // every thread its own element when more than one block is used.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < N)
        c[tid] = d*a[tid] + e*b[tid];

}

// Host driver: computes c = d*a + e*b on the GPU for two N-element vectors and
// prints each result.
// Returns 0 on success, 1 if any CUDA call fails.
int main( void ) {

    int a[N], b[N], c[N];

    // NULL-initialised so the cudaFree calls at the end are safe after a failure.
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    int d=1;
    int e=1;

    //!! int d[0] is an array of ZERO elements. You should have used int d[1],
    //!! or a plain int d passed to the kernel by value, as done here.

    printf( "%d and %d\n\n", d, e);

    // I dont know why it print other numbers? Can you explain?
    //!! Because you tried to write to unexisting element, see above.

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // allocate the memory on the GPU; each step runs only if all earlier ones succeeded
    cudaError_t err = cudaMalloc( (void**)&dev_a, N * sizeof(int) );
    if (err == cudaSuccess) err = cudaMalloc( (void**)&dev_b, N * sizeof(int) );
    if (err == cudaSuccess) err = cudaMalloc( (void**)&dev_c, N * sizeof(int) );

    // copy the arrays 'a' and 'b' to the GPU
    if (err == cudaSuccess)
        err = cudaMemcpy( dev_a, a, N * sizeof(int),cudaMemcpyHostToDevice );
    if (err == cudaSuccess)
        err = cudaMemcpy( dev_b, b, N * sizeof(int),cudaMemcpyHostToDevice );

    //!! cudaMemset(dev_c,0,N*sizeof(int));
    //!! memset is a good idea

    if (err == cudaSuccess) {
        add<<<1,N>>>( dev_a, dev_b, dev_c , d, e);
        //!! 1 block of N threads is better here than N blocks of 1 thread.
        //!! Block is better to contain a multiple of 32 threads

        // a kernel launch returns nothing; launch errors are reported here
        err = cudaGetLastError();
    }

    // copy the array 'c' back from the GPU to the CPU
    // (blocking copy: it also reports errors raised while the kernel ran)
    if (err == cudaSuccess)
        err = cudaMemcpy(c,dev_c, N * sizeof(int), cudaMemcpyDeviceToHost );

    if (err == cudaSuccess) {
        // display the results
        for (int i=0; i<N; i++)
            printf( "%d + %d = %d\n", a[i], b[i], c[i] );
        //you may skip brackets if they contain 1 line
    } else {
        fprintf( stderr, "CUDA error: %s\n", cudaGetErrorString(err) );
    }

    // free the memory allocated on the GPU (cudaFree(NULL) is a no-op)
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );

    return (err == cudaSuccess) ? 0 : 1;

}

Passing plain int/char/float/bool/etc. parameters by value is possible. As was said above, it’s impossible to get data back from a kernel without pointers.

Topic		Replies	Views
Simple question on passing to the kernel CUDA Programming and Performance	15	3392	January 15, 2012
Strange memory gremlins Getting pwned by pointers CUDA Programming and Performance	9	12175	July 1, 2009
Help with strange error CUDA Programming and Performance	8	2096	February 25, 2010
Annoying problems with memory and/or syntax CUDA Programming and Performance	19	4769	April 8, 2008
problem with array offset CUDA Programming and Performance	28	3228	February 3, 2011
Number of Blocks CUDA Programming and Performance	3	1585	October 15, 2011
Newbie question about data transfer CUDA Programming and Performance	4	2701	July 25, 2008
Losing CUDA calculatons CUDA Programming and Performance	5	2321	March 21, 2011
[Beginner] Math operations giving incorrect answers CUDA Programming and Performance	3	1388	October 30, 2010
Performance issues on memory transfer CUDA Programming and Performance	13	12982	November 26, 2010

How to pass a single number to device basic programming

Related topics