Simple question on passing an array to the kernel

Hey. I have been playing with CUDA for a while now and everything has been working, but today I am running into issues with even the simplest programs.

Could you guys verify for me why the following program prints rubbish to the console?

The idea of the program: I pass an array to the kernel. In the kernel, each thread writes its threadIdx.x and threadIdx.y into the array, then I copy the array back and print its contents. However, when I print it I get ridiculous numbers like -2343243234.

The call uses 1 block of 9 x 3 threads, so the array is of size 2 * 9 * 3.

Any help or explanation is appreciated. Thanks.

#include <iostream>

using namespace std;

__global__ void see_y_threads(int a[],int size)

{

	int tx = threadIdx.x;

	int ty = threadIdx.y;

	

	int tid = tx * size + ty;

	

	a[tid] = threadIdx.x;

	a[tid + 27] = threadIdx.y;

	

}

int main()

{

	

	int* dev_a;

	const int N = 27;

	int a[2 * N];

	

	

	

	//zero out the array

	for(int i=0; i < 2*N; i++) a[i] = 0;

	

	cudaMalloc((void**) &dev_a, 2 * N * sizeof(int));

	cudaMemcpy(dev_a,a, 2 * N * sizeof(int), cudaMemcpyHostToDevice);

	

	

	

	//call

	dim3 block(9,3,1);

	see_y_threads<<<1, block>>>(a,9);

	

	cudaMemcpy(a, dev_a, 2 * N * sizeof(int), cudaMemcpyDeviceToHost);

	

	for(int i=0; i < N; i++)

	{

		cout << a[i] << "\t" << a[i+N] << endl;                                        // <-- printing rubbish

	}

	

	cudaFree(dev_a);

	

	cin.get();

	return 0;

}

You are passing the host array a to the kernel instead of the device pointer. The launch should be:

see_y_threads<<<1, block>>>(dev_a,9);

Always check the error codes returned by CUDA calls.

Hey. First of all, thanks for catching the error. However, what do you mean by the error codes returned? How do I check them?

And my other question: why is this code producing the wrong output? It should display:

0 0

1 0

2 0

3 0

0 1

1 1 etc etc

However, it's producing mostly 0s, i.e.

0 0

0 0

0 0

0 0 etc

#include <iostream>

using namespace std;

__global__ void see_y_threads(int a[],int size)

{

	int tx = threadIdx.x;

	int ty = threadIdx.y;

	

	int tid = tx * size + ty;

	

	a[tid] = threadIdx.x;

	a[tid + 27] = threadIdx.y;

	

}

int main()

{

	

	int* dev_a;

	const int N = 27;

	int a[2 * N];

	

	

	

	//zero out the array

	for(int i=0; i < 2*N; i++) a[i] = 0;

	

	cudaMalloc((void**) &dev_a, 2 * N * sizeof(int));

	cudaMemcpy(dev_a,a, 2 * N * sizeof(int), cudaMemcpyHostToDevice);

	

	

	

	//call

	dim3 block(9,3,1);

	see_y_threads<<<1, block>>>(dev_a,9);

	

	cudaMemcpy(a, dev_a, 2 * N * sizeof(int), cudaMemcpyDeviceToHost);

	

	for(int i=0; i < N; i++)

	{

		cout << a[i] << "\t" << a[i+N] <<endl;

	}

	

	cudaFree(dev_a);

	

	cin.get();

	return 0;

}

threadIdx provides the ID numbers of a thread within a block. Assuming the kernel is launching (which you only know if you check the return codes from the CUDA functions), the reason you see all zeros is that you are launching the kernel with a configuration of 1 thread per block and 27 blocks. With only 1 thread per block, threadIdx.x and threadIdx.y will be 0 for every thread.
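To illustrate the difference, here is a quick standalone sketch (my own example, assuming a card that supports printf from device code, i.e. compute capability 2.0 or newer) showing how threadIdx and blockIdx behave under the two launch shapes:

#include <cstdio>

__global__ void show_ids()
{
	printf("block %d, thread (%d,%d)\n", (int)blockIdx.x, (int)threadIdx.x, (int)threadIdx.y);
}

int main()
{
	// 27 blocks of 1 thread each: threadIdx.x and threadIdx.y are 0 in every
	// thread, only blockIdx.x varies from 0 to 26
	show_ids<<<27, 1>>>();
	cudaDeviceSynchronize();

	// 1 block of 9 x 3 threads: threadIdx.x runs 0..8 and threadIdx.y runs 0..2
	dim3 block(9, 3, 1);
	show_ids<<<1, block>>>();
	cudaDeviceSynchronize();

	return 0;
}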

Well, I'm launching it in this way:

dim3 block(9,3,1);
see_y_threads<<<1, block>>>(dev_a,9);

which means I have just 1 block and within it 9 x 3 threads, so the output should be different.

The index calculation is transposed. With size = 9 and threadIdx.x going up to 8, tx * size + ty reaches 74, so a[tid + 27] writes well past the end of your 54-element array and the values you actually want end up scattered. It should be:

int tid = ty * size + tx;
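For reference, the kernel with that change applied would look roughly like this (the hard-coded 27 is just 9 * 3, the number of threads in the block; you could also compute it from size and blockDim.y):

__global__ void see_y_threads(int a[], int size)
{
	int tx = threadIdx.x;
	int ty = threadIdx.y;

	// row-major index within the 9 x 3 block: ranges over 0..26
	int tid = ty * size + tx;

	a[tid] = tx;        // first 27 entries hold threadIdx.x
	a[tid + 27] = ty;   // next 27 entries hold threadIdx.y
}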

And you can check return codes for errors like this (the macro uses printf, so you will also need to #include <cstdio>):

#define CUDA_CALL(x) {cudaError_t cuda_error__ = (x); if (cuda_error__) printf(#x " returned \"%s\"\n", cudaGetErrorString(cuda_error__));}

CUDA_CALL(cudaMalloc((void**) &dev_a, 2 * N * sizeof(int)));
CUDA_CALL(cudaMemcpy(dev_a, a, 2 * N * sizeof(int), cudaMemcpyHostToDevice));

see_y_threads<<<1, block>>>(dev_a,9);

CUDA_CALL(cudaMemcpy(a, dev_a, 2 * N * sizeof(int), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(dev_a));
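Note that the kernel launch itself does not return a cudaError_t, so to catch launch failures (for example a bad launch configuration) you can check right after the launch, using the same macro:

see_y_threads<<<1, block>>>(dev_a,9);
CUDA_CALL(cudaGetLastError());        // reports errors from the launch itself
CUDA_CALL(cudaDeviceSynchronize());   // waits for the kernel and reports errors raised while it ran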

Oops, never mind. I flipped the order of block and grid. Ignore my message!
