threadIdx.y question

#include <iostream>

#include <cuda.h>

#include <stdlib.h>

using namespace std;

__global__ void function(int a[], int size)

{

	if(threadIdx.y < size)

	{

		a[threadIdx.y] = threadIdx.y;

	}

}

void checkCUDAError(const char *msg){    

	cudaError_t err = cudaGetLastError();    

	

	if( cudaSuccess != err)

	{       

		fprintf(stderr, "Cuda error: %s: %s.\n", msg,   cudaGetErrorString( err) );        

		exit(EXIT_FAILURE);    

	}                         

 }

int main()

{

	const int N = 10;

	int a[N];

	int *dev_a;

	

	cudaMalloc((void**) &dev_a, N * sizeof(int));

	cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);

	

	dim3 block = (1,10,1);

	function<<<1,block>>>(dev_a,N);

	cudaThreadSynchronize();

	checkCUDAError("kernel invocation");

	

	cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);

	

	// Check for any CUDA errors    

	checkCUDAError("memcpy");

	

	for(int i=0; i < N; i++)

	{

		cout << a[i] << endl;

	}

	

	cudaFree(dev_a);

	

	cout << "Correct!" << endl;

	cin.get();

	return 0;

}

So i have the following code and the way i called it was:

dim3 block = (1,10,1);

function<<<1,block>>>(dev_a,N);

So i should have 1 block with 10 threads in the Y direction and in the kernel i try to get the threads to write their IDs to the array however wrong numbers are written there!!!

WHY??

The error function does not return anything and “Correct” is printed at the end.

I appreciate any help. If anyone could run the code on their machine and post here the ouput i would appreciate it as well.

Thanks in advance.

#include <iostream>

#include <cuda.h>

#include <stdlib.h>

using namespace std;

__global__ void function(int a[], int size)

{

	if(threadIdx.y < size)

	{

		a[threadIdx.y] = threadIdx.y;

	}

}

void checkCUDAError(const char *msg){    

	cudaError_t err = cudaGetLastError();    

	

	if( cudaSuccess != err)

	{       

		fprintf(stderr, "Cuda error: %s: %s.\n", msg,   cudaGetErrorString( err) );        

		exit(EXIT_FAILURE);    

	}                         

 }

int main()

{

	const int N = 10;

	int a[N];

	int *dev_a;

	

	cudaMalloc((void**) &dev_a, N * sizeof(int));

	cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);

	

	dim3 block = (1,10,1);

	function<<<1,block>>>(dev_a,N);

	cudaThreadSynchronize();

	checkCUDAError("kernel invocation");

	

	cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);

	

	// Check for any CUDA errors    

	checkCUDAError("memcpy");

	

	for(int i=0; i < N; i++)

	{

		cout << a[i] << endl;

	}

	

	cudaFree(dev_a);

	

	cout << "Correct!" << endl;

	cin.get();

	return 0;

}

So i have the following code and the way i called it was:

dim3 block = (1,10,1);

function<<<1,block>>>(dev_a,N);

So i should have 1 block with 10 threads in the Y direction and in the kernel i try to get the threads to write their IDs to the array however wrong numbers are written there!!!

WHY??

The error function does not return anything and “Correct” is printed at the end.

I appreciate any help. If anyone could run the code on their machine and post here the ouput i would appreciate it as well.

Thanks in advance.

Change it to:
dim3 block = dim3(1,10,1);

and it will work.

Change it to:
dim3 block = dim3(1,10,1);

and it will work.

It did work indeed. Thanks for the input. Would you mind explaining why my previous version failed?

It did work indeed. Thanks for the input. Would you mind explaining why my previous version failed?

your previous version used the comma operator, it was like writing:

int x = 1,10,1;

this really works! the comma operator when not used in function calls gives you the last element.

It works thanks