Strange Behaviour for multiple kernel calls

Let's say I am using two kernels: the first one generates a matrix M, and the second one consumes the generated matrix. So my code looks roughly like the following.

cudaMalloc(M);

// Generate matrix M
Generate_Matrix<<<...>>>(M);

// Use matrix M to calculate the result
Consume_Matrix<<<...>>>(M);

// Memcpy the results from device to host and print them

// cudaFree the device memory

The problem is inside the Consume_Matrix kernel, which just runs a simple for loop to read an entire row of the matrix and sum it up.

for (int k = 0; k < COL; k++)
{
    result += M[i*COL + k];
}

Here is the issue: I get the expected result for COL = 5000, but result = 0 for COL = 9000.

I made sure that matrix M is populated correctly after the first kernel for any COL value. The problem is in the second kernel call, where it is unable to perform this simple addition.

I checked for errors but there were none. It is also not a kernel synchronization problem: I am using a GTX 280, which has no concurrent kernel execution, so the two launches run one after the other.

Can someone please help me here? I am going mad. What did I miss? It feels like something is terribly wrong with my understanding.
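
For reference, the check I run after each launch looks roughly like this (grid and block are placeholders for the actual launch configuration, and cudaThreadSynchronize() is the pre-CUDA-4.0 spelling of cudaDeviceSynchronize()):

Generate_Matrix<<<grid, block>>>(M);
// force the kernel to finish so any asynchronous error is reported here
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("Generate_Matrix failed: %s\n", cudaGetErrorString(err));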

You haven’t shown enough code to be useful.

Sorry for making it short. Here is the complete code for the Generate_Matrix and Consume_Matrix kernels.

__global__ void Generate_Matrix(float *M)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;   // row index into M
    int j = blockIdx.x * blockDim.x + threadIdx.x;   // column index into M

    if (i < DIM_X && j < DIM_Y)
    {
        M[idx2c(i, j, DIM_X)] = 2;   // idx2c maps (row, column) to a linear index
    }
}

__global__ void Consume_Matrix(float *M, float *res)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;   // row of res
    int j = blockIdx.x * blockDim.x + threadIdx.x;   // column of res

    if (i < DIM_X && j < DIM_X)
    {
        // res(i, j) = dot product of row i and row j of M
        float sum = 0;
        for (int k = 0; k < DIM_Y; k++)
        {
            sum += M[idx2c(i, k, DIM_X)] * M[idx2c(j, k, DIM_X)];
        }
        res[idx2c(i, j, DIM_X)] = sum;
    }
}

And my main method looks like the following:

cudaMalloc((void**)&M,   DIM_X * DIM_Y * sizeof(float));   // DIM_X by DIM_Y matrix M
cudaMalloc((void**)&res, DIM_X * DIM_X * sizeof(float));   // DIM_X by DIM_X result matrix

Generate_Matrix<<<...>>>(M);
Consume_Matrix<<<...>>>(M, res);

The issue is that the res matrix stays unchanged (all values zero) for DIM_Y = 9000. Both kernels seem to work fine for DIM_Y = 5000. (DIM_X is 2000 for every run.)

Could you please help with this?
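
The launch configuration got stripped by the forum markup. For completeness, a typical setup for these kernels would be something like the following (the 16x16 block size and the grid math are illustrative, not necessarily the exact values used):

dim3 block(16, 16);                                  // 16x16 threads per block (example)

// Generate_Matrix: i runs over DIM_X rows (y direction), j over DIM_Y columns (x direction)
dim3 gridGen((DIM_Y + block.x - 1) / block.x,
             (DIM_X + block.y - 1) / block.y);
Generate_Matrix<<<gridGen, block>>>(M);

// Consume_Matrix: both i and j run over DIM_X, covering the DIM_X by DIM_X result
dim3 gridCon((DIM_X + block.x - 1) / block.x,
             (DIM_X + block.y - 1) / block.y);
Consume_Matrix<<<gridCon, block>>>(M, res);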

How long does the kernel take to run? You might be triggering the watchdog timer. Check for errors after the kernel launch:

cudaThreadSynchronize();
printf("%s\n", cudaGetErrorString(cudaGetLastError()));

To speed your code up, coalesce memory accesses, and reuse values read from global memory as much as possible by tiling the matrix.
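
Roughly along these lines; this is an untested sketch, and it assumes idx2c(i, j, ld) expands to the usual column-major (i) + (j)*(ld). Each block computes one TILE x TILE tile of res, so every element brought in from global memory is reused TILE times:

#define TILE 16

__global__ void Consume_Matrix_Tiled(const float *M, float *res)
{
    // +1 padding avoids shared-memory bank conflicts on the column reads below
    __shared__ float Mi[TILE][TILE + 1];
    __shared__ float Mj[TILE][TILE + 1];

    int i = blockIdx.x * TILE + threadIdx.x;   // row of res
    int j = blockIdx.y * TILE + threadIdx.y;   // column of res

    float sum = 0.0f;

    for (int k0 = 0; k0 < DIM_Y; k0 += TILE)
    {
        // Cooperative loads: consecutive threadIdx.x reads consecutive rows of
        // the column-major matrix, so these accesses are coalesced.
        int k  = k0 + threadIdx.y;
        int ri = blockIdx.x * TILE + threadIdx.x;
        int rj = blockIdx.y * TILE + threadIdx.x;

        Mi[threadIdx.x][threadIdx.y] =
            (ri < DIM_X && k < DIM_Y) ? M[idx2c(ri, k, DIM_X)] : 0.0f;
        Mj[threadIdx.x][threadIdx.y] =
            (rj < DIM_X && k < DIM_Y) ? M[idx2c(rj, k, DIM_X)] : 0.0f;

        __syncthreads();

        // Partial dot product of row i and row j over this k-slice
        for (int kk = 0; kk < TILE; kk++)
            sum += Mi[threadIdx.x][kk] * Mj[threadIdx.y][kk];

        __syncthreads();
    }

    if (i < DIM_X && j < DIM_X)
        res[idx2c(i, j, DIM_X)] = sum;
}

Launch it with dim3 block(TILE, TILE) and dim3 grid((DIM_X + TILE - 1) / TILE, (DIM_X + TILE - 1) / TILE).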

Are your cudaMalloc calls successful for the large matrices?
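
A quick check like this would tell you (error handling kept minimal):

cudaError_t err = cudaMalloc((void**)&M, DIM_X * DIM_Y * sizeof(float));
if (err != cudaSuccess)
    printf("cudaMalloc(M) failed: %s\n", cudaGetErrorString(err));

err = cudaMalloc((void**)&res, DIM_X * DIM_X * sizeof(float));
if (err != cudaSuccess)
    printf("cudaMalloc(res) failed: %s\n", cudaGetErrorString(err));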

Yeah, I'd definitely check the cudaMalloc and cudaMemcpy calls to make sure that they're all successful for all values of DIM_Y.
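
The copy back can be checked the same way (host_res here is just a placeholder name for the host-side buffer):

cudaError_t err = cudaMemcpy(host_res, res,
                             DIM_X * DIM_X * sizeof(float),
                             cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
    printf("cudaMemcpy(res) failed: %s\n", cudaGetErrorString(err));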
