Let's say I am using two kernels: the first one generates matrix M, and the second one consumes the generated matrix. My code looks roughly like the following.
// Sketch of the host-side flow; the allocation size and the kernel launch
// configurations are elided in this excerpt.
cudaMalloc(M);
// First kernel: generate matrix M.
Generate_Matrix<<>>(M);
// Second kernel: use matrix M to calculate the result.
Consume_Matrix<<>>(M);
// Copy the results from device to host and print them.
// Free the CUDA memory.
The problem is inside the Consume_Matrix kernel, where it is just running a simple for loop to read all matrix rows and sum them up.
// Add the COL consecutive elements M[i*COL + 0] .. M[i*COL + COL-1] to result.
// i, COL, result and M come from the surrounding kernel, which is not shown here.
for(int k =0 ; k < COL; k++)
{
result += M[i*COL + k];
}
Here is the issue: I get the expected result for COL = 5000 but get result = 0 for COL = 9000.
I made sure that matrix M is populated correctly after the first kernel, for any COL value. The problem is in the second kernel call, where it is unable to perform a simple addition.
I checked for errors but there were none. It is also not a kernel synchronization problem, as I am using a GTX 280.
Can someone please help me here? I am going mad. What did I miss? It feels like something is terribly wrong with my understanding.
Let's say I am using two kernels: the first one generates matrix M, and the second one consumes the generated matrix. My code looks roughly like the following.
// Sketch of the host-side flow; the allocation size and the kernel launch
// configurations are elided in this excerpt.
cudaMalloc(M);
// First kernel: generate matrix M.
Generate_Matrix<<>>(M);
// Second kernel: use matrix M to calculate the result.
Consume_Matrix<<>>(M);
// Copy the results from device to host and print them.
// Free the CUDA memory.
The problem is inside the Consume_Matrix kernel, where it is just running a simple for loop to read all matrix rows and sum them up.
// Add the COL consecutive elements M[i*COL + 0] .. M[i*COL + COL-1] to result.
// i, COL, result and M come from the surrounding kernel, which is not shown here.
for(int k =0 ; k < COL; k++)
{
result += M[i*COL + k];
}
Here is the issue: I get the expected result for COL = 5000 but get result = 0 for COL = 9000.
I made sure that matrix M is populated correctly after the first kernel, for any COL value. The problem is in the second kernel call, where it is unable to perform a simple addition.
I checked for errors but there were none. It is also not a kernel synchronization problem, as I am using a GTX 280.
Can someone please help me here? I am going mad. What did I miss? It feels like something is terribly wrong with my understanding.
Sorry for keeping it so short. Here is the complete code for the Generate_Matrix and Consume_Matrix kernels.
// Fills M with the constant 2; each thread writes at most one element.
//
// The first index is derived from the y launch dimension and the second from
// the x launch dimension. Threads whose first index is >= DIM_X or whose
// second index is >= DIM_Y write nothing. The element offset is computed by
// idx2c(first, second, DIM_X), which is defined elsewhere in the file.
__global__ void Generate_Matrix(float * M)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the DIM_X by DIM_Y extent have nothing to do.
    if (row >= DIM_X || col >= DIM_Y)
    {
        return;
    }

    M[idx2c(row, col, DIM_X)] = 2.0f;
}
// For each index pair (i, j) with i, j < DIM_X, stores in res the sum over
// k in [0, DIM_Y) of M[idx2c(i,k,DIM_X)] * M[idx2c(j,k,DIM_X)].
//
// i is derived from the y launch dimension and j from the x launch dimension;
// each in-range thread produces exactly one element of res and runs the whole
// DIM_Y-long loop on its own. Threads outside the DIM_X by DIM_X extent write
// nothing. idx2c is defined elsewhere in the file.
// NOTE(review): if idx2c(r, c, ld) addresses element (r, c), this is
// res = M * transpose(M) — confirm against the idx2c definition.
__global__ void Consume_Matrix(float *M, float * res)
{
    const int outRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int outCol = blockIdx.x * blockDim.x + threadIdx.x;

    // Skip threads that do not map to an element of the result.
    if (outRow >= DIM_X || outCol >= DIM_X)
    {
        return;
    }

    float acc = 0.0f;
    for (int k = 0; k < DIM_Y; ++k)
    {
        const float lhs = M[idx2c(outRow, k, DIM_X)];
        const float rhs = M[idx2c(outCol, k, DIM_X)];
        acc += lhs * rhs;
    }

    res[idx2c(outRow, outCol, DIM_X)] = acc;
}
And my main method looks like the following:
// Excerpt of the host code: allocate both buffers, then launch the two kernels.
// The launch configurations (<<<>>>) are elided in this excerpt.
// NOTE(review): no return-code checks on cudaMalloc and no cudaGetLastError()
// after the launches are shown here — confirm they exist in the full program.
cudaMalloc((void**)&M, DIM_X*DIM_Y*sizeof(float)); // M matrix: DIM_X by DIM_Y floats
cudaMalloc((void**)&res, DIM_X*DIM_X*sizeof(float)); // result matrix: DIM_X by DIM_X floats
Generate_Matrix<<<>>>(M);
Consume_Matrix<<<>>>(M, res);
The issue is that the res matrix stays unchanged (all values zero) for DIM_Y = 9000. Both kernels seem to work fine for DIM_Y = 5000. (DIM_X is 2000 for all executions.)
Sorry for keeping it so short. Here is the complete code for the Generate_Matrix and Consume_Matrix kernels.
// Fills M with the constant 2; each thread writes at most one element.
//
// The first index is derived from the y launch dimension and the second from
// the x launch dimension. Threads whose first index is >= DIM_X or whose
// second index is >= DIM_Y write nothing. The element offset is computed by
// idx2c(first, second, DIM_X), which is defined elsewhere in the file.
__global__ void Generate_Matrix(float * M)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the DIM_X by DIM_Y extent have nothing to do.
    if (row >= DIM_X || col >= DIM_Y)
    {
        return;
    }

    M[idx2c(row, col, DIM_X)] = 2.0f;
}
// For each index pair (i, j) with i, j < DIM_X, stores in res the sum over
// k in [0, DIM_Y) of M[idx2c(i,k,DIM_X)] * M[idx2c(j,k,DIM_X)].
//
// i is derived from the y launch dimension and j from the x launch dimension;
// each in-range thread produces exactly one element of res and runs the whole
// DIM_Y-long loop on its own. Threads outside the DIM_X by DIM_X extent write
// nothing. idx2c is defined elsewhere in the file.
// NOTE(review): if idx2c(r, c, ld) addresses element (r, c), this is
// res = M * transpose(M) — confirm against the idx2c definition.
__global__ void Consume_Matrix(float *M, float * res)
{
    const int outRow = blockIdx.y * blockDim.y + threadIdx.y;
    const int outCol = blockIdx.x * blockDim.x + threadIdx.x;

    // Skip threads that do not map to an element of the result.
    if (outRow >= DIM_X || outCol >= DIM_X)
    {
        return;
    }

    float acc = 0.0f;
    for (int k = 0; k < DIM_Y; ++k)
    {
        const float lhs = M[idx2c(outRow, k, DIM_X)];
        const float rhs = M[idx2c(outCol, k, DIM_X)];
        acc += lhs * rhs;
    }

    res[idx2c(outRow, outCol, DIM_X)] = acc;
}
And my main method looks like the following:
// Excerpt of the host code: allocate both buffers, then launch the two kernels.
// The launch configurations (<<<>>>) are elided in this excerpt.
// NOTE(review): no return-code checks on cudaMalloc and no cudaGetLastError()
// after the launches are shown here — confirm they exist in the full program.
cudaMalloc((void**)&M, DIM_X*DIM_Y*sizeof(float)); // M matrix: DIM_X by DIM_Y floats
cudaMalloc((void**)&res, DIM_X*DIM_X*sizeof(float)); // result matrix: DIM_X by DIM_X floats
Generate_Matrix<<<>>>(M);
Consume_Matrix<<<>>>(M, res);
The issue is that the res matrix stays unchanged (all values zero) for DIM_Y = 9000. Both kernels seem to work fine for DIM_Y = 5000. (DIM_X is 2000 for all executions.)
How long does the kernel take to run? You might be triggering the watchdog timer. Check for errors after the kernel launch:
[font="Courier New"] cudaThreadSynchronize();
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
[/font]
To speed your code up, coalesce memory accesses, and reuse values read from global memory as much as possible by tiling the matrix.
How long does the kernel take to run? You might be triggering the watchdog timer. Check for errors after the kernel launch:
[font="Courier New"] cudaThreadSynchronize();
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
[/font]
To speed your code up, coalesce memory accesses, and reuse values read from global memory as much as possible by tiling the matrix.