I am trying a code for matrix multiplication. But the answer came out to be wrong. So when I checked what is the issue, I found that the kernel is not launching for some reason. What is the reason. Here is my code.
MY GPU is Quadro P2000 and CUDA version 11.2
#include <stdio.h>
#include <memory>
#include <cstring>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdlib.h>
#include <time.h>
__global__ void matmul_gpu(int *a, int *b, int *output, int nx_a, int ny_a, int nx_b, int ny_b)
{
int row = threadIdx.x + blockIdx.x*blockDim.x;
int col = threadIdx.y + blockIdx.y*blockDim.y;
printf("row :- %d, col :- %d\n",row,col);
int sum = 0;
if(row > nx_a || col > ny_a)
return;
for(int k = 0; k < ny_b; k++)
{
sum += a[row*ny_b + k] * b[col + ny_a*k];
}
output[row*ny_b + col] = sum;
}
void matmul_cpu(int *a, int *b, int *output, int nx_a, int ny_a, int nx_b, int ny_b)
{
for(int i = 0; i < nx_a; i++)
{
for(int j =0; j < ny_b; j++)
{
int sum = 0;
for(int k =0; k < nx_b ; k++)
{
sum += (a[k + nx_b*i] * b[k*ny_b + j]);
}
output[i + nx_b*j] = sum;
}
}
}
int main()
{
size_t nx_a = 256,ny_a = 256,nx_b = 256,ny_b = 256;
int *h_a,*h_b,*h_c;
int block_size = 128;
int BYTES_A,BYTES_B,BYTES_C;
BYTES_A = nx_a * ny_a * sizeof(int);
BYTES_B = ny_b * nx_b * sizeof(int);
BYTES_C = nx_a * ny_b * sizeof(int);
h_a = (int*)(malloc(BYTES_A));
h_b = (int*)(malloc(BYTES_B));
h_c = (int*)(malloc(BYTES_C));
memset(h_c,0,BYTES_C);
time_t t;
srand((unsigned)time(&t));
for(int i = 0; i < BYTES_A/sizeof(int); i++)
h_a[i] = (int)(rand() & 0x0F);
for(int i = 0; i < BYTES_B/sizeof(int); i++)
h_b[i] = (int)(rand() & 0x0F);
int *d_a,*d_b, *d_c;
cudaMalloc((int**)&d_a,BYTES_A);
cudaMalloc((int**)&d_b,BYTES_B);
cudaMalloc((int**)&d_c,BYTES_C);
cudaMemcpy(d_a,h_a,BYTES_A,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,h_b,BYTES_B,cudaMemcpyHostToDevice);
cudaMemcpy(d_c,h_c,BYTES_C,cudaMemcpyHostToDevice);
dim3 block(block_size,block_size);
dim3 grid(nx_a/block_size,ny_a/block_size);
printf("Running kernel with block size %dx%d and grid size %dx%d\n",block.x,block.y,grid.x,grid.y);
matmul_gpu<< <grid,block>> >(d_a,d_b,d_c,nx_a,ny_a,nx_b,ny_b);
cudaDeviceSynchronize();
int *gpu_result;
gpu_result = (int*)(malloc(BYTES_C));
cudaMemcpy(gpu_result,d_c,BYTES_C,cudaMemcpyDeviceToHost);
matmul_cpu(h_a,h_b,h_c,nx_a,ny_a,nx_b,ny_b);
bool wrong = false;
for(int i = 0 ; i < BYTES_C/sizeof(int); i++)
{
if(gpu_result[i] != h_c[i])
{
printf("%d %d\n",gpu_result[i],h_c[i]);
wrong = true;
break;
}
}
if(wrong)
printf("Answer is Wrong\n");
else
printf("Answer is correct\n");
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_a);
free(h_b);
free(h_c);
free(gpu_result);
//cudaDeviceReset();
return EXIT_SUCCESS;
}