I am converting my original C code to Cuda for performance enhancement but the values of both C and Cuda code are not matching. Cuda is giving an incorrect result. There are no errors anywhere with regard to Cuda error checking. When I am launching the kernel the values are not matching with the original C code. My Code:-
#include <cuda_runtime_api.h>
#include <stdio.h>
#include
#define max 673
using real_sim = float;
// Error checking macro
void allocate_array_2d(real_sim **&pDouble, const int dim1, const int dim2) {
// Contiguous allocation of 2D arrays
// Referenced from:
// transfer dynamic 2D array memory to CUDA
// and
// [C++]Contiguous allocation of 2-D arrays - DEV Community 👩💻👨💻
// with error correction i=0 to i=1
pDouble = new real_sim * [dim1];
pDouble[0] = new real_sim[dim1 * dim2];
for (int i = 1; i < dim1; i++) pDouble[i] = pDouble[i-1] + dim2;
for (int i = 0; i < dim1; i++) {
for (int j = 0; j < dim2;j++) {
pDouble[i][j] = 0;
}
}
}
#define cudaCheckError(code)
{
if ((code) != cudaSuccess) {
fprintf(stderr, “Cuda failure %s:%d: ‘%s’ \n”, FILE, LINE,
cudaGetErrorString(code));
}
}
global void kernel_1d(floatda,floatdb,floatdc)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int of = x * max + y;
int z = 0;
if (x < max && x>0 && y < max && y>0) {
int a = 0;
int b = 0;
// dc[of] = 2db[x * max + y ] * da[(x ) * max + y];
if(x<10) a *= db[x * max + y] * da[(x)max + y] + da[of];
a= db[x * max + y] * da[(x)*max + y] + da[of];
b+= db[x * max + y] * da[(x)*max + y] - da[of];
db[(x ) * max + y] = db[x * max + y ]* da[(x)*max + y] + a;
dc[(x)*max + y] += 2*db[x * max + y] /b;
// printf("block %d, thread %d, index (%d, %d)\n", blockIdx.x, threadIdx.x, x,
}
__syncthreads();
}
int main()
{
real_sim** a;
real_sim** b;
real_sim** c;
real_sim** cpu;//host array check device ouput…
allocate_array_2d(a,max,max);//input array
allocate_array_2d(b,max,max);//input array
allocate_array_2d(c,max,max);//output array for device code
allocate_array_2d(cpu, max, max);//output array for C code
for (int i = 0; i < max; i++) {
for (int j = 0; j < max; j++) {
a[i][j] = j+1;
b[i][j] = j+1;
c[i][j] = 0;
cpu[i][j] = 0;
}
}
float* da;
float* db;
float* dc;
int pixel_count = max * max;
cudaCheckError(cudaMalloc(&da, pixel_count * sizeof(float)));
cudaCheckError(cudaMalloc(&db, pixel_count * sizeof(float)));
cudaCheckError(cudaMalloc(&dc, pixel_count * sizeof(float)));
std::cout.precision(17);
for (int i = 1; i < max; i++) {
std::cout << " " << i << "\n";
cudaCheckError(cudaMemcpy(da, a[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
cudaCheckError(cudaMemcpy(db, b[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
cudaCheckError(cudaMemcpy(dc, c[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
// calculation for C code.........................
dim3 thread(32, 32);
dim3 block((max+31)/32, (max + 31) / 32);
kernel_1d << <block, thread >> > (da, db, dc);
cudaCheckError(cudaDeviceSynchronize());
// copy value to host for result comparison...........
cudaCheckError(cudaMemcpy(c[0], dc, pixel_count * sizeof(float), cudaMemcpyDeviceToHost));
// calculation for C code.........................
for (int i = 1; i < max; i++) {
for (int j = 1; j < max; j++) {
int a1 = 0, b1 = 0;
if (i < 10) a1 *= b[i][j] * a[i][j] + a[i][j];
a1 *= b[i][j] * a[i][j] + a[i][j];
b1 += b[i][j] * a[i][j] - a[i][j];
b[i][j] = b[i][j] * a[i][j] + a1;
cpu[i][j] += 2 * b[i][j] / b1;
}
}
///Compairing CPU and GPU code results..........................
for (int i = 0; i < max; i++) {
for (int j = 0; j < max; j++) {
if (c[i][j] != cpu[i][j]){
std::cout << "BREAK DUE TO MIS MATCH OF GPU AND CPU RESULTS" << "\n";
std::cout <<"i= "<< i << " j= " << j << "\n";//index
std::cout <<"c[i][j]="<< c[i][j] << " cpu[i][j]= " << cpu[i][j] << "\n";//values
exit(0);//exiting when output do not match..............
}
}
}
std::cout << "\nSUCESS" << "\n";
}
}