CUDA code giving wrong result

I am converting my original C code to Cuda for performance enhancement but the values of both C and Cuda code are not matching. Cuda is giving an incorrect result. There are no errors anywhere with regard to Cuda error checking. When I am launching the kernel the values are not matching with the original C code. My Code:-
#include <cuda_runtime_api.h>
#include <stdio.h>
#include
#define max 673
using real_sim = float;
// Error checking macro
void allocate_array_2d(real_sim **&pDouble, const int dim1, const int dim2) {
// Contiguous allocation of 2D arrays
// Referenced from:
// http://www.trevorsimonton.com/blog/2016/11/16/transfer-2d-array-memory-to-cuda.html
// and
// https://dev.to/drakargx/c-contiguous-allocation-of-2-d-arrays-446m
// with error correction i=0 to i=1

pDouble = new real_sim * [dim1];
pDouble[0] = new real_sim[dim1 * dim2];
for (int i = 1; i < dim1; i++) pDouble[i] = pDouble[i-1] + dim2;

for (int i = 0; i < dim1; i++) {
for (int j = 0; j < dim2;j++) {
pDouble[i][j] = 0;
}
}
}
#define cudaCheckError(code)
{
if ((code) != cudaSuccess) {
fprintf(stderr, “Cuda failure %s:%d: ‘%s’ \n”, FILE, LINE,
cudaGetErrorString(code));
}
}

global void kernel_1d(floatda,floatdb,floatdc)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int of = x * max + y;
int z = 0;
if (x < max && x>0 && y < max && y>0) {
int a = 0;
int b = 0;
// dc[of] = 2
db[x * max + y ] * da[(x ) * max + y];
if(x<10) a *= db[x * max + y] * da[(x)max + y] + da[of];
a
= db[x * max + y] * da[(x)*max + y] + da[of];
b+= db[x * max + y] * da[(x)*max + y] - da[of];

   db[(x ) * max + y] = db[x * max + y ]* da[(x)*max + y] + a;
   dc[(x)*max + y] += 2*db[x * max + y] /b;

   // printf("block %d, thread %d, index (%d, %d)\n", blockIdx.x, threadIdx.x, x,

}
__syncthreads();
}

int main()

{
real_sim** a;
real_sim** b;
real_sim** c;
real_sim** cpu;//host array check device ouput…

allocate_array_2d(a,max,max);//input array
allocate_array_2d(b,max,max);//input array
allocate_array_2d(c,max,max);//output array for device code
allocate_array_2d(cpu, max, max);//output array for C code

for (int i = 0; i < max; i++) {
    for (int j = 0; j < max; j++) {
        a[i][j] = j+1;
        b[i][j] = j+1;
        c[i][j] = 0;
        cpu[i][j] = 0;
    }
}


float* da;
float* db;
float* dc;
int pixel_count = max * max;

cudaCheckError(cudaMalloc(&da, pixel_count * sizeof(float)));
cudaCheckError(cudaMalloc(&db, pixel_count * sizeof(float)));
cudaCheckError(cudaMalloc(&dc, pixel_count * sizeof(float)));

std::cout.precision(17);

for (int i = 1; i < max; i++) {
    std::cout << "  " << i  << "\n";

    cudaCheckError(cudaMemcpy(da, a[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
    cudaCheckError(cudaMemcpy(db, b[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
    cudaCheckError(cudaMemcpy(dc, c[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
    //  calculation for C code.........................
    dim3 thread(32, 32);
    dim3 block((max+31)/32, (max + 31) / 32);
    kernel_1d << <block, thread >> > (da, db, dc);
    cudaCheckError(cudaDeviceSynchronize());
    // copy value to host for result comparison...........
    cudaCheckError(cudaMemcpy(c[0], dc, pixel_count * sizeof(float), cudaMemcpyDeviceToHost));


    //  calculation for C code.........................

    for (int i = 1; i < max; i++) {
        for (int j = 1; j < max; j++) {
            int a1 = 0, b1 = 0;
            if (i < 10) a1 *= b[i][j] * a[i][j] + a[i][j];
            a1 *= b[i][j] * a[i][j] + a[i][j];
            b1 += b[i][j] * a[i][j] - a[i][j];

            b[i][j] = b[i][j] * a[i][j] + a1;
            cpu[i][j] += 2 * b[i][j] / b1;
        }
    }

    ///Compairing CPU and GPU code results..........................

    for (int i = 0; i < max; i++) {
        for (int j = 0; j < max; j++) {
       

            if (c[i][j] != cpu[i][j]){
                std::cout << "BREAK DUE TO MIS MATCH OF GPU AND CPU RESULTS" << "\n";
                std::cout <<"i= "<< i << " j=  " << j << "\n";//index
                std::cout <<"c[i][j]="<< c[i][j] << "  cpu[i][j]= " << cpu[i][j] << "\n";//values
               
                exit(0);//exiting when output do not match..............
            }
        }
    }
    std::cout << "\nSUCESS" << "\n";
}

}