CUDA code giving wrong result

mohiuddeen20 · May 4, 2020, 5:58pm

I am converting my original C code to Cuda for performance enhancement but the values of both C and Cuda code are not matching. Cuda is giving an incorrect result. There are no errors anywhere with regard to Cuda error checking. When I am launching the kernel the values are not matching with the original C code. My Code:-
#include <cuda_runtime_api.h>
#include <stdio.h>
#include
#define max 673
using real_sim = float;
// Error checking macro
void allocate_array_2d(real_sim **&pDouble, const int dim1, const int dim2) {
// Contiguous allocation of 2D arrays
// Referenced from:
// transfer dynamic 2D array memory to CUDA
// and
// [C++]Contiguous allocation of 2-D arrays - DEV Community 👩‍💻👨‍💻
// with error correction i=0 to i=1

pDouble = new real_sim * [dim1];
pDouble[0] = new real_sim[dim1 * dim2];
for (int i = 1; i < dim1; i++) pDouble[i] = pDouble[i-1] + dim2;

for (int i = 0; i < dim1; i++) {
for (int j = 0; j < dim2;j++) {
pDouble[i][j] = 0;
}
}
}
#define cudaCheckError(code)
{
if ((code) != cudaSuccess) {
fprintf(stderr, “Cuda failure %s:%d: ‘%s’ \n”, FILE, LINE,
cudaGetErrorString(code));
}
}

global void kernel_1d(floatda,floatdb,floatdc)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int of = x * max + y;
int z = 0;
if (x < max && x>0 && y < max && y>0) {
int a = 0;
int b = 0;
// dc[of] = 2db[x * max + y ] * da[(x ) * max + y];
if(x<10) a *= db[x * max + y] * da[(x)max + y] + da[of];
a= db[x * max + y] * da[(x)*max + y] + da[of];
b+= db[x * max + y] * da[(x)*max + y] - da[of];

   db[(x ) * max + y] = db[x * max + y ]* da[(x)*max + y] + a;
   dc[(x)*max + y] += 2*db[x * max + y] /b;

   // printf("block %d, thread %d, index (%d, %d)\n", blockIdx.x, threadIdx.x, x,

}
__syncthreads();
}

int main()

{
real_sim** a;
real_sim** b;
real_sim** c;
real_sim** cpu;//host array check device ouput…

allocate_array_2d(a,max,max);//input array
allocate_array_2d(b,max,max);//input array
allocate_array_2d(c,max,max);//output array for device code
allocate_array_2d(cpu, max, max);//output array for C code

for (int i = 0; i < max; i++) {
    for (int j = 0; j < max; j++) {
        a[i][j] = j+1;
        b[i][j] = j+1;
        c[i][j] = 0;
        cpu[i][j] = 0;
    }
}


float* da;
float* db;
float* dc;
int pixel_count = max * max;

cudaCheckError(cudaMalloc(&da, pixel_count * sizeof(float)));
cudaCheckError(cudaMalloc(&db, pixel_count * sizeof(float)));
cudaCheckError(cudaMalloc(&dc, pixel_count * sizeof(float)));

std::cout.precision(17);

for (int i = 1; i < max; i++) {
    std::cout << "  " << i  << "\n";

    cudaCheckError(cudaMemcpy(da, a[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
    cudaCheckError(cudaMemcpy(db, b[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
    cudaCheckError(cudaMemcpy(dc, c[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
    //  calculation for C code.........................
    dim3 thread(32, 32);
    dim3 block((max+31)/32, (max + 31) / 32);
    kernel_1d << <block, thread >> > (da, db, dc);
    cudaCheckError(cudaDeviceSynchronize());
    // copy value to host for result comparison...........
    cudaCheckError(cudaMemcpy(c[0], dc, pixel_count * sizeof(float), cudaMemcpyDeviceToHost));


    //  calculation for C code.........................

    for (int i = 1; i < max; i++) {
        for (int j = 1; j < max; j++) {
            int a1 = 0, b1 = 0;
            if (i < 10) a1 *= b[i][j] * a[i][j] + a[i][j];
            a1 *= b[i][j] * a[i][j] + a[i][j];
            b1 += b[i][j] * a[i][j] - a[i][j];

            b[i][j] = b[i][j] * a[i][j] + a1;
            cpu[i][j] += 2 * b[i][j] / b1;
        }
    }

    ///Compairing CPU and GPU code results..........................

    for (int i = 0; i < max; i++) {
        for (int j = 0; j < max; j++) {
       

            if (c[i][j] != cpu[i][j]){
                std::cout << "BREAK DUE TO MIS MATCH OF GPU AND CPU RESULTS" << "\n";
                std::cout <<"i= "<< i << " j=  " << j << "\n";//index
                std::cout <<"c[i][j]="<< c[i][j] << "  cpu[i][j]= " << cpu[i][j] << "\n";//values
               
                exit(0);//exiting when output do not match..............
            }
        }
    }
    std::cout << "\nSUCESS" << "\n";
}

}

Topic		Replies	Views
Cuda giving wrong result CUDA Programming and Performance cuda	1	303	May 4, 2020
Cuda giving wrong result CUDA Programming and Performance cuda , kernel	2	643	May 3, 2020
Inconsistent Cuda Errors CUDA Programming and Performance	10	1059	July 7, 2017
Finding max in array CUDA Programming and Performance	15	41913	November 26, 2017
CUDA Program Issue CUDA Programming and Performance cuda	19	145	September 20, 2024
Incosistent results - can't explain CUDA Programming and Performance	18	3062	May 10, 2010
GPU/CPU precision comparison and Kernel instructions question CUDA Programming and Performance	5	677	April 4, 2017
The kernel isn't working CUDA Programming and Performance	9	1050	January 19, 2011
How to realise 3 dimension calculation? CUDA Programming and Performance	9	245	August 28, 2023
[Beginner] Math operations giving incorrect answers CUDA Programming and Performance	3	1387	October 30, 2010

CUDA code giving wrong result

Related topics