Recently I am studying cuda application.
But when I use cuda on 2D second partial derivative calculation, it gives result mismatch between release build and debug build.
A search said that it could be solved by using the -fmad=false option but it doesn’t work in this case.
The result of release build is same as that of debug build only if I decrease operation like tot = dxx * dxx + dyy * dyy or tot = dxx * dxx - dxy * dxy (see following code).
So I guess release build neglects some operations or may be related to parameters limitations of global function.
The sample code uses 2048x2048 2d (flatten) random data and pick only the data of a given width.
edit: size of input 2048x2048 → 256x256
Tested environments are
ubuntu 20.04 / cuda 11.2.2 / nvidia 510.68 / gcc-7, 10 / rtx2070
or
redhat 8.3 / cuda 11.2.2, 11.2.0, 11.3 / nvidia 460 (larger than 460.32) / gcc-8, 10 / v100
When I test same code with cuda 11.1.1 or 10.1, there is no difference between release and debug.
I don’t know why there is a difference.
Code:
edit: size of input and platform independent data
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <fstream>
const int width = 256;
const int thread = 128;
const int vw = 128;
const int vh = 128;
__global__ void partial_d
(float* __restrict__ out, int ow, int ox1, int oy1,
const float* __restrict__ in, int iw, int ix1, int iy1,
int vw, int vh)
{
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
if(thread_idx >= vw * vh) return;
int j = thread_idx / vw;
int i = thread_idx % vw;
int in_idx = (j + iy1) * iw + (i + ix1);
int out_idx = (j + oy1) * ow + (i + ox1);
float dxx = in[in_idx + 1] + in[in_idx - 1] - in[in_idx] * 2.0;
float dyy = in[in_idx + iw] + in[in_idx - iw] - in[in_idx] * 2.0;
float dxy = in[in_idx + 1 + iw] + in[in_idx - 1 - iw]
- in[in_idx + 1 - iw] - in[in_idx - 1 + iw];
dxy *= 0.25;
float tot = dxx * dxx + dyy * dyy - dxy * dxy;
out[out_idx] += tot;
}
int main(int argc, char** argv)
{
float *in, *out;
in = (float*) malloc(sizeof(float) * width * width);
out = (float*) malloc(sizeof(float) * width * width);
for(int i = 0; i < width * width; i++)
{
in[i] = (float) abs(i * i - width * width )/10000000.f;
out[i] = 0.f;
}
float *d_in, *d_out;
cudaMalloc((void**)&d_in, sizeof(float) * width * width);
cudaMalloc((void**)&d_out, sizeof(float) * width * width);
cudaMemcpy(d_in, in, sizeof(float) * width * width, cudaMemcpyHostToDevice);
cudaMemcpy(d_out, out, sizeof(float) * width * width, cudaMemcpyHostToDevice);
int ix1, iy1, ox1, oy1;
ix1 = 1;
iy1 = 1;
ox1 = 1;
oy1 = 1;
int block_num = (vw * vh + thread - 1) / thread;
partial_d<<<block_num, thread>>>
(d_out, width, ox1, oy1,
d_in, width, ix1, iy1,
vw, vh);
cudaDeviceSynchronize();
cudaMemcpy(in, d_in, sizeof(float) * width * width, cudaMemcpyDeviceToHost);
cudaMemcpy(out, d_out, sizeof(float) * width * width, cudaMemcpyDeviceToHost);
printf("%.28f %.28f \n",in[2062],out[2062]);
std::ofstream save(argv[1],std::ios::trunc);
save.precision(28);
for(int i=0; i < 10000; i++)
{
save << i << " " << in[i] << " " << out[i] << std::endl;
}
save.close();
cudaFree(d_in);
cudaFree(d_out);
return 0;
}
Run script:
nvcc -ccbin g++-10 -fmad=false -G test.cu -o test;
./test ~/debug.csv;
nvcc -ccbin g++-10 -fmad=false test.cu -o test;
./test ~/release.csv;
Output:
0.4186308085918426513671875000 0.0001717955601634457707405090
0.4186308085918426513671875000 0.0001717981795081868767738342