I am trying to interpolate a 3D array. I am using cuda textures for its trilinear interpolation. I am not getting any error from ‘cuda-memcheck’. This code is working fine. But getting some weird answer also.
If I am passing value = 100 at (0,1,0), then its value should be same within an interpolated result.
#include <iostream>
#include <fstream>
#define NX 5
#define NY 5
#define NZ 1
#define IX 10
#define IY 10
#define IZ 2
#define ifact 2
texture<float, 3> tex;
__global__ void getInterpolatedFunctionValue(float *a, float *b, float *c, float *result) // int IX, int IY, int IZ )
{
// Calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
if ((x < IX) && (y < IY) && (z < IZ))
{
float w = a[x] , v = b[y], u = c[z] ;
result[z * IZ * IY + y * IZ + x ] = tex3D(tex, u, v, w );
}
}
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
using namespace std;
int main(){
int nx=NX, ny=NY, nz=NZ, xx =IX, yy = IY, zz = IZ;
float fff[nz][ny][nx];
float x[xx], y[yy], z[zz] ;
for(int ix=0; ix<nx; ix++)
for(int iy=0; iy<ny; iy++)
for(int iz=0; iz<nz; iz++){
fff[iz][iy][ix] = (ix + iy + iz ) * 100;
std::cout<<fff[iz][iy][ix] <<" "<< ix<<" "<< iy<<" "<< iz<<'\n' ;
}
for(int ix=0; ix<IX; ix++)
for(int iy=0; iy<IY; iy++)
for(int iz=0; iz<IZ; iz++){
x[ix] = (float) ix / 2 ;
y[iy] = (float) iy / 2 ;
z[iz] = (float) iz / 2 ;
// std::cout<< x[ix] << " "<< y[iy]<<" "<< z[iz]<< '\n' ;
}
//*************************************************************************//
float *d_x, *d_y, *d_z, *d_result, *h_result ;
cudaMalloc((void**)&d_x, IX * sizeof(float));
cudaMalloc((void**)&d_y, IY * sizeof(float));
cudaMalloc((void**)&d_z, IZ * sizeof(float));
cudaMalloc((void**)&d_result, IX * IY * IZ * sizeof(float));
cudaMemcpy( d_x, x, IX * sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy( d_y, y, IY * sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy( d_z, z, IZ * sizeof(float), cudaMemcpyHostToDevice );
cudaCheckErrors("allocating an array is failed");
//*************************************************************************//
cudaArray *d_volumeArray ;
//const cudaExtent extent = make_cudaExtent(nx, ny, nz);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_volumeArray, &channelDesc, make_cudaExtent(nz, ny, nx));
cudaCheckErrors("cudaMalloc3D error");
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void*)fff, sizeof(float)*nz,ny,nx);
copyParams.dstArray = d_volumeArray;
copyParams.dstPos = make_cudaPos(0,0,0);
copyParams.srcPos = make_cudaPos(0,0,0);
copyParams.extent = make_cudaExtent(nz, ny, nx);
copyParams.kind = cudaMemcpyHostToDevice;
cudaCheckErrors("copyParams3D fail");
cudaMemcpy3D(©Params);
cudaCheckErrors("cudaMemcpy3DParms fail");
tex.normalized = false;
tex.filterMode = cudaFilterModeLinear;
tex.addressMode[0] = cudaAddressModeClamp;
tex.addressMode[1] = cudaAddressModeClamp;
tex.addressMode[2] = cudaAddressModeClamp;
cudaBindTextureToArray(tex, d_volumeArray, channelDesc);
cudaCheckErrors("bind fail");
const dim3 blockSize(8, 8, 8 );
const dim3 gridSize(((IZ + blockSize.x )/blockSize.x),((IY + blockSize.y )/blockSize.y),((IX + blockSize.z)/blockSize.z));
printf("Blocksize.x = %i, blockSize.y = %i, blockSize.z = %i \n", blockSize.x, blockSize.y, blockSize.z);
printf("gridSize.x = %i, gridSize.y = %i, gridSize.z = %i \n", gridSize.x, gridSize.y, gridSize.z);
getInterpolatedFunctionValue<<<gridSize, blockSize>>>(d_x, d_y, d_z, d_result) ; // IX , IY , IZ ) ;
cudaCheckErrors("kernel fail");
cudaDeviceSynchronize();
h_result = (float*) malloc(IX * IY * IZ * sizeof(float)) ;
cudaMemcpy( h_result, d_result, IX * IY * IZ * sizeof(float),cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
cudaUnbindTexture(tex);
cudaCheckErrors("unbind fail");
cudaCheckErrors("cudaFree fail");
cudaFreeArray(d_volumeArray);
cudaCheckErrors("free fail");
printf("success!\n");
for(int ix=0; ix<IX; ix++)
for(int iy=0; iy<IY; iy++)
for(int iz=0; iz<IZ; iz++){
std::cout<<h_result[iz * IZ * IY + iy * IZ + ix ] << " "<< x[ix] << " "<< y[iy]<<" "<< z[iz]<< '\n' ;
}
return 0;
}
Its result after cuda-memcheck
========= CUDA-MEMCHECK
0 0 0 0
100 0 1 0
200 0 2 0
300 0 3 0
400 0 4 0
100 1 0 0
200 1 1 0
300 1 2 0
400 1 3 0
500 1 4 0
200 2 0 0
300 2 1 0
400 2 2 0
500 2 3 0
600 2 4 0
300 3 0 0
400 3 1 0
500 3 2 0
600 3 3 0
700 3 4 0
400 4 0 0
500 4 1 0
600 4 2 0
700 4 3 0
800 4 4 0
Blocksize.x = 8, blockSize.y = 8, blockSize.z = 8
gridSize.x = 1, gridSize.y = 2, gridSize.z = 2
success!
0 0 0 0
0 0 0 0.5
0 0 0.5 0
0 0 0.5 0.5
150 0 1 0
150 0 1 0.5
150 0 1.5 0
150 0 1.5 0.5
200 0 2 0
200 0 2 0.5
250 0 2.5 0
250 0 2.5 0.5
350 0 3 0
350 0 3 0.5
350 0 3.5 0
350 0 3.5 0.5
350 0 4 0
400 0 4 0.5
400 0 4.5 0
450 0 4.5 0.5
0 0.5 0 0
0 0.5 0 0.5
0 0.5 0.5 0
0 0.5 0.5 0.5
200 0.5 1 0
200 0.5 1 0.5
200 0.5 1.5 0
200 0.5 1.5 0.5
250 0.5 2 0
250 0.5 2 0.5
300 0.5 2.5 0
300 0.5 2.5 0.5
400 0.5 3 0
400 0.5 3 0.5
400 0.5 3.5 0
400 0.5 3.5 0.5
350 0.5 4 0
450 0.5 4 0.5
400 0.5 4.5 0
500 0.5 4.5 0.5
0 1 0 0
0 1 0 0.5
150 1 0.5 0
150 1 0.5 0.5
150 1 1 0
150 1 1 0.5
200 1 1.5 0
200 1 1.5 0.5
250 1 2 0
250 1 2 0.5
350 1 2.5 0
350 1 2.5 0.5
350 1 3 0
350 1 3 0.5
350 1 3.5 0
400 1 3.5 0.5
400 1 4 0
450 1 4 0.5
0 1 4.5 0
550 1 4.5 0.5
0 1.5 0 0
0 1.5 0 0.5
200 1.5 0.5 0
200 1.5 0.5 0.5
200 1.5 1 0
200 1.5 1 0.5
250 1.5 1.5 0
250 1.5 1.5 0.5
300 1.5 2 0
300 1.5 2 0.5
400 1.5 2.5 0
400 1.5 2.5 0.5
400 1.5 3 0
400 1.5 3 0.5
350 1.5 3.5 0
450 1.5 3.5 0.5
400 1.5 4 0
500 1.5 4 0.5
0 1.5 4.5 0
600 1.5 4.5 0.5
150 2 0 0
150 2 0 0.5
150 2 0.5 0
150 2 0.5 0.5
200 2 1 0
200 2 1 0.5
250 2 1.5 0
250 2 1.5 0.5
350 2 2 0
350 2 2 0.5
350 2 2.5 0
350 2 2.5 0.5
350 2 3 0
400 2 3 0.5
400 2 3.5 0
450 2 3.5 0.5
0 2 4 0
550 2 4 0.5
0 2 4.5 0
550 2 4.5 0.5
200 2.5 0 0
200 2.5 0 0.5
200 2.5 0.5 0
200 2.5 0.5 0.5
250 2.5 1 0
250 2.5 1 0.5
300 2.5 1.5 0
300 2.5 1.5 0.5
400 2.5 2 0
400 2.5 2 0.5
400 2.5 2.5 0
400 2.5 2.5 0.5
350 2.5 3 0
450 2.5 3 0.5
400 2.5 3.5 0
500 2.5 3.5 0.5
0 2.5 4 0
600 2.5 4 0.5
0 2.5 4.5 0
600 2.5 4.5 0.5
150 3 0 0
150 3 0 0.5
200 3 0.5 0
200 3 0.5 0.5
250 3 1 0
250 3 1 0.5
350 3 1.5 0
350 3 1.5 0.5
350 3 2 0
350 3 2 0.5
350 3 2.5 0
400 3 2.5 0.5
400 3 3 0
450 3 3 0.5
0 3 3.5 0
550 3 3.5 0.5
0 3 4 0
550 3 4 0.5
150 3 4.5 0
650 3 4.5 0.5
200 3.5 0 0
200 3.5 0 0.5
250 3.5 0.5 0
250 3.5 0.5 0.5
300 3.5 1 0
300 3.5 1 0.5
400 3.5 1.5 0
400 3.5 1.5 0.5
400 3.5 2 0
400 3.5 2 0.5
350 3.5 2.5 0
450 3.5 2.5 0.5
400 3.5 3 0
500 3.5 3 0.5
0 3.5 3.5 0
600 3.5 3.5 0.5
0 3.5 4 0
600 3.5 4 0.5
200 3.5 4.5 0
700 3.5 4.5 0.5
200 4 0 0
200 4 0 0.5
250 4 0.5 0
250 4 0.5 0.5
350 4 1 0
350 4 1 0.5
350 4 1.5 0
350 4 1.5 0.5
350 4 2 0
400 4 2 0.5
400 4 2.5 0
450 4 2.5 0.5
0 4 3 0
550 4 3 0.5
0 4 3.5 0
550 4 3.5 0.5
150 4 4 0
650 4 4 0.5
150 4 4.5 0
0 4 4.5 0.5
250 4.5 0 0
250 4.5 0 0.5
300 4.5 0.5 0
300 4.5 0.5 0.5
400 4.5 1 0
400 4.5 1 0.5
400 4.5 1.5 0
400 4.5 1.5 0.5
350 4.5 2 0
450 4.5 2 0.5
400 4.5 2.5 0
500 4.5 2.5 0.5
0 4.5 3 0
600 4.5 3 0.5
0 4.5 3.5 0
600 4.5 3.5 0.5
200 4.5 4 0
700 4.5 4 0.5
200 4.5 4.5 0
0 4.5 4.5 0.5
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
Why am I getting value = 0 in output?
Where is my mistake and how could find my mistake?