I am generating a 3d array with the dimension(nz, ny, nx) and 1d arrays along 3 dimensions x- axis[nx], y-axis[ny] and z-axis[nz]. Then I am using texture memory to interpolate it.
But I am not getting about how kernel failed?
Here the following code what I am trying to do
#include <iostream>
#include <fstream>
#define NX 50
#define NY 50
#define NZ 10
#define ifact 2
texture<float, 3> tex;
__global__ void getInterpolatedFunctionValue(float *a, float *b, float *c, float *result, int nx, int ny, int nz)
{
// Calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
if ((x < nx) && (y < ny) && (z < nz))
{
printf("hello \n") ;
float w = a[x] + 0.5f, v = b[y] + 0.5f, u = c[z] + 0.5f ;
result[z * nz * ny + y * nz + x ] = tex3D(tex, u, v, w );
}
}
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
using namespace std;
int main(){
int nx=NX, ny=NY, nz=NZ;
float fff[nz][ny][nx];
float x[nx], y[ny], z[nz] ;
for(int ix=0; ix<nx; ix++)
for(int iy=0; iy<ny; iy++)
for(int iz=0; iz<nz; iz++){
x[ix] = ix / ifact ;
y[iz] = iy / ifact ;
z[iz] = iz / ifact ;
fff[iz][iy][ix] = sin(ix/(float)10)*cos(iy/(float)10)+iz;
}
float *d_x, *d_y, *d_z, *d_result, *h_result ;
cudaMalloc((void**)&d_x, nx * sizeof(float));
cudaMalloc((void**)&d_y, ny * sizeof(float));
cudaMalloc((void**)&d_z, nz * sizeof(float));
cudaMalloc((void**)&d_result, nx * ny * nz * sizeof(float));
cudaMemcpy( d_x, x, nx * sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy( d_y, y, ny * sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy( d_z, z, nz * sizeof(float), cudaMemcpyHostToDevice );
cudaCheckErrors("allocating an array is failed");
cudaArray *d_volumeArray ;
//const cudaExtent extent = make_cudaExtent(nx, ny, nz);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_volumeArray, &channelDesc, make_cudaExtent(nz, ny, nx));
cudaCheckErrors("cudaMalloc3D error");
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void*)fff, sizeof(float)*nz,ny,nx);
copyParams.dstArray = d_volumeArray;
copyParams.dstPos = make_cudaPos(0,0,0);
copyParams.srcPos = make_cudaPos(0,0,0);
copyParams.extent = make_cudaExtent(nz, ny, nx);
copyParams.kind = cudaMemcpyHostToDevice;
cudaCheckErrors("copyParams3D fail");
cudaMemcpy3D(©Params);
cudaCheckErrors("cudaMemcpy3DParms fail");
tex.normalized = false;
tex.filterMode = cudaFilterModeLinear;
tex.addressMode[0] = cudaAddressModeClamp;
tex.addressMode[1] = cudaAddressModeClamp;
tex.addressMode[2] = cudaAddressModeClamp;
cudaBindTextureToArray(tex, d_volumeArray, channelDesc);
cudaCheckErrors("bind fail");
const dim3 blockSize(32, 8, 8 );
const dim3 gridSize(((nz + blockSize.z )/blockSize.z),((ny + blockSize.y )/blockSize.y),((nx + blockSize.x)/blockSize.x));
getInterpolatedFunctionValue<<<gridSize, blockSize>>>(d_x, d_y, d_z, d_result, nx, ny, nz ) ;
cudaCheckErrors("kernel fail");
cudaDeviceSynchronize();
cudaMemcpy( h_result, d_result, nx * ny * nz * sizeof(float),cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy fail");
printf("success!\n");
cudaUnbindTexture(tex);
cudaCheckErrors("unbind fail");
cudaFreeArray(d_volumeArray);
cudaCheckErrors("free fail");
return 0;
}
Is there any thread issue, or any else?