tex3D<float>... not able to to calculate from a code

I am having a 3d data with the size of (5,4,2) which I want to interpolate to make it (10,8,4) size. I am using

tex3D

to interpolate it and further extracting the values into an array.
But I am facing some issue

  1. able to transfer 1-dimensional data from host to device, but unable to do with 3d data while using cudaMemcpy3D, cudaMalloc3DArray.
  2. If I printing the values of the output of tex3D within __global__ kernel, printing only 00 (ZERO)
  3. From Cuda-Memcheck and Cuda-gdb, No Error is throwing.
  4. How to debug it?
  5. If I am redefining the values as such, width = 75, height = 145, depth = 18, n_width = 87, n_height = 288, n_depth = 150, then I am getting error. How could I get the access such large datasets? I know, I have to access it on multi-GPU, but I am not getting it. How could I divide the task for 3D - texture Memory?

My code is

#include<stdio.h>
#include<cuda.h>

#define width 5
#define height 4
#define depth 2
#define n_width 10
#define n_height 8
#define n_depth 4
#define SIZE (width * height * depth)
#define N_SIZE (n_depth * n_height * n_width)
#define theta 30

// for cuda error checking
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            return 1; \
        } \
    } while (0)

// Simple transformation kernel
__global__  void transformKernel(float* output,float* d_new_lon,float* d_new_lat, float* d_new_lvl,
cudaTextureObject_t texObj)
{
// Calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;

//printf("kernel value is x= %d\n", x);
//printf("kernel value is y= %d\n", y);
if ((x < n_width) && (y < n_height) && (z < n_depth))
{
float u = d_new_lon[x];
float v = d_new_lat[y];
float w = d_new_lvl[z];

// Transform coordinates

float tu = u ;	   	// longitude is increasing //
float tv = v ;		// latitude is decreasing  //
float tw = w ;	    // level is decreasing     //

// Read from texture and write to global memory
output[z * n_width * n_height + y * n_width + x] = tex3D<float>(texObj, tu, tv, tw);

printf("texObj=%f,tu=%f,tv=%f,tw=%f\n",texObj,tu,tv,tw);
printf("output = %f, width =%d, height = %d, depth = %d\n",output[z * n_width * n_height + y * n_width + x], x, y, z);
printf("d_new_lon = %f\n", d_new_lon[x] );
}
}

// Host code
int main()
{
float h_data[width][height][depth];
float d_data[n_width][n_height][n_depth];
float h_new_lon[n_width], out_new_lon[n_width];
float* d_new_lon;
float h_new_lat[n_height], out_new_lat[n_height];
float* d_new_lat;
float h_new_lvl[n_depth], out_new_lvl[n_depth];
float* d_new_lvl;

float del_lon = 1.25, del_lat = 1.25 ;
int size_lon = n_width * sizeof(float);
int size_lat = n_height * sizeof(float);
int size_lvl = n_depth * sizeof(float);

//float *h_data = (float*)malloc(SIZE * sizeof(float));

for (int i =0; i < n_width; i++ )
	{
		h_new_lon[i] = 0 + (i)*del_lon;
	}
for (int i = 0 ; i < n_height ; i++ )
{
	h_new_lat[i] = 90 - (i * del_lat) ;
}
for (int i = 0 ; i < n_depth ; i++)
{
	h_new_lvl[i] = rand() % 10 + i ;
}

//initialize the host array before usage
 for ( int i =0 ; i<width ; i++ )
                  for ( int j = 0 ; j<height; j++ )
					  for ( int k =0; k<depth; k++ )
						  {	h_data [i][j][k] = (i+j+k) * 100;
					//		printf ("%f ", h_data[i][j][k]);
					//		printf("\n");
						  }	

						  
//allocate host and device memory						
cudaMalloc((void**)&d_new_lon, size_lon);
cudaMalloc((void**)&d_new_lat, size_lat);
cudaMalloc((void**)&d_new_lvl, size_lvl);
cudaMalloc((void**)&d_data,sizeof(float)*n_width*n_height*n_depth);

cudaCheckErrors("cudaMalloc is failed");

cudaMemcpy( d_new_lon, h_new_lon, size_lon, cudaMemcpyHostToDevice );
cudaMemcpy( d_new_lat, h_new_lat, size_lat, cudaMemcpyHostToDevice );
cudaMemcpy( d_new_lvl, h_new_lvl, size_lvl, cudaMemcpyHostToDevice );

cudaCheckErrors("allocating an array is failed");

cudaExtent volumesize;
// set cuda array volume size
volumesize=make_cudaExtent(width,height,depth);						

// Allocate CUDA array in device memory
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindFloat);

cudaArray* cuArray;
// create channel to describe data type
cudaChannelFormatDesc channelDesc;
channelDesc =cudaCreateChannelDesc<float>();

// allocate device memory for cuda array
cudaMalloc3DArray(&cuArray,&channelDesc,volumesize);
cudaCheckErrors("cudamalloc3dArray is failed");

cudaMemcpy3DParms copyparms={0};
// set cuda array copy parameters
copyparms.extent    =volumesize;
copyparms.dstArray  =cuArray;
copyparms.kind      =cudaMemcpyHostToDevice;
cudaCheckErrors("copyparms is not kind");
copyparms.srcPtr    = make_cudaPitchedPtr((void*)h_data,sizeof(float)*width,height,depth);
	
cudaCheckErrors("copyparms is undone");
cudaMemcpy3D(&copyparms);
cudaCheckErrors("cudaMemcpy3D is failed");
// Copy to device memory some data located at address h_data
// in host memory

// Specify texture
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
// Specify texture object parameters
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 1;

// Create texture object
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
// Allocate result of transformation in device memory
float* output; 
cudaMalloc(&output, n_width * n_height * n_depth * sizeof(float));
// Invoke kernel
dim3 blocknum;
dim3 blocksize;
blocksize.x = 16;
blocksize.y = 16;
blocksize.z = 4;

blocknum.x = (int)ceil(((float)n_width+blocksize.x)/blocksize.x);
blocknum.y = (int)ceil(((float)n_height+blocksize.y)/blocksize.y);
blocknum.z = (int)ceil(((float)n_depth+blocksize.z)/blocksize.y);

transformKernel<<<blocknum, blocksize>>>(output, d_new_lon, d_new_lat, d_new_lvl, texObj );
cudaMemcpy(d_data,output,N_SIZE * sizeof(float),cudaMemcpyDeviceToHost);

cudaMemcpy(out_new_lon, d_new_lon, size_lon, cudaMemcpyDeviceToHost);
cudaMemcpy(out_new_lat, d_new_lat, size_lat, cudaMemcpyDeviceToHost);
cudaMemcpy(out_new_lvl, d_new_lvl, size_lvl, cudaMemcpyDeviceToHost);

cudaCheckErrors("cudaMemcpyDtoH is failed");

// Destroy texture object
cudaDestroyTextureObject(texObj);
// Free device memory
cudaFreeArray(cuArray);
cudaFree(output);

printf("h_data 1\n");

for (int i =0; i < n_width; i++ )
	{
		printf("%f\n", out_new_lon[i] );
	}

}