I am having a 3d data with the size of (5,4,2) which I want to interpolate to make it (10,8,4) size. I am using
tex3D
to interpolate it and further extracting the values into an array.
But I am facing some issue
- able to transfer 1-dimensional data from host to device, but unable to do with 3d data while using cudaMemcpy3D, cudaMalloc3DArray.
- If I printing the values of the output of tex3D within __global__ kernel, printing only 00 (ZERO)
- From Cuda-Memcheck and Cuda-gdb, No Error is throwing.
- How to debug it?
- If I am redefining the values as such, width = 75, height = 145, depth = 18, n_width = 87, n_height = 288, n_depth = 150, then I am getting error. How could I get the access such large datasets? I know, I have to access it on multi-GPU, but I am not getting it. How could I divide the task for 3D - texture Memory?
My code is
#include<stdio.h>
#include<cuda.h>
#define width 5
#define height 4
#define depth 2
#define n_width 10
#define n_height 8
#define n_depth 4
#define SIZE (width * height * depth)
#define N_SIZE (n_depth * n_height * n_width)
#define theta 30
// for cuda error checking
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
return 1; \
} \
} while (0)
// Simple transformation kernel
__global__ void transformKernel(float* output,float* d_new_lon,float* d_new_lat, float* d_new_lvl,
cudaTextureObject_t texObj)
{
// Calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int z = blockIdx.z * blockDim.z + threadIdx.z;
//printf("kernel value is x= %d\n", x);
//printf("kernel value is y= %d\n", y);
if ((x < n_width) && (y < n_height) && (z < n_depth))
{
float u = d_new_lon[x];
float v = d_new_lat[y];
float w = d_new_lvl[z];
// Transform coordinates
float tu = u ; // longitude is increasing //
float tv = v ; // latitude is decreasing //
float tw = w ; // level is decreasing //
// Read from texture and write to global memory
output[z * n_width * n_height + y * n_width + x] = tex3D<float>(texObj, tu, tv, tw);
printf("texObj=%f,tu=%f,tv=%f,tw=%f\n",texObj,tu,tv,tw);
printf("output = %f, width =%d, height = %d, depth = %d\n",output[z * n_width * n_height + y * n_width + x], x, y, z);
printf("d_new_lon = %f\n", d_new_lon[x] );
}
}
// Host code
int main()
{
float h_data[width][height][depth];
float d_data[n_width][n_height][n_depth];
float h_new_lon[n_width], out_new_lon[n_width];
float* d_new_lon;
float h_new_lat[n_height], out_new_lat[n_height];
float* d_new_lat;
float h_new_lvl[n_depth], out_new_lvl[n_depth];
float* d_new_lvl;
float del_lon = 1.25, del_lat = 1.25 ;
int size_lon = n_width * sizeof(float);
int size_lat = n_height * sizeof(float);
int size_lvl = n_depth * sizeof(float);
//float *h_data = (float*)malloc(SIZE * sizeof(float));
for (int i =0; i < n_width; i++ )
{
h_new_lon[i] = 0 + (i)*del_lon;
}
for (int i = 0 ; i < n_height ; i++ )
{
h_new_lat[i] = 90 - (i * del_lat) ;
}
for (int i = 0 ; i < n_depth ; i++)
{
h_new_lvl[i] = rand() % 10 + i ;
}
//initialize the host array before usage
for ( int i =0 ; i<width ; i++ )
for ( int j = 0 ; j<height; j++ )
for ( int k =0; k<depth; k++ )
{ h_data [i][j][k] = (i+j+k) * 100;
// printf ("%f ", h_data[i][j][k]);
// printf("\n");
}
//allocate host and device memory
cudaMalloc((void**)&d_new_lon, size_lon);
cudaMalloc((void**)&d_new_lat, size_lat);
cudaMalloc((void**)&d_new_lvl, size_lvl);
cudaMalloc((void**)&d_data,sizeof(float)*n_width*n_height*n_depth);
cudaCheckErrors("cudaMalloc is failed");
cudaMemcpy( d_new_lon, h_new_lon, size_lon, cudaMemcpyHostToDevice );
cudaMemcpy( d_new_lat, h_new_lat, size_lat, cudaMemcpyHostToDevice );
cudaMemcpy( d_new_lvl, h_new_lvl, size_lvl, cudaMemcpyHostToDevice );
cudaCheckErrors("allocating an array is failed");
cudaExtent volumesize;
// set cuda array volume size
volumesize=make_cudaExtent(width,height,depth);
// Allocate CUDA array in device memory
//cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindFloat);
cudaArray* cuArray;
// create channel to describe data type
cudaChannelFormatDesc channelDesc;
channelDesc =cudaCreateChannelDesc<float>();
// allocate device memory for cuda array
cudaMalloc3DArray(&cuArray,&channelDesc,volumesize);
cudaCheckErrors("cudamalloc3dArray is failed");
cudaMemcpy3DParms copyparms={0};
// set cuda array copy parameters
copyparms.extent =volumesize;
copyparms.dstArray =cuArray;
copyparms.kind =cudaMemcpyHostToDevice;
cudaCheckErrors("copyparms is not kind");
copyparms.srcPtr = make_cudaPitchedPtr((void*)h_data,sizeof(float)*width,height,depth);
cudaCheckErrors("copyparms is undone");
cudaMemcpy3D(©parms);
cudaCheckErrors("cudaMemcpy3D is failed");
// Copy to device memory some data located at address h_data
// in host memory
// Specify texture
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
// Specify texture object parameters
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 1;
// Create texture object
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
// Allocate result of transformation in device memory
float* output;
cudaMalloc(&output, n_width * n_height * n_depth * sizeof(float));
// Invoke kernel
dim3 blocknum;
dim3 blocksize;
blocksize.x = 16;
blocksize.y = 16;
blocksize.z = 4;
blocknum.x = (int)ceil(((float)n_width+blocksize.x)/blocksize.x);
blocknum.y = (int)ceil(((float)n_height+blocksize.y)/blocksize.y);
blocknum.z = (int)ceil(((float)n_depth+blocksize.z)/blocksize.y);
transformKernel<<<blocknum, blocksize>>>(output, d_new_lon, d_new_lat, d_new_lvl, texObj );
cudaMemcpy(d_data,output,N_SIZE * sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(out_new_lon, d_new_lon, size_lon, cudaMemcpyDeviceToHost);
cudaMemcpy(out_new_lat, d_new_lat, size_lat, cudaMemcpyDeviceToHost);
cudaMemcpy(out_new_lvl, d_new_lvl, size_lvl, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpyDtoH is failed");
// Destroy texture object
cudaDestroyTextureObject(texObj);
// Free device memory
cudaFreeArray(cuArray);
cudaFree(output);
printf("h_data 1\n");
for (int i =0; i < n_width; i++ )
{
printf("%f\n", out_new_lon[i] );
}
}