Hello,
For my own future reference, I have put together some simple examples on how to use texture memory in CUDA. I’d like to have feedback from expert users to make sure that I am doing things the right way.
The file below contains multiple examples (covering 1D, 2D, 3D, and cubemap texture binding and fetching as well as their layered variants, where applicable). Please remove the /* */ comment delimiters to access individual examples.
My examples are minimal and not intended to show anything besides how to use textures (for examples, each block contains just one thread… not very efficient). I chose rather odd numbers for the dimensions of my textures (indeed, they are all prime numbers) to avoid lucky situations.
Thanks,
Luca
#include <cuda.h>
#include <stdio.h>
#define DIM_X 23
#define DIM_Y 11
#define DIM_Z 7
#define DIM_FACE 29
#define NUM_LAYERS 5
#define MAP_2D(__dimx, __dimy, __x, __y) ((__y) * (__dimx) + (__x))
#define UNMAP_2D_X(__dimx, __dimy, __index) ((__index) % (__dimx))
#define UNMAP_2D_Y(__dimx, __dimy, __index) ((__index) / (__dimx))
#define MAP_3D(__dimx, __dimy, __dimz, __x, __y, __z) (((__z) * (__dimy) + (__y)) * (__dimx) + (__x))
#define UNMAP_3D_X(__dimx, __dimy, __dimz, __index) ((__index) % (__dimx))
#define UNMAP_3D_Y(__dimx, __dimy, __dimz, __index) (((__index) / (__dimx)) % (__dimy))
#define UNMAP_3D_Z(__dimx, __dimy, __dimz, __index) ((__index) / ((__dimx) * (__dimy)))
// Macro used to check CUDA function returning value and error
#define MY_SAFE_CALL(__call) do { \
cudaError_t __err = __call; \
if(__err != cudaSuccess) { \
fprintf(stderr, "CUDA driver error 0x%04X: %s\n", (unsigned int) __err, cudaGetErrorString(__err)); \
fprintf(stderr, "File: \"%s\", function: \"%s\", line: %d\n", __FILE__, __FUNCTION__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while(0)
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/*
// *** 1D Texture Example ***
texture<float4, cudaTextureType1D, cudaReadModeElementType> data_tex;
void __global__ kernel(float *test_dev) {
const int n = MAP_2D(gridDim.x, gridDim.y, blockIdx.x, blockIdx.y);
if(n < DIM_X) {
test_dev[n] = tex1D(data_tex, n + 0.50f).x;
}
return;
}
int main(int argc, char **argv) {
struct cudaChannelFormatDesc desc;
struct cudaArray *data_dev;
float4 data[DIM_X];
float test[DIM_X];
float *test_dev;
float a, b;
int i, size;
for(i = 0; i < DIM_X; ++i) {
data[i] = make_float4(i + 1.00, i + 2.00, i + 3.00, i + 4.00);
}
MY_SAFE_CALL(cudaSetDevice(0));
desc = cudaCreateChannelDesc<float4>();
MY_SAFE_CALL(cudaMallocArray(& data_dev, & desc, DIM_X));
MY_SAFE_CALL(cudaMemcpyToArray(data_dev, 0, 0, data, sizeof(data), cudaMemcpyHostToDevice));
data_tex.normalized = 0;
data_tex.filterMode = cudaFilterModeLinear;
data_tex.addressMode[0] = cudaAddressModeBorder;
MY_SAFE_CALL(cudaBindTextureToArray(data_tex, data_dev, desc));
MY_SAFE_CALL(cudaMalloc((void **) (& test_dev), DIM_X * sizeof(*test_dev)));
size = (int) ceil(sqrtf(DIM_X));
kernel<<<dim3(size, size), 1>>>(test_dev);
MY_SAFE_CALL(cudaUnbindTexture(data_tex));
MY_SAFE_CALL(cudaFreeArray(data_dev));
MY_SAFE_CALL(cudaMemcpy(test, test_dev, DIM_X * sizeof(*test), cudaMemcpyDeviceToHost));
MY_SAFE_CALL(cudaFree(test_dev));
for(i = 0; i < DIM_X; ++i) {
a = test[i];
b = data[i].x;
printf("%d:\t%f\t%f\t%f\n", i, a, b, fabsf(a - b));
}
return(0);
}
*/
/*
// *** 1D Layered Texture Example ***
texture<float4, cudaTextureType1DLayered, cudaReadModeElementType> data_tex;
void __global__ kernel(float *test_dev) {
const int n = MAP_2D(gridDim.x, gridDim.y, blockIdx.x, blockIdx.y);
if(n < (DIM_X * NUM_LAYERS)) {
const int n_x = UNMAP_2D_X(DIM_X, NUM_LAYERS, n);
const int layer = UNMAP_2D_Y(DIM_X, NUM_LAYERS, n);
test_dev[n] = tex1DLayered(data_tex, n_x + 0.50f, layer).x;
}
return;
}
int main(int argc, char **argv) {
static float4 data[DIM_X * NUM_LAYERS];
static float test[DIM_X * NUM_LAYERS];
struct cudaMemcpy3DParms copy_par = {0};
struct cudaChannelFormatDesc desc;
struct cudaArray *data_dev;
float *test_dev;
int i, layer, n;
float a, b;
int size;
for(i = 0; i < DIM_X; ++i) {
for(layer = 0; layer < NUM_LAYERS; ++layer) {
n = MAP_2D(DIM_X, NUM_LAYERS, i, layer);
data[n] = make_float4(i + 1.00, layer + 1.00, i + layer + 1.00, n);
}
}
MY_SAFE_CALL(cudaSetDevice(0));
desc = cudaCreateChannelDesc<float4>();
MY_SAFE_CALL(cudaMalloc3DArray(& data_dev, & desc, make_cudaExtent(DIM_X, 0, NUM_LAYERS), cudaArrayLayered));
copy_par.extent = make_cudaExtent(DIM_X, 1, NUM_LAYERS);
copy_par.kind = cudaMemcpyHostToDevice;
copy_par.dstArray = data_dev;
copy_par.srcPtr = make_cudaPitchedPtr(data, DIM_X * sizeof(*data), DIM_X, 1);
MY_SAFE_CALL(cudaMemcpy3D(& copy_par));
data_tex.normalized = 0;
data_tex.filterMode = cudaFilterModeLinear;
data_tex.addressMode[0] = cudaAddressModeBorder;
MY_SAFE_CALL(cudaBindTextureToArray(data_tex, data_dev, desc));
MY_SAFE_CALL(cudaMalloc((void **) (& test_dev), DIM_X * NUM_LAYERS * sizeof(*test_dev)));
size = (int) ceil(sqrtf(DIM_X * NUM_LAYERS));
kernel<<<dim3(size, size), 1>>>(test_dev);
MY_SAFE_CALL(cudaUnbindTexture(data_tex));
MY_SAFE_CALL(cudaFreeArray(data_dev));
MY_SAFE_CALL(cudaMemcpy(test, test_dev, DIM_X * NUM_LAYERS * sizeof(*test), cudaMemcpyDeviceToHost));
MY_SAFE_CALL(cudaFree(test_dev));
for(layer = 0; layer < NUM_LAYERS; ++layer) {
for(i = 0; i < DIM_X; ++i) {
a = test[MAP_2D(DIM_X, NUM_LAYERS, i, layer)];
b = data[MAP_2D(DIM_X, NUM_LAYERS, i, layer)].x;
printf("[%d, %d]\t%10.5f\t%10.5f\t\t%e\n", i, layer, a, b, fabsf(a - b));
}
}
return(0);
}
*/
/*
// *** 2D Texture Example ***
texture<float4, cudaTextureType2D, cudaReadModeElementType> data_tex;
void __global__ kernel(float *test_dev) {
const int n = MAP_2D(gridDim.x, gridDim.y, blockIdx.x, blockIdx.y);
if(n < (DIM_X * DIM_Y)) {
const int n_x = UNMAP_2D_X(DIM_X, DIM_Y, n);
const int n_y = UNMAP_2D_Y(DIM_X, DIM_Y, n);
test_dev[n] = tex2D(data_tex, n_x + 0.50f, n_y + 0.50f).x;
}
return;
}
int main(int argc, char **argv) {
struct cudaChannelFormatDesc desc;
float4 data[DIM_X * DIM_Y];
float test[DIM_X * DIM_Y];
float4 *data_dev;
float *test_dev;
float a, b;
int i, j, n;
size_t pitch;
int size;
for(i = 0; i < DIM_X; ++i) {
for(j = 0; j < DIM_Y; ++j) {
n = MAP_2D(DIM_X, DIM_Y, i, j);
data[n] = make_float4(i + 1.00, j + 1.00, i + j + 1.00, n);
}
}
MY_SAFE_CALL(cudaSetDevice(0));
MY_SAFE_CALL(cudaMallocPitch((void **) (& data_dev), & pitch, DIM_X * sizeof(*data_dev), DIM_Y));
MY_SAFE_CALL(cudaMemcpy2D(data_dev, pitch, data, DIM_X * sizeof(*data), DIM_X * sizeof(*data), DIM_Y, cudaMemcpyHostToDevice));
desc = cudaCreateChannelDesc<float4>();
data_tex.normalized = 0;
data_tex.filterMode = cudaFilterModeLinear;
data_tex.addressMode[0] = cudaAddressModeBorder;
data_tex.addressMode[1] = cudaAddressModeBorder;
MY_SAFE_CALL(cudaBindTexture2D(NULL, data_tex, data_dev, desc, DIM_X, DIM_Y, pitch));
MY_SAFE_CALL(cudaMalloc((void **) (& test_dev), DIM_X * DIM_Y * sizeof(*test_dev)));
size = (int) ceil(sqrtf(DIM_X * DIM_Y));
kernel<<<dim3(size, size), 1>>>(test_dev);
MY_SAFE_CALL(cudaUnbindTexture(data_tex));
MY_SAFE_CALL(cudaFree(data_dev));
MY_SAFE_CALL(cudaMemcpy(test, test_dev, DIM_X * DIM_Y * sizeof(*test), cudaMemcpyDeviceToHost));
MY_SAFE_CALL(cudaFree(test_dev));
for(j = 0; j < DIM_Y; ++j) {
for(i = 0; i < DIM_X; ++i) {
a = test[MAP_2D(DIM_X, DIM_Y, i, j)];
b = data[MAP_2D(DIM_X, DIM_Y, i, j)].x;
printf("[%d, %d]\t%10.5f\t%10.5f\t\t%e\n", i, j, a, b, fabsf(a - b));
}
}
return(0);
}
*/
/*
// *** 2D Layered Texture Example ***
texture<float4, cudaTextureType2DLayered, cudaReadModeElementType> data_tex;
void __global__ kernel(float *test_dev) {
const int n = MAP_2D(gridDim.x, gridDim.y, blockIdx.x, blockIdx.y);
if(n < DIM_X * DIM_Y * NUM_LAYERS) {
const int n_x = UNMAP_3D_X(DIM_X, DIM_Y, NUM_LAYERS, n);
const int n_y = UNMAP_3D_Y(DIM_X, DIM_Y, NUM_LAYERS, n);
const int layer = UNMAP_3D_Z(DIM_X, DIM_Y, NUM_LAYERS, n);
test_dev[n] = tex2DLayered(data_tex, n_x + 0.50f, n_y + 0.50f, layer).x;
}
return;
}
int main(int argc, char **argv) {
static float4 data[DIM_X * DIM_Y * NUM_LAYERS];
static float test[DIM_X * DIM_Y * NUM_LAYERS];
struct cudaMemcpy3DParms copy_par = {0};
struct cudaChannelFormatDesc desc;
struct cudaArray *data_dev;
struct cudaExtent extent;
float *test_dev;
int i, j, layer, n;
float a, b;
int size;
for(i = 0; i < DIM_X; ++i) {
for(j = 0; j < DIM_Y; ++j) {
for(layer = 0; layer < NUM_LAYERS; ++layer) {
n = MAP_3D(DIM_X, DIM_Y, NUM_LAYERS, i, j, layer);
data[n] = make_float4(i + 1.00, j + 1.00, layer + 1.00, n);
}
}
}
MY_SAFE_CALL(cudaSetDevice(0));
desc = cudaCreateChannelDesc<float4>();
extent = make_cudaExtent(DIM_X, DIM_Y, NUM_LAYERS);
MY_SAFE_CALL(cudaMalloc3DArray(& data_dev, & desc, extent, cudaArrayLayered));
copy_par.extent = extent;
copy_par.kind = cudaMemcpyHostToDevice;
copy_par.dstArray = data_dev;
copy_par.srcPtr = make_cudaPitchedPtr(data, DIM_X * sizeof(*data), DIM_X, DIM_Y);
MY_SAFE_CALL(cudaMemcpy3D(& copy_par));
data_tex.normalized = 0;
data_tex.filterMode = cudaFilterModeLinear;
data_tex.addressMode[0] = cudaAddressModeBorder;
data_tex.addressMode[1] = cudaAddressModeBorder;
MY_SAFE_CALL(cudaBindTextureToArray(data_tex, data_dev, desc));
MY_SAFE_CALL(cudaMalloc((void **) (& test_dev), DIM_X * DIM_Y * NUM_LAYERS * sizeof(*test_dev)));
size = (int) ceil(sqrtf(DIM_X * DIM_Y * NUM_LAYERS));
kernel<<<dim3(size, size), 1>>>(test_dev);
MY_SAFE_CALL(cudaUnbindTexture(data_tex));
MY_SAFE_CALL(cudaFreeArray(data_dev));
MY_SAFE_CALL(cudaMemcpy(test, test_dev, DIM_X * DIM_Y * NUM_LAYERS * sizeof(*test), cudaMemcpyDeviceToHost));
MY_SAFE_CALL(cudaFree(test_dev));
for(layer = 0; layer < NUM_LAYERS; ++layer) {
for(i = 0; i < DIM_X; ++i) {
for(j = 0; j < DIM_Y; ++j) {
a = test[MAP_3D(DIM_X, DIM_Y, NUM_LAYERS, i, j, layer)];
b = data[MAP_3D(DIM_X, DIM_Y, NUM_LAYERS, i, j, layer)].x;
printf("[%d, %d, %d]\t%10.5f\t%10.5f\t\t%e\n", i, j, layer, a, b, fabsf(a - b));
}
}
}
return(0);
}
*/
/*
// *** 3D Texture Example ***
texture<float4, cudaTextureType3D, cudaReadModeElementType> data_tex;
void __global__ kernel(float *test_dev) {
const int n = MAP_2D(gridDim.x, gridDim.y, blockIdx.x, blockIdx.y);
if(n < DIM_X * DIM_Y * DIM_Z) {
const int n_x = UNMAP_3D_X(DIM_X, DIM_Y, DIM_Z, n);
const int n_y = UNMAP_3D_Y(DIM_X, DIM_Y, DIM_Z, n);
const int n_z = UNMAP_3D_Z(DIM_X, DIM_Y, DIM_Z, n);
test_dev[n] = tex3D(data_tex, n_x + 0.50f, n_y + 0.50f, n_z + 0.50f).x;
}
return;
}
int main(int argc, char **argv) {
static float4 data[DIM_X * DIM_Y * DIM_Z];
static float test[DIM_X * DIM_Y * DIM_Z];
struct cudaMemcpy3DParms copy_par = {0};
struct cudaChannelFormatDesc desc;
struct cudaArray *data_dev;
struct cudaExtent extent;
float *test_dev;
int i, j, k, n;
float a, b;
int size;
for(i = 0; i < DIM_X; ++i) {
for(j = 0; j < DIM_Y; ++j) {
for(k = 0; k < DIM_Z; ++k) {
n = MAP_3D(DIM_X, DIM_Y, DIM_Z, i, j, k);
data[n] = make_float4(i + 1.00, j + 1.00, k + 1.00, n);
}
}
}
MY_SAFE_CALL(cudaSetDevice(0));
desc = cudaCreateChannelDesc<float4>();
extent = make_cudaExtent(DIM_X, DIM_Y, DIM_Z);
MY_SAFE_CALL(cudaMalloc3DArray(& data_dev, & desc, extent));
copy_par.extent = extent;
copy_par.kind = cudaMemcpyHostToDevice;
copy_par.dstArray = data_dev;
copy_par.srcPtr = make_cudaPitchedPtr(data, DIM_X * sizeof(*data), DIM_X, DIM_Y);
MY_SAFE_CALL(cudaMemcpy3D(& copy_par));
data_tex.normalized = 0;
data_tex.filterMode = cudaFilterModeLinear;
data_tex.addressMode[0] = cudaAddressModeBorder;
data_tex.addressMode[1] = cudaAddressModeBorder;
MY_SAFE_CALL(cudaBindTextureToArray(data_tex, data_dev, desc));
MY_SAFE_CALL(cudaMalloc((void **) (& test_dev), DIM_X * DIM_Y * DIM_Z * sizeof(*test_dev)));
size = (int) ceil(sqrtf(DIM_X * DIM_Y * DIM_Z));
kernel<<<dim3(size, size), 1>>>(test_dev);
MY_SAFE_CALL(cudaUnbindTexture(data_tex));
MY_SAFE_CALL(cudaFreeArray(data_dev));
MY_SAFE_CALL(cudaMemcpy(test, test_dev, DIM_X * DIM_Y * DIM_Z * sizeof(*test), cudaMemcpyDeviceToHost));
MY_SAFE_CALL(cudaFree(test_dev));
for(i = 0; i < DIM_X; ++i) {
for(j = 0; j < DIM_Y; ++j) {
for(k = 0; k < DIM_Z; ++k) {
a = test[MAP_3D(DIM_X, DIM_Y, DIM_Z, i, j, k)];
b = data[MAP_3D(DIM_X, DIM_Y, DIM_Z, i, j, k)].x;
printf("[%d, %d, %d]\t%10.5f\t%10.5f\t\t%e\n", i, j, k, a, b, fabsf(a - b));
}
}
}
return(0);
}
*/
/*
// *** Cubemap Texture Example ***
texture<float4, cudaTextureTypeCubemap, cudaReadModeElementType> data_tex;
void __global__ kernel(float *test_dev) {
const int n = MAP_2D(gridDim.x, gridDim.y, blockIdx.x, blockIdx.y);
if(n < DIM_X * DIM_Y * DIM_Z) {
const float x = (UNMAP_3D_X(DIM_X, DIM_Y, DIM_Z, n) + 0.50f) / ((float) DIM_X) * 2.00f - 1.00f;
const float y = (UNMAP_3D_Y(DIM_X, DIM_Y, DIM_Z, n) + 0.50f) / ((float) DIM_Y) * 2.00f - 1.00f;
const float z = (UNMAP_3D_Z(DIM_X, DIM_Y, DIM_Z, n) + 0.50f) / ((float) DIM_Z) * 2.00f - 1.00f;
test_dev[n] = texCubemap(data_tex, x, y, z).z;
}
return;
}
int main(int argc, char **argv) {
static float4 data[DIM_FACE * DIM_FACE * 6];
static float test[DIM_X * DIM_Y * DIM_Z];
struct cudaMemcpy3DParms copy_par = {0};
struct cudaChannelFormatDesc desc;
struct cudaArray *data_dev;
struct cudaExtent extent;
float *test_dev;
int i, j, k, face, n;
int size;
for(i = 0; i < DIM_FACE; ++i) {
for(j = 0; j < DIM_FACE; ++j) {
for(face = 0; face < 6; ++face) {
n = MAP_3D(DIM_FACE, DIM_FACE, 6, i, j, face);
data[n] = make_float4(i + 1.00, j + 1.00, face, n);
}
}
}
MY_SAFE_CALL(cudaSetDevice(0));
desc = cudaCreateChannelDesc<float4>();
extent = make_cudaExtent(DIM_FACE, DIM_FACE, 6);
MY_SAFE_CALL(cudaMalloc3DArray(& data_dev, & desc, extent, cudaArrayCubemap));
copy_par.extent = extent;
copy_par.kind = cudaMemcpyHostToDevice;
copy_par.dstArray = data_dev;
copy_par.srcPtr = make_cudaPitchedPtr(data, DIM_FACE * sizeof(*data), DIM_FACE, DIM_FACE);
MY_SAFE_CALL(cudaMemcpy3D(& copy_par));
data_tex.normalized = 0;
data_tex.filterMode = cudaFilterModeLinear;
data_tex.addressMode[0] = cudaAddressModeBorder;
data_tex.addressMode[1] = cudaAddressModeBorder;
MY_SAFE_CALL(cudaBindTextureToArray(data_tex, data_dev, desc));
MY_SAFE_CALL(cudaMalloc((void **) (& test_dev), DIM_X * DIM_Y * DIM_Z * sizeof(*test_dev)));
size = (int) ceil(sqrtf(DIM_X * DIM_Y * DIM_Z));
kernel<<<dim3(size, size), 1>>>(test_dev);
MY_SAFE_CALL(cudaUnbindTexture(data_tex));
MY_SAFE_CALL(cudaFreeArray(data_dev));
MY_SAFE_CALL(cudaMemcpy(test, test_dev, DIM_X * DIM_Y * DIM_Z * sizeof(*test), cudaMemcpyDeviceToHost));
MY_SAFE_CALL(cudaFree(test_dev));
for(i = 0; i < DIM_X; ++i) {
for(j = 0; j < DIM_Y; ++j) {
for(k = 0; k < DIM_Z; ++k) {
n = MAP_3D(DIM_X, DIM_Y, DIM_Z, i, j, k);
printf("[%d, %d, %d]\t%10.5f\n", i, j, k, test[n]);
}
}
}
return(0);
}
*/
/*
// *** Cubemap Layered Texture Example ***
texture<float4, cudaTextureTypeCubemapLayered, cudaReadModeElementType> data_tex;
void __global__ kernel(float *test_dev) {
const int n = blockIdx.x;
const int layer = blockIdx.y;
const float x = (UNMAP_3D_X(DIM_X, DIM_Y, DIM_Z, n) + 0.50f) / ((float) DIM_X) * 2.00f - 1.00f;
const float y = (UNMAP_3D_Y(DIM_X, DIM_Y, DIM_Z, n) + 0.50f) / ((float) DIM_Y) * 2.00f - 1.00f;
const float z = (UNMAP_3D_Z(DIM_X, DIM_Y, DIM_Z, n) + 0.50f) / ((float) DIM_Z) * 2.00f - 1.00f;
test_dev[MAP_2D(DIM_X * DIM_Y * DIM_Z, NUM_LAYERS, n, layer)] = texCubemapLayered(data_tex, x, y, z, layer).z;
return;
}
int main(int argc, char **argv) {
static float4 data[(DIM_FACE * DIM_FACE) * (6 * NUM_LAYERS)];
static float test[(DIM_X * DIM_Y * DIM_Z) * NUM_LAYERS];
struct cudaMemcpy3DParms copy_par = {0};
struct cudaChannelFormatDesc desc;
struct cudaArray *data_dev;
struct cudaExtent extent;
float *test_dev;
int i, j, k, m, n;
int face, layer;
for(layer = 0; layer < NUM_LAYERS; ++layer) {
for(face = 0; face < 6; ++face) {
m = MAP_2D(6, NUM_LAYERS, face, layer);
for(i = 0; i < DIM_FACE; ++i) {
for(j = 0; j < DIM_FACE; ++j) {
n = MAP_2D(DIM_FACE, DIM_FACE, i, j);
data[MAP_2D(DIM_FACE * DIM_FACE, 6 * NUM_LAYERS, n, m)] = make_float4(i + 1.00, j + 1.00, face + 1.00, layer + 1.00);
}
}
}
}
MY_SAFE_CALL(cudaSetDevice(0));
desc = cudaCreateChannelDesc<float4>();
extent = make_cudaExtent(DIM_FACE, DIM_FACE, 6 * NUM_LAYERS);
MY_SAFE_CALL(cudaMalloc3DArray(& data_dev, & desc, extent, cudaArrayCubemap | cudaArrayLayered));
copy_par.extent = extent;
copy_par.kind = cudaMemcpyHostToDevice;
copy_par.dstArray = data_dev;
copy_par.srcPtr = make_cudaPitchedPtr(data, DIM_FACE * sizeof(*data), DIM_FACE, DIM_FACE);
MY_SAFE_CALL(cudaMemcpy3D(& copy_par));
data_tex.normalized = 0;
data_tex.filterMode = cudaFilterModeLinear;
data_tex.addressMode[0] = cudaAddressModeBorder;
data_tex.addressMode[1] = cudaAddressModeBorder;
MY_SAFE_CALL(cudaBindTextureToArray(data_tex, data_dev, desc));
MY_SAFE_CALL(cudaMalloc((void **) (& test_dev), (DIM_X * DIM_Y * DIM_Z) * NUM_LAYERS * sizeof(*test_dev)));
kernel<<<dim3(DIM_X * DIM_Y * DIM_Z, NUM_LAYERS), 1>>>(test_dev);
MY_SAFE_CALL(cudaUnbindTexture(data_tex));
MY_SAFE_CALL(cudaFreeArray(data_dev));
MY_SAFE_CALL(cudaMemcpy(test, test_dev, (DIM_X * DIM_Y * DIM_Z) * NUM_LAYERS * sizeof(*test), cudaMemcpyDeviceToHost));
MY_SAFE_CALL(cudaFree(test_dev));
for(layer = 0; layer < NUM_LAYERS; ++layer) {
for(i = 0; i < DIM_X; ++i) {
for(j = 0; j < DIM_Y; ++j) {
for(k = 0; k < DIM_Z; ++k) {
n = MAP_3D(DIM_X, DIM_Y, DIM_Z, i, j, k);
printf("[%d, %d, %d, %d]\t%10.5f\n", layer, i, j, k, test[MAP_2D(DIM_X * DIM_Y * DIM_Z, NUM_LAYERS, n, layer)]);
}
}
}
}
return(0);
}
*/