Matlab crashes when I use 3d texture in CUDA code

I have a stack of 2d images. I want to use a 3D texture to store them and read them one by one in a kernel. But when I run the test .m file . The Matlab crashes. I do not know whether I missed something about 3D texture.

The CUDA code is just like this:

#include <stdlib.h>

#include <stdio.h>

#include <math.h>

#include "mex.h"

#include "matrix.h"

#include <cutil.h>

typedef unsigned int  uint;

texture<float, 3, cudaReadModeElementType> tex;  // 3D texture

cudaArray *d_volumeArray = 0;

/***************************************************************************

 * Read**Kernel : read one layer in 3D texture

 **************************************************************************/

__global__ void

ReadKernel(float *d_output, uint imageW, uint imageH, float w)

{

    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;

    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

    // read from 3D texture

    float voxel = tex3D(tex, x, y, w);

    if ((x < imageW) && (y < imageH)) {

        uint i = __umul24(y, imageW) + x;

        d_output[i] = voxel;

    }

}

void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) 

{       

float* images = (float *)mxGetPr(prhs[0]);

    int n_i = mxGetScalar(prhs[1]);

    int width = mxGetScalar(prhs[2]);

    int height = mxGetScalar(prhs[3]);

const cudaExtent volumeSize = make_cudaExtent( width, height, n_i);       // size for  images stack 

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    CUDA_SAFE_CALL( cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize) );

// copy data to 3D array

    cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)images, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);

    copyParams.dstArray = d_volumeArray;

    copyParams.extent   = volumeSize;

    copyParams.kind     = cudaMemcpyHostToDevice;

    CUDA_SAFE_CALL( cudaMemcpy3D(&copyParams) );

// set texture parameters

    tex.normalized = false;        

    tex.filterMode = cudaFilterModePoint;  //use cudaFilterModePoint filter mode to avoid fetching from different textures layers.

    tex.addressMode[0] = cudaAddressModeClamp;   

    tex.addressMode[1] = cudaAddressModeClamp;

    tex.addressMode[2] = cudaAddressModeClamp;

// bind array to 3D texture

    CUDA_SAFE_CALL(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));

float* d_output = NULL;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_output, (width*height*sizeof(float)))); 

float *h_odata = NULL;

/* get a pointer to  the output  */

    const mwSize dims[]={width,height,n_i};

    plhs[0] = mxCreateNumericArray(3,dims,mxDOUBLE_CLASS,mxREAL);

    h_odata = (float *)mxGetPr(plhs[0]);

dim3 dimBlock1(8, 8, 1);

    dim3 dimGrid1((width+ dimBlock1.x -1) / dimBlock1.x, (height+ dimBlock1.y -1) / dimBlock1.y, 1);

    for (int i = 0; i < n_i; i++)

    { 

        // read in 3D texture one layer to get one image 

        ReadKernel<<< dimGrid1, dimBlock1 >>>( d_output, width, height, i);         

        CUDA_SAFE_CALL( cudaThreadSynchronize());

        CUDA_SAFE_CALL( cudaMemcpy(h_odata+i*width*height*sizeof(float),d_output, width*height*sizeof(float),cudaMemcpyDeviceToHost));  // memory copy

    }

    // clear memory

    CUDA_SAFE_CALL(cudaFree(d_output));

    CUDA_SAFE_CALL(cudaFreeArray(d_volumeArray));

}

 /code]

and the test .m code is just like this

[code]

m=imread('cameraman.tif');

m=imresize(m,[64 64]);

images(:,:,1) = m  ; % the first image

images(:,:,2) = m   ;  % the second  image

CC = test_3dtexture(images,2,64,64);

I checked line by line for many times but still could not find the error. Someone told me to add the unbind texture operation before cudaFree, but still crashes. Could anyone help me? Thank you!!!

I have a stack of 2d images. I want to use a 3D texture to store them and read them one by one in a kernel. But when I run the test .m file . The Matlab crashes. I do not know whether I missed something about 3D texture.

The CUDA code is just like this:

#include <stdlib.h>

#include <stdio.h>

#include <math.h>

#include "mex.h"

#include "matrix.h"

#include <cutil.h>

typedef unsigned int  uint;

texture<float, 3, cudaReadModeElementType> tex;  // 3D texture

cudaArray *d_volumeArray = 0;

/***************************************************************************

 * Read**Kernel : read one layer in 3D texture

 **************************************************************************/

__global__ void

ReadKernel(float *d_output, uint imageW, uint imageH, float w)

{

    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;

    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

    // read from 3D texture

    float voxel = tex3D(tex, x, y, w);

    if ((x < imageW) && (y < imageH)) {

        uint i = __umul24(y, imageW) + x;

        d_output[i] = voxel;

    }

}

void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) 

{       

float* images = (float *)mxGetPr(prhs[0]);

    int n_i = mxGetScalar(prhs[1]);

    int width = mxGetScalar(prhs[2]);

    int height = mxGetScalar(prhs[3]);

const cudaExtent volumeSize = make_cudaExtent( width, height, n_i);       // size for  images stack 

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    CUDA_SAFE_CALL( cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize) );

// copy data to 3D array

    cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)images, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);

    copyParams.dstArray = d_volumeArray;

    copyParams.extent   = volumeSize;

    copyParams.kind     = cudaMemcpyHostToDevice;

    CUDA_SAFE_CALL( cudaMemcpy3D(&copyParams) );

// set texture parameters

    tex.normalized = false;        

    tex.filterMode = cudaFilterModePoint;  //use cudaFilterModePoint filter mode to avoid fetching from different textures layers.

    tex.addressMode[0] = cudaAddressModeClamp;   

    tex.addressMode[1] = cudaAddressModeClamp;

    tex.addressMode[2] = cudaAddressModeClamp;

// bind array to 3D texture

    CUDA_SAFE_CALL(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));

float* d_output = NULL;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_output, (width*height*sizeof(float)))); 

float *h_odata = NULL;

/* get a pointer to  the output  */

    const mwSize dims[]={width,height,n_i};

    plhs[0] = mxCreateNumericArray(3,dims,mxDOUBLE_CLASS,mxREAL);

    h_odata = (float *)mxGetPr(plhs[0]);

dim3 dimBlock1(8, 8, 1);

    dim3 dimGrid1((width+ dimBlock1.x -1) / dimBlock1.x, (height+ dimBlock1.y -1) / dimBlock1.y, 1);

    for (int i = 0; i < n_i; i++)

    { 

        // read in 3D texture one layer to get one image 

        ReadKernel<<< dimGrid1, dimBlock1 >>>( d_output, width, height, i);         

        CUDA_SAFE_CALL( cudaThreadSynchronize());

        CUDA_SAFE_CALL( cudaMemcpy(h_odata+i*width*height*sizeof(float),d_output, width*height*sizeof(float),cudaMemcpyDeviceToHost));  // memory copy

    }

    // clear memory

    CUDA_SAFE_CALL(cudaFree(d_output));

    CUDA_SAFE_CALL(cudaFreeArray(d_volumeArray));

}

 /code]

and the test .m code is just like this

[code]

m=imread('cameraman.tif');

m=imresize(m,[64 64]);

images(:,:,1) = m  ; % the first image

images(:,:,2) = m   ;  % the second  image

CC = test_3dtexture(images,2,64,64);

I checked line by line for many times but still could not find the error. Someone told me to add the unbind texture operation before cudaFree, but still crashes. Could anyone help me? Thank you!!!

I don’t know if this makes any difference but you seem to call tex3D with two ints and one float

float w from function definition

uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
float voxel = tex3D(tex, x, y, w);

try

tex3D(tex, (float)x, (float)y, w);

I don’t know if this makes any difference but you seem to call tex3D with two ints and one float

float w from function definition

uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
float voxel = tex3D(tex, x, y, w);

try

tex3D(tex, (float)x, (float)y, w);

I know this is a dated post, but I am attempting to do the same thing and I ran into the error:

cudaErrorNoKernelImageForDevice

When calling cudaMemcpy3d. the dimensions of my data were 620x480x133 array of floats. Now the pitch calculated for this is over the 2048 limit as it is (6204 = 2480). If I try 512 for the first dimension (5124 = 2048) every thing works spectacularly. Ok so I figured this out not by looking at the obscure error code cudaErrorNoKernelImageForDevice, which means all kinds of nothing to me, but by looking back at the documentation.

Note the element type is “unsigned char” so for 640 floats of single type: sizeof(float) = 4

Ok so this is quazi documented. But I hate to be the one to ask this question… Why did cudaMalloc3D succeed? Only to fail on the copy which does document the 2048 limit. Seems to me this documentation is in the wrong place. Also what is the point of being able to allocate memory you can never effectively use (i.e. copy to/from host)? Is this some type of lazy programming where the memory is not actually allocated until it is requested by cudaMemcpy3D?

Also where I fail at creating 3D textures of this size I succeed at creating 2D textures where apparently (from looking at the documentation) there are no limits in the y direction (there are in the x due to ptich size), at least documented ones:

which likely is:

Maximum memory pitch: 2147483647 bytes

for my Quatro 1700M found by deviceQuery.exe in gpu SDK or putting this logic in your code.

Anyway hope this helps.