Matlab crashes when I use 3d texture in CUDA code

soyvia · November 11, 2010, 5:00pm

I have a stack of 2d images. I want to use a 3D texture to store them and read them one by one in a kernel. But when I run the test .m file . The Matlab crashes. I do not know whether I missed something about 3D texture.

The CUDA code is just like this:

#include <stdlib.h>

#include <stdio.h>

#include <math.h>

#include "mex.h"

#include "matrix.h"

#include <cutil.h>

typedef unsigned int  uint;

texture<float, 3, cudaReadModeElementType> tex;  // 3D texture

cudaArray *d_volumeArray = 0;

/***************************************************************************

 * Read**Kernel : read one layer in 3D texture

 **************************************************************************/

__global__ void

ReadKernel(float *d_output, uint imageW, uint imageH, float w)

{

    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;

    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

    // read from 3D texture

    float voxel = tex3D(tex, x, y, w);

    if ((x < imageW) && (y < imageH)) {

        uint i = __umul24(y, imageW) + x;

        d_output[i] = voxel;

    }

}

void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) 

{       

float* images = (float *)mxGetPr(prhs[0]);

    int n_i = mxGetScalar(prhs[1]);

    int width = mxGetScalar(prhs[2]);

    int height = mxGetScalar(prhs[3]);

const cudaExtent volumeSize = make_cudaExtent( width, height, n_i);       // size for  images stack 

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    CUDA_SAFE_CALL( cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize) );

// copy data to 3D array

    cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)images, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);

    copyParams.dstArray = d_volumeArray;

    copyParams.extent   = volumeSize;

    copyParams.kind     = cudaMemcpyHostToDevice;

    CUDA_SAFE_CALL( cudaMemcpy3D(&copyParams) );

// set texture parameters

    tex.normalized = false;        

    tex.filterMode = cudaFilterModePoint;  //use cudaFilterModePoint filter mode to avoid fetching from different textures layers.

    tex.addressMode[0] = cudaAddressModeClamp;   

    tex.addressMode[1] = cudaAddressModeClamp;

    tex.addressMode[2] = cudaAddressModeClamp;

// bind array to 3D texture

    CUDA_SAFE_CALL(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));

float* d_output = NULL;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_output, (width*height*sizeof(float)))); 

float *h_odata = NULL;

/* get a pointer to  the output  */

    const mwSize dims[]={width,height,n_i};

    plhs[0] = mxCreateNumericArray(3,dims,mxDOUBLE_CLASS,mxREAL);

    h_odata = (float *)mxGetPr(plhs[0]);

dim3 dimBlock1(8, 8, 1);

    dim3 dimGrid1((width+ dimBlock1.x -1) / dimBlock1.x, (height+ dimBlock1.y -1) / dimBlock1.y, 1);

    for (int i = 0; i < n_i; i++)

    { 

        // read in 3D texture one layer to get one image 

        ReadKernel<<< dimGrid1, dimBlock1 >>>( d_output, width, height, i);         

        CUDA_SAFE_CALL( cudaThreadSynchronize());

        CUDA_SAFE_CALL( cudaMemcpy(h_odata+i*width*height*sizeof(float),d_output, width*height*sizeof(float),cudaMemcpyDeviceToHost));  // memory copy

    }

    // clear memory

    CUDA_SAFE_CALL(cudaFree(d_output));

    CUDA_SAFE_CALL(cudaFreeArray(d_volumeArray));

}

 /code]

and the test .m code is just like this

[code]

m=imread('cameraman.tif');

m=imresize(m,[64 64]);

images(:,:,1) = m  ; % the first image

images(:,:,2) = m   ;  % the second  image

CC = test_3dtexture(images,2,64,64);

I checked line by line for many times but still could not find the error. Someone told me to add the unbind texture operation before cudaFree, but still crashes. Could anyone help me? Thank you!!!

soyvia · November 11, 2010, 5:00pm

I have a stack of 2d images. I want to use a 3D texture to store them and read them one by one in a kernel. But when I run the test .m file . The Matlab crashes. I do not know whether I missed something about 3D texture.

The CUDA code is just like this:

#include <stdlib.h>

#include <stdio.h>

#include <math.h>

#include "mex.h"

#include "matrix.h"

#include <cutil.h>

typedef unsigned int  uint;

texture<float, 3, cudaReadModeElementType> tex;  // 3D texture

cudaArray *d_volumeArray = 0;

/***************************************************************************

 * Read**Kernel : read one layer in 3D texture

 **************************************************************************/

__global__ void

ReadKernel(float *d_output, uint imageW, uint imageH, float w)

{

    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;

    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

    // read from 3D texture

    float voxel = tex3D(tex, x, y, w);

    if ((x < imageW) && (y < imageH)) {

        uint i = __umul24(y, imageW) + x;

        d_output[i] = voxel;

    }

}

void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) 

{       

float* images = (float *)mxGetPr(prhs[0]);

    int n_i = mxGetScalar(prhs[1]);

    int width = mxGetScalar(prhs[2]);

    int height = mxGetScalar(prhs[3]);

const cudaExtent volumeSize = make_cudaExtent( width, height, n_i);       // size for  images stack 

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    CUDA_SAFE_CALL( cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize) );

// copy data to 3D array

    cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)images, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);

    copyParams.dstArray = d_volumeArray;

    copyParams.extent   = volumeSize;

    copyParams.kind     = cudaMemcpyHostToDevice;

    CUDA_SAFE_CALL( cudaMemcpy3D(&copyParams) );

// set texture parameters

    tex.normalized = false;        

    tex.filterMode = cudaFilterModePoint;  //use cudaFilterModePoint filter mode to avoid fetching from different textures layers.

    tex.addressMode[0] = cudaAddressModeClamp;   

    tex.addressMode[1] = cudaAddressModeClamp;

    tex.addressMode[2] = cudaAddressModeClamp;

// bind array to 3D texture

    CUDA_SAFE_CALL(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));

float* d_output = NULL;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_output, (width*height*sizeof(float)))); 

float *h_odata = NULL;

/* get a pointer to  the output  */

    const mwSize dims[]={width,height,n_i};

    plhs[0] = mxCreateNumericArray(3,dims,mxDOUBLE_CLASS,mxREAL);

    h_odata = (float *)mxGetPr(plhs[0]);

dim3 dimBlock1(8, 8, 1);

    dim3 dimGrid1((width+ dimBlock1.x -1) / dimBlock1.x, (height+ dimBlock1.y -1) / dimBlock1.y, 1);

    for (int i = 0; i < n_i; i++)

    { 

        // read in 3D texture one layer to get one image 

        ReadKernel<<< dimGrid1, dimBlock1 >>>( d_output, width, height, i);         

        CUDA_SAFE_CALL( cudaThreadSynchronize());

        CUDA_SAFE_CALL( cudaMemcpy(h_odata+i*width*height*sizeof(float),d_output, width*height*sizeof(float),cudaMemcpyDeviceToHost));  // memory copy

    }

    // clear memory

    CUDA_SAFE_CALL(cudaFree(d_output));

    CUDA_SAFE_CALL(cudaFreeArray(d_volumeArray));

}

 /code]

and the test .m code is just like this

[code]

m=imread('cameraman.tif');

m=imresize(m,[64 64]);

images(:,:,1) = m  ; % the first image

images(:,:,2) = m   ;  % the second  image

CC = test_3dtexture(images,2,64,64);

I checked line by line for many times but still could not find the error. Someone told me to add the unbind texture operation before cudaFree, but still crashes. Could anyone help me? Thank you!!!

wanderine · November 20, 2010, 4:20pm

I don’t know if this makes any difference but you seem to call tex3D with two ints and one float

float w from function definition

uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
float voxel = tex3D(tex, x, y, w);

try

tex3D(tex, (float)x, (float)y, w);

wanderine · November 20, 2010, 4:20pm

I don’t know if this makes any difference but you seem to call tex3D with two ints and one float

float w from function definition

uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
float voxel = tex3D(tex, x, y, w);

try

tex3D(tex, (float)x, (float)y, w);

bitminer · January 10, 2011, 7:15pm

I know this is a dated post, but I am attempting to do the same thing and I ran into the error:

cudaErrorNoKernelImageForDevice

When calling cudaMemcpy3d. the dimensions of my data were 620x480x133 array of floats. Now the pitch calculated for this is over the 2048 limit as it is (6204 = 2480). If I try 512 for the first dimension (5124 = 2048) every thing works spectacularly. Ok so I figured this out not by looking at the obscure error code cudaErrorNoKernelImageForDevice, which means all kinds of nothing to me, but by looking back at the documentation.

Note the element type is “unsigned char” so for 640 floats of single type: sizeof(float) = 4

Ok so this is quazi documented. But I hate to be the one to ask this question… Why did cudaMalloc3D succeed? Only to fail on the copy which does document the 2048 limit. Seems to me this documentation is in the wrong place. Also what is the point of being able to allocate memory you can never effectively use (i.e. copy to/from host)? Is this some type of lazy programming where the memory is not actually allocated until it is requested by cudaMemcpy3D?

Also where I fail at creating 3D textures of this size I succeed at creating 2D textures where apparently (from looking at the documentation) there are no limits in the y direction (there are in the x due to ptich size), at least documented ones:

cudaError_t cudaMemcpy2D (void * dst, size_t dpitch, const void * src, size_t spitch, size_t width,

size_t height, enum cudaMemcpyKind kind)

Copies a matrix (height rows of width bytes each) from the memory area pointed to by src to the memory area

pointed to by dst, where kind is one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost,

or cudaMemcpyDeviceToDevice, and specifies the direction of the copy. dpitch and spitch are the

widths in memory in bytes of the 2D arrays pointed to by dst and src, including any padding added to the end of

each row. The memory areas may not overlap. width must not exceed either dpitch or spitch. Calling cudaMemcpy2D()

with dst and src pointers that do not match the direction of the copy results in an undefined behavior.

cudaMemcpy2D() returns an error if dpitch or spitch exceeds the maximum allowed.

which likely is:

Maximum memory pitch: 2147483647 bytes

for my Quatro 1700M found by deviceQuery.exe in gpu SDK or putting this logic in your code.

Anyway hope this helps.

Topic		Replies	Views
Matlab crashes when I use 3d texture in CUDA code CUDA Programming and Performance	21	10749	November 17, 2010
Error handling in CUDA when using cudaMemcpy3D CUDA Programming and Performance	12	11939	December 22, 2010
3D Texturing CUDA Programming and Performance	7	6164	August 8, 2008
3D texture problem CUDA Programming and Performance	0	2563	May 13, 2011
Second call to cudaMemcpy3D crashes my app First call is successful CUDA Programming and Performance	14	8843	November 30, 2010
invalid texture reference CUDA Programming and Performance	10	3499	June 16, 2009
Using textures from matlab Problems with texture CUDA Programming and Performance	0	2232	December 24, 2007
3D texture problem CUDA Programming and Performance	3	4875	January 8, 2011
cudaMemcpy3D issue when using with texture cache CUDA Programming and Performance	6	1972	April 11, 2010
Problem with cudaMemcpy3D Copy resoure to 3D Texture CUDA Programming and Performance	1	833	June 25, 2011

Matlab crashes when I use 3d texture in CUDA code

Related topics