Matlab crashes when I use 3d texture in CUDA code

I have a stack of 2d images. I want to use a 3D texture to store them and read them one by one in a kernel. But when I run the test .m file . The Matlab crashes. I do not know whether I missed something about 3D texture.

The CUDA code is just like this:

#include <stdlib.h>

#include <stdio.h>

#include <math.h>

#include "mex.h"

#include "matrix.h"

#include <cutil.h>

typedef unsigned int  uint;

texture<float, 3, cudaReadModeElementType> tex;  // 3D texture

cudaArray *d_volumeArray = 0;

/***************************************************************************

 * Read**Kernel : read one layer in 3D texture

 **************************************************************************/

__global__ void

ReadKernel(float *d_output, uint imageW, uint imageH, float w)

{

    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;

    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

    // read from 3D texture

    float voxel = tex3D(tex, x, y, w);

    if ((x < imageW) && (y < imageH)) {

        uint i = __umul24(y, imageW) + x;

        d_output[i] = voxel;

    }

}

void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) 

{	

	

    float* images = (float *)mxGetPr(prhs[0]);

    int n_i = mxGetScalar(prhs[1]);

    int width = mxGetScalar(prhs[2]);

    int height = mxGetScalar(prhs[3]);

const cudaExtent volumeSize = make_cudaExtent( width, height, n_i);       // size for  images stack 

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    CUDA_SAFE_CALL( cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize) );

// copy data to 3D array

    cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)images, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);

    copyParams.dstArray = d_volumeArray;

    copyParams.extent   = volumeSize;

    copyParams.kind     = cudaMemcpyHostToDevice;

    CUDA_SAFE_CALL( cudaMemcpy3D(&copyParams) );

// set texture parameters

    tex.normalized = false;        

    tex.filterMode = cudaFilterModePoint;  //use cudaFilterModePoint filter mode to avoid fetching from different textures layers.

    tex.addressMode[0] = cudaAddressModeClamp;   

    tex.addressMode[1] = cudaAddressModeClamp;

    tex.addressMode[2] = cudaAddressModeClamp;

// bind array to 3D texture

    CUDA_SAFE_CALL(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));

float* d_output = NULL;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_output, (width*height*sizeof(float)))); 

float *h_odata = NULL;

/* get a pointer to  the output  */

    const mwSize dims[]={width,height,n_i};

    plhs[0] = mxCreateNumericArray(3,dims,mxDOUBLE_CLASS,mxREAL);

    h_odata = (float *)mxGetPr(plhs[0]);

dim3 dimBlock1(8, 8, 1);

    dim3 dimGrid1((width+ dimBlock1.x -1) / dimBlock1.x, (height+ dimBlock1.y -1) / dimBlock1.y, 1);

    for (int i = 0; i < n_i; i++)

    { 

        // read in 3D texture one layer to get one image 

        ReadKernel<<< dimGrid1, dimBlock1 >>>( d_output, width, height, i);         

        CUDA_SAFE_CALL( cudaThreadSynchronize());

        CUDA_SAFE_CALL( cudaMemcpy(h_odata+i*width*height*sizeof(float),d_output, width*height*sizeof(float),cudaMemcpyDeviceToHost));  // memory copy

    }

    // clear memory

    CUDA_SAFE_CALL(cudaFree(d_output));

    CUDA_SAFE_CALL(cudaFreeArray(d_volumeArray));

}

and the test .m code is just like this

m=imread('cameraman.tif');

m=imresize(m,[64 64]);

images(:,:,1) = m  ; % the first image

images(:,:,2) = m   ;  % the second  image

CC = test_3dtexture(images,2,64,64);

Could anyone help me? Thank you !!

I have a stack of 2d images. I want to use a 3D texture to store them and read them one by one in a kernel. But when I run the test .m file . The Matlab crashes. I do not know whether I missed something about 3D texture.

The CUDA code is just like this:

#include <stdlib.h>

#include <stdio.h>

#include <math.h>

#include "mex.h"

#include "matrix.h"

#include <cutil.h>

typedef unsigned int  uint;

texture<float, 3, cudaReadModeElementType> tex;  // 3D texture

cudaArray *d_volumeArray = 0;

/***************************************************************************

 * Read**Kernel : read one layer in 3D texture

 **************************************************************************/

__global__ void

ReadKernel(float *d_output, uint imageW, uint imageH, float w)

{

    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;

    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

    // read from 3D texture

    float voxel = tex3D(tex, x, y, w);

    if ((x < imageW) && (y < imageH)) {

        uint i = __umul24(y, imageW) + x;

        d_output[i] = voxel;

    }

}

void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) 

{	

	

    float* images = (float *)mxGetPr(prhs[0]);

    int n_i = mxGetScalar(prhs[1]);

    int width = mxGetScalar(prhs[2]);

    int height = mxGetScalar(prhs[3]);

const cudaExtent volumeSize = make_cudaExtent( width, height, n_i);       // size for  images stack 

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    CUDA_SAFE_CALL( cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize) );

// copy data to 3D array

    cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)images, volumeSize.width*sizeof(float), volumeSize.width, volumeSize.height);

    copyParams.dstArray = d_volumeArray;

    copyParams.extent   = volumeSize;

    copyParams.kind     = cudaMemcpyHostToDevice;

    CUDA_SAFE_CALL( cudaMemcpy3D(&copyParams) );

// set texture parameters

    tex.normalized = false;        

    tex.filterMode = cudaFilterModePoint;  //use cudaFilterModePoint filter mode to avoid fetching from different textures layers.

    tex.addressMode[0] = cudaAddressModeClamp;   

    tex.addressMode[1] = cudaAddressModeClamp;

    tex.addressMode[2] = cudaAddressModeClamp;

// bind array to 3D texture

    CUDA_SAFE_CALL(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));

float* d_output = NULL;

    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_output, (width*height*sizeof(float)))); 

float *h_odata = NULL;

/* get a pointer to  the output  */

    const mwSize dims[]={width,height,n_i};

    plhs[0] = mxCreateNumericArray(3,dims,mxDOUBLE_CLASS,mxREAL);

    h_odata = (float *)mxGetPr(plhs[0]);

dim3 dimBlock1(8, 8, 1);

    dim3 dimGrid1((width+ dimBlock1.x -1) / dimBlock1.x, (height+ dimBlock1.y -1) / dimBlock1.y, 1);

    for (int i = 0; i < n_i; i++)

    { 

        // read in 3D texture one layer to get one image 

        ReadKernel<<< dimGrid1, dimBlock1 >>>( d_output, width, height, i);         

        CUDA_SAFE_CALL( cudaThreadSynchronize());

        CUDA_SAFE_CALL( cudaMemcpy(h_odata+i*width*height*sizeof(float),d_output, width*height*sizeof(float),cudaMemcpyDeviceToHost));  // memory copy

    }

    // clear memory

    CUDA_SAFE_CALL(cudaFree(d_output));

    CUDA_SAFE_CALL(cudaFreeArray(d_volumeArray));

}

and the test .m code is just like this

m=imread('cameraman.tif');

m=imresize(m,[64 64]);

images(:,:,1) = m  ; % the first image

images(:,:,2) = m   ;  % the second  image

CC = test_3dtexture(images,2,64,64);

Could anyone help me? Thank you !!

Not 100% sure, but I think it is probably because you don’t unbind the texture before cudaFree-ing the device memory.

I always put some mexPrintf’s in my mexfunction to see where it goes wrong exactly.

Not 100% sure, but I think it is probably because you don’t unbind the texture before cudaFree-ing the device memory.

I always put some mexPrintf’s in my mexfunction to see where it goes wrong exactly.

Thank you for your reply. But it seems that there is no such unbind operation in SDK code: simpleTexture3D either.

And I do put some mexPrintf’s in it , just delete them all before I post it here. Anyway, I will try your suggestion. Thank you.

Thank you for your reply. But it seems that there is no such unbind operation in SDK code: simpleTexture3D either.

And I do put some mexPrintf’s in it , just delete them all before I post it here. Anyway, I will try your suggestion. Thank you.

I added the unbind operation before cudaFree. But still crashes. External Image

I added the unbind operation before cudaFree. But still crashes. External Image

Just try linking your example with -lcuda…!! Might work… :) Please share the outcome of this trial.!

Just try linking your example with -lcuda…!! Might work… :) Please share the outcome of this trial.!

And where exactly did it crash? binding the texture, running the kernel, or later on when you free the memory?

And where exactly did it crash? binding the texture, running the kernel, or later on when you free the memory?

Thank you very much for your reply. But I have already had -lcuda in the makefile. External Image

Thank you very much for your reply. But I have already had -lcuda in the makefile. External Image

Thank you so much!

I have put some mexPrintf in several locations. Some before the binding operation, just show some input arguments. One after the binding, the other one before the free operation. But the crash happened when I run the test .m file. Nothing output meaning no mexPrintf lines works.

Some detail crash information is just like this:

EException Type: EXC_CRASH (SIGABRT)

Exception Codes: 0x0000000000000000, 0x0000000000000000

Crashed Thread: 3

Application Specific Information:

*** error for object 0x151828608: incorrect checksum for freed object - object was probably modified after being freed.

Could you help me? Thank you again.

Thank you so much!

I have put some mexPrintf in several locations. Some before the binding operation, just show some input arguments. One after the binding, the other one before the free operation. But the crash happened when I run the test .m file. Nothing output meaning no mexPrintf lines works.

Some detail crash information is just like this:

EException Type: EXC_CRASH (SIGABRT)

Exception Codes: 0x0000000000000000, 0x0000000000000000

Crashed Thread: 3

Application Specific Information:

*** error for object 0x151828608: incorrect checksum for freed object - object was probably modified after being freed.

Could you help me? Thank you again.

I will have to try it, but I’ll not be at work for another two days… If you take away the kernel call from you .cu file do you then get any mexPrintf output? It looks like you free an mxArray before you use it, but it is not apparant from the code where that would happen.

One thing I do notice is that you assume your input from matlab is float, but it is actually double. Also you allocate output as mxDOUBLE_CLASS and then cast it to a float pointer. That is not going to work.

You can call your function like this CC = test_3dtexture(single(images),2,64,64); and allocate an mxSINGLE_CLASS array to not have to cast all your data.

Also another tip is to use mxGetDimensions to know the input sizes, it is less error prone in the end, but not the reason for your trouble.

I will have to try it, but I’ll not be at work for another two days… If you take away the kernel call from you .cu file do you then get any mexPrintf output? It looks like you free an mxArray before you use it, but it is not apparant from the code where that would happen.

One thing I do notice is that you assume your input from matlab is float, but it is actually double. Also you allocate output as mxDOUBLE_CLASS and then cast it to a float pointer. That is not going to work.

You can call your function like this CC = test_3dtexture(single(images),2,64,64); and allocate an mxSINGLE_CLASS array to not have to cast all your data.

Also another tip is to use mxGetDimensions to know the input sizes, it is less error prone in the end, but not the reason for your trouble.

Thank you so very much for your suggestions. I followed what you told me, and it did not crash. However, the result is wrong. Then I checked the code and modified the memcpy operation after the kernel invocation

change from

    CUDA_SAFE_CALL( cudaMemcpy(h_odata+(i*width*height*sizeof(float)),d_output, width*height*sizeof(float),cudaMemcpyDeviceToHost));  // memory copy

to

CUDA_SAFE_CALL( cudaMemcpy(h_odata+(iwidthheight),d_output, widthheightsizeof(float),cudaMemcpyDeviceToHost)); // memory copy

It works. Though I am not sure the reason.

Thank you again!!!

Thank you so very much for your suggestions. I followed what you told me, and it did not crash. However, the result is wrong. Then I checked the code and modified the memcpy operation after the kernel invocation

change from

    CUDA_SAFE_CALL( cudaMemcpy(h_odata+(i*width*height*sizeof(float)),d_output, width*height*sizeof(float),cudaMemcpyDeviceToHost));  // memory copy

to

CUDA_SAFE_CALL( cudaMemcpy(h_odata+(iwidthheight),d_output, widthheightsizeof(float),cudaMemcpyDeviceToHost)); // memory copy

It works. Though I am not sure the reason.

Thank you again!!!