Second call to cudaMemcpy3D crashes my app First call is successful

Hi,

I am trying to write CUDA code for processing some 3D images. All the examples in SDK work with only one 3D image. Is it possible to use two 3D images?

When I run the code below, I get following crash report:

Unhandled exception at 0x166f1f50 in Spine.exe: 0xC0000005: Access violation reading location 0x0000000008acb000.

0x0000000008acb000 does not correspond to any of the local variables.

#include <cutil_inline.h>

texture<float,3,cudaReadModeElementType> texIn, texGm;

cudaArray *arrIn, *arrGv, *arrGm;

__shared__ cudaExtent size;

__global__ void kernel(const float eps, float *gv, float *l, float *h)

{

	int idx = blockDim.x * blockIdx.x + threadIdx.x;

	if (idx>=size.width*size.height*size.depth)

		return; //over size

	int z=idx/(size.width*size.height);

	int y=(idx%(size.width*size.height))/size.width;

	int x=idx%size.width;

	l[idx]=tex3D(texIn, x, y, z); //for start, just copy input image to outputs

	h[idx]=tex3D(texGm, x, y, z);

}

//in is 3D scalar field, gm is gradient magnitude, gv is gradient vector

//space for l and h is allocated previously

void calcLHcuda(int xsize, int ysize, int zsize, const float eps, float *gm, float *gv, float *in, float *l, float *h)

{

	size.width=xsize;

	size.height=ysize;

	size.depth=zsize;

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

	cutilSafeCall( cudaMalloc3DArray  ( &arrIn, &channelDesc, size )); 

	cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)in, xsize*sizeof(float), xsize, ysize);

    copyParams.dstArray = arrIn;

    copyParams.extent   = size;

    copyParams.kind     = cudaMemcpyHostToDevice;

    cutilSafeCall( cudaMemcpy3D(&copyParams) );

	cutilCheckMsg("Failed to allocate first 3D image");

    //texIn.addressMode[0] = cudaAddressModeClamp; //Clamp is default

    texIn.filterMode = cudaFilterModeLinear;

    texIn.normalized = false;

    cutilSafeCall( cudaBindTextureToArray(texIn, arrIn) );

	cutilSafeCall( cudaMalloc3DArray  ( &arrGm, &channelDesc, size ));

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)gm, xsize*sizeof(float), xsize, ysize);

    copyParams.dstArray = arrGm;

    cutilSafeCall( cudaMemcpy3D(&copyParams) );

	cutilCheckMsg("Failed to allocate second 3D image");

    texGm.filterMode = cudaFilterModeLinear;

    texGm.normalized = false;

    cutilSafeCall( cudaBindTextureToArray(texGm, arrGm) );

	kernel<<< (xsize*ysize*zsize/32+1), 32 >>>(eps, gv, l, h);

	cutilSafeCall(cudaFreeArray(arrIn));

	cutilSafeCall(cudaFreeArray(arrGm));

}

calcLHcuda is the function which is called from the rest of my C++ code.

Can anyone shed some light on this, or give some suggestinos?

Regards,

Dženan

Hi,

I am trying to write CUDA code for processing some 3D images. All the examples in SDK work with only one 3D image. Is it possible to use two 3D images?

When I run the code below, I get following crash report:

Unhandled exception at 0x166f1f50 in Spine.exe: 0xC0000005: Access violation reading location 0x0000000008acb000.

0x0000000008acb000 does not correspond to any of the local variables.

#include <cutil_inline.h>

texture<float,3,cudaReadModeElementType> texIn, texGm;

cudaArray *arrIn, *arrGv, *arrGm;

__shared__ cudaExtent size;

__global__ void kernel(const float eps, float *gv, float *l, float *h)

{

	int idx = blockDim.x * blockIdx.x + threadIdx.x;

	if (idx>=size.width*size.height*size.depth)

		return; //over size

	int z=idx/(size.width*size.height);

	int y=(idx%(size.width*size.height))/size.width;

	int x=idx%size.width;

	l[idx]=tex3D(texIn, x, y, z); //for start, just copy input image to outputs

	h[idx]=tex3D(texGm, x, y, z);

}

//in is 3D scalar field, gm is gradient magnitude, gv is gradient vector

//space for l and h is allocated previously

void calcLHcuda(int xsize, int ysize, int zsize, const float eps, float *gm, float *gv, float *in, float *l, float *h)

{

	size.width=xsize;

	size.height=ysize;

	size.depth=zsize;

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

	cutilSafeCall( cudaMalloc3DArray  ( &arrIn, &channelDesc, size )); 

	cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)in, xsize*sizeof(float), xsize, ysize);

    copyParams.dstArray = arrIn;

    copyParams.extent   = size;

    copyParams.kind     = cudaMemcpyHostToDevice;

    cutilSafeCall( cudaMemcpy3D(&copyParams) );

	cutilCheckMsg("Failed to allocate first 3D image");

    //texIn.addressMode[0] = cudaAddressModeClamp; //Clamp is default

    texIn.filterMode = cudaFilterModeLinear;

    texIn.normalized = false;

    cutilSafeCall( cudaBindTextureToArray(texIn, arrIn) );

	cutilSafeCall( cudaMalloc3DArray  ( &arrGm, &channelDesc, size ));

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)gm, xsize*sizeof(float), xsize, ysize);

    copyParams.dstArray = arrGm;

    cutilSafeCall( cudaMemcpy3D(&copyParams) );

	cutilCheckMsg("Failed to allocate second 3D image");

    texGm.filterMode = cudaFilterModeLinear;

    texGm.normalized = false;

    cutilSafeCall( cudaBindTextureToArray(texGm, arrGm) );

	kernel<<< (xsize*ysize*zsize/32+1), 32 >>>(eps, gv, l, h);

	cutilSafeCall(cudaFreeArray(arrIn));

	cutilSafeCall(cudaFreeArray(arrGm));

}

calcLHcuda is the function which is called from the rest of my C++ code.

Can anyone shed some light on this, or give some suggestinos?

Regards,

Dženan

I don’t see anything glaringly obvious from glancing at it, but you might want to try using a different variable name for the copyParams for the second allocation.
“copyParams2” or something, just to make sure you’re not including things you do not want.

I don’t see anything glaringly obvious from glancing at it, but you might want to try using a different variable name for the copyParams for the second allocation.
“copyParams2” or something, just to make sure you’re not including things you do not want.

I tried that already. Moreover, I called the variable copyParams2 External Image

I tried that already. Moreover, I called the variable copyParams2 External Image

Where does it choke?

Is it your kernel call?

If you are trying to have too many thread blocks (>65535) it will not work.

Where does it choke?

Is it your kernel call?

If you are trying to have too many thread blocks (>65535) it will not work.

The execution does not even get to that line. But thanks for the hint.

I have been debugging my code, and this is my current version:

#include <cutil_inline.h>

texture<float,3,cudaReadModeElementType> texIn, texGm;

cudaArray *arrIn, *arrGv, *arrGm;

__global__ void kernel(const unsigned size, const cudaExtent size3, float *l, float *h)

{

    unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;

    if (idx>=size)

        return; //over size

    unsigned z=idx/(size3.width*size3.height);

    unsigned y=(idx%(size3.width*size3.height))/size3.width;

    unsigned x=idx%size3.width;

    l[idx]=tex3D(texIn, x+0.5, y+0.5, z+0.5); //for start, just copy input image to outputs

    h[idx]=tex3D(texGm, x+0.5, y+0.5, z+0.5);

}

//in is 3D scalar field, gm is gradient magnitude, gv is gradient vector

//space for l and h is allocated

void calcLHcuda(unsigned xsize, unsigned ysize, unsigned zsize, const float eps, float *gm, float *gv, float *in, float *l, float *h)

{

    unsigned size=xsize*ysize*zsize;

    unsigned byteSize=size*sizeof(float);

    cudaExtent size3;

    size3.width=xsize;

    size3.height=ysize;

    size3.depth=zsize;

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

cudaMalloc3DArray( &arrIn, &channelDesc, size3 );

    cudaMemcpy3DParms copyParams1 = {0};

    copyParams1.srcPtr   = make_cudaPitchedPtr((void*)in, xsize*sizeof(float), xsize, ysize);

    copyParams1.dstArray = arrIn;

    copyParams1.extent   = size3;

    copyParams1.kind     = cudaMemcpyHostToDevice;

    cudaMemcpy3D(&copyParams1);

    cutilCheckMsg("Failed to allocate first 3D image");

    texIn.filterMode = cudaFilterModeLinear;

    texIn.normalized = false;

    cudaBindTextureToArray(texIn, arrIn);

//cudaMalloc3DArray( &arrGm, &channelDesc, size3 );

    //cudaMemcpy3DParms copyParams2 = {0};

    //copyParams2.srcPtr   = make_cudaPitchedPtr((void*)gm, xsize*sizeof(float), xsize, ysize);

    //copyParams2.dstArray = arrGm;

    //copyParams2.extent   = size3;

    //copyParams2.kind     = cudaMemcpyHostToDevice;

    //cudaMemcpy3D(&copyParams2);

    //cutilCheckMsg("Failed to allocate second 3D image");

    //texGm.filterMode = cudaFilterModeLinear;

    //texGm.normalized = false;

    //cudaBindTextureToArray(texGm, arrGm);

float *gpuL, *gpuH;

    cudaMalloc((void**)&gpuL, byteSize);

    cudaMalloc((void**)&gpuH, byteSize);

    kernel<<< ceil(size/256.0), 256 >>>(size, size3, gpuL, gpuH);

    cudaMemcpy(l, gpuL, byteSize, cudaMemcpyDeviceToHost);

    cudaMemcpy(h, gpuH, byteSize, cudaMemcpyDeviceToHost);

//verify

    float maxDif=0;

    for (unsigned i=0; i<size; i++)

        if (fabs(in[i]-l[i])>maxDif)

            maxDif=fabs(in[i]-l[i]);

    //maxDif=0;

    //for (unsigned i=0; i<size; i++)

    //	if (fabs(gm[i]-h[i])>maxDif)

    //		maxDif=fabs(gm[i]-h[i]);

cudaFree(gpuL);

    cudaFree(gpuH);

    cudaFreeArray(arrIn);

    //cudaFreeArray(arrGm);

}

At this point verification (near the end of function) succeeds. “h” image is filled with zeros. However, if I uncomment the code related to second texture, the app still crashes on [font=“Courier New”]cudaMemcpy3D(&copyParams2);[/font]

Am I doing something wrong, or is this just a plain old bug?

Regards,

Dženan

You’re not unbinding your texture.

Thank you for taking interest, but if I unbind the first texture, will it still be accessible from within the kernel? Or did you mean at the end, together with frees? Because the execution doesn’t reach frees if the code is uncommented.

Sorry, should have looked a little closer.

Not sure what the effect is of your calling

cudaBindTextureToArray(texIn, arrIn);

instead of the usual

cudaBindTextureToArray(texIn, arrIn, channelDesc);

You’ve pulled out the error checking. I assume the 2nd malloc is successful before cudaMemcpy3D(&copyParams2);

If I remember correctly, channelDesc is taken from array if it is not provided in to call to bind. But I will try your idea tomorrow when I get back to work.

Incredible! That solved the problem. Thank you DittoAway!

Edit: copyParams variable can be reused without problem (as given in the initial example).