Second call to cudaMemcpy3D crashes my app First call is successful

DA_enan · November 26, 2010, 3:06pm

Hi,

I am trying to write CUDA code for processing some 3D images. All the examples in SDK work with only one 3D image. Is it possible to use two 3D images?

When I run the code below, I get following crash report:

Unhandled exception at 0x166f1f50 in Spine.exe: 0xC0000005: Access violation reading location 0x0000000008acb000.

0x0000000008acb000 does not correspond to any of the local variables.

#include <cutil_inline.h>

texture<float,3,cudaReadModeElementType> texIn, texGm;

cudaArray *arrIn, *arrGv, *arrGm;

__shared__ cudaExtent size;

__global__ void kernel(const float eps, float *gv, float *l, float *h)

{

	int idx = blockDim.x * blockIdx.x + threadIdx.x;

	if (idx>=size.width*size.height*size.depth)

		return; //over size

	int z=idx/(size.width*size.height);

	int y=(idx%(size.width*size.height))/size.width;

	int x=idx%size.width;

	l[idx]=tex3D(texIn, x, y, z); //for start, just copy input image to outputs

	h[idx]=tex3D(texGm, x, y, z);

}

//in is 3D scalar field, gm is gradient magnitude, gv is gradient vector

//space for l and h is allocated previously

void calcLHcuda(int xsize, int ysize, int zsize, const float eps, float *gm, float *gv, float *in, float *l, float *h)

{

	size.width=xsize;

	size.height=ysize;

	size.depth=zsize;

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

	cutilSafeCall( cudaMalloc3DArray  ( &arrIn, &channelDesc, size )); 

	cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)in, xsize*sizeof(float), xsize, ysize);

    copyParams.dstArray = arrIn;

    copyParams.extent   = size;

    copyParams.kind     = cudaMemcpyHostToDevice;

    cutilSafeCall( cudaMemcpy3D(&copyParams) );

	cutilCheckMsg("Failed to allocate first 3D image");

    //texIn.addressMode[0] = cudaAddressModeClamp; //Clamp is default

    texIn.filterMode = cudaFilterModeLinear;

    texIn.normalized = false;

    cutilSafeCall( cudaBindTextureToArray(texIn, arrIn) );

	cutilSafeCall( cudaMalloc3DArray  ( &arrGm, &channelDesc, size ));

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)gm, xsize*sizeof(float), xsize, ysize);

    copyParams.dstArray = arrGm;

    cutilSafeCall( cudaMemcpy3D(&copyParams) );

	cutilCheckMsg("Failed to allocate second 3D image");

    texGm.filterMode = cudaFilterModeLinear;

    texGm.normalized = false;

    cutilSafeCall( cudaBindTextureToArray(texGm, arrGm) );

	kernel<<< (xsize*ysize*zsize/32+1), 32 >>>(eps, gv, l, h);

	cutilSafeCall(cudaFreeArray(arrIn));

	cutilSafeCall(cudaFreeArray(arrGm));

}

calcLHcuda is the function which is called from the rest of my C++ code.

Can anyone shed some light on this, or give some suggestinos?

Regards,

DÅ¾enan

DA_enan · November 26, 2010, 3:06pm

Hi,

I am trying to write CUDA code for processing some 3D images. All the examples in SDK work with only one 3D image. Is it possible to use two 3D images?

When I run the code below, I get following crash report:

Unhandled exception at 0x166f1f50 in Spine.exe: 0xC0000005: Access violation reading location 0x0000000008acb000.

0x0000000008acb000 does not correspond to any of the local variables.

#include <cutil_inline.h>

texture<float,3,cudaReadModeElementType> texIn, texGm;

cudaArray *arrIn, *arrGv, *arrGm;

__shared__ cudaExtent size;

__global__ void kernel(const float eps, float *gv, float *l, float *h)

{

	int idx = blockDim.x * blockIdx.x + threadIdx.x;

	if (idx>=size.width*size.height*size.depth)

		return; //over size

	int z=idx/(size.width*size.height);

	int y=(idx%(size.width*size.height))/size.width;

	int x=idx%size.width;

	l[idx]=tex3D(texIn, x, y, z); //for start, just copy input image to outputs

	h[idx]=tex3D(texGm, x, y, z);

}

//in is 3D scalar field, gm is gradient magnitude, gv is gradient vector

//space for l and h is allocated previously

void calcLHcuda(int xsize, int ysize, int zsize, const float eps, float *gm, float *gv, float *in, float *l, float *h)

{

	size.width=xsize;

	size.height=ysize;

	size.depth=zsize;

cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

	cutilSafeCall( cudaMalloc3DArray  ( &arrIn, &channelDesc, size )); 

	cudaMemcpy3DParms copyParams = {0};

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)in, xsize*sizeof(float), xsize, ysize);

    copyParams.dstArray = arrIn;

    copyParams.extent   = size;

    copyParams.kind     = cudaMemcpyHostToDevice;

    cutilSafeCall( cudaMemcpy3D(&copyParams) );

	cutilCheckMsg("Failed to allocate first 3D image");

    //texIn.addressMode[0] = cudaAddressModeClamp; //Clamp is default

    texIn.filterMode = cudaFilterModeLinear;

    texIn.normalized = false;

    cutilSafeCall( cudaBindTextureToArray(texIn, arrIn) );

	cutilSafeCall( cudaMalloc3DArray  ( &arrGm, &channelDesc, size ));

    copyParams.srcPtr   = make_cudaPitchedPtr((void*)gm, xsize*sizeof(float), xsize, ysize);

    copyParams.dstArray = arrGm;

    cutilSafeCall( cudaMemcpy3D(&copyParams) );

	cutilCheckMsg("Failed to allocate second 3D image");

    texGm.filterMode = cudaFilterModeLinear;

    texGm.normalized = false;

    cutilSafeCall( cudaBindTextureToArray(texGm, arrGm) );

	kernel<<< (xsize*ysize*zsize/32+1), 32 >>>(eps, gv, l, h);

	cutilSafeCall(cudaFreeArray(arrIn));

	cutilSafeCall(cudaFreeArray(arrGm));

}

calcLHcuda is the function which is called from the rest of my C++ code.

Can anyone shed some light on this, or give some suggestinos?

Regards,

DÅ¾enan

fna · November 28, 2010, 2:26am

I don’t see anything glaringly obvious from glancing at it, but you might want to try using a different variable name for the copyParams for the second allocation.
“copyParams2” or something, just to make sure you’re not including things you do not want.

fna · November 28, 2010, 2:26am

I don’t see anything glaringly obvious from glancing at it, but you might want to try using a different variable name for the copyParams for the second allocation.
“copyParams2” or something, just to make sure you’re not including things you do not want.

DA_enan · November 28, 2010, 9:12am

I tried that already. Moreover, I called the variable copyParams2 External Image

DA_enan · November 28, 2010, 9:12am

I tried that already. Moreover, I called the variable copyParams2 External Image

fna · November 28, 2010, 3:10pm

Where does it choke?

Is it your kernel call?

If you are trying to have too many thread blocks (>65535) it will not work.

fna · November 28, 2010, 3:10pm

Where does it choke?

Is it your kernel call?

If you are trying to have too many thread blocks (>65535) it will not work.

DA_enan · November 28, 2010, 10:36pm

The execution does not even get to that line. But thanks for the hint.

DA_enan · November 29, 2010, 3:53pm

I have been debugging my code, and this is my current version:

#include <cutil_inline.h>

texture<float,3,cudaReadModeElementType> texIn, texGm;

cudaArray *arrIn, *arrGv, *arrGm;

__global__ void kernel(const unsigned size, const cudaExtent size3, float *l, float *h)

{

    unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;

    if (idx>=size)

        return; //over size

    unsigned z=idx/(size3.width*size3.height);

    unsigned y=(idx%(size3.width*size3.height))/size3.width;

    unsigned x=idx%size3.width;

    l[idx]=tex3D(texIn, x+0.5, y+0.5, z+0.5); //for start, just copy input image to outputs

    h[idx]=tex3D(texGm, x+0.5, y+0.5, z+0.5);

}

//in is 3D scalar field, gm is gradient magnitude, gv is gradient vector

//space for l and h is allocated

void calcLHcuda(unsigned xsize, unsigned ysize, unsigned zsize, const float eps, float *gm, float *gv, float *in, float *l, float *h)

{

    unsigned size=xsize*ysize*zsize;

    unsigned byteSize=size*sizeof(float);

    cudaExtent size3;

    size3.width=xsize;

    size3.height=ysize;

    size3.depth=zsize;

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

cudaMalloc3DArray( &arrIn, &channelDesc, size3 );

    cudaMemcpy3DParms copyParams1 = {0};

    copyParams1.srcPtr   = make_cudaPitchedPtr((void*)in, xsize*sizeof(float), xsize, ysize);

    copyParams1.dstArray = arrIn;

    copyParams1.extent   = size3;

    copyParams1.kind     = cudaMemcpyHostToDevice;

    cudaMemcpy3D(&copyParams1);

    cutilCheckMsg("Failed to allocate first 3D image");

    texIn.filterMode = cudaFilterModeLinear;

    texIn.normalized = false;

    cudaBindTextureToArray(texIn, arrIn);

//cudaMalloc3DArray( &arrGm, &channelDesc, size3 );

    //cudaMemcpy3DParms copyParams2 = {0};

    //copyParams2.srcPtr   = make_cudaPitchedPtr((void*)gm, xsize*sizeof(float), xsize, ysize);

    //copyParams2.dstArray = arrGm;

    //copyParams2.extent   = size3;

    //copyParams2.kind     = cudaMemcpyHostToDevice;

    //cudaMemcpy3D(&copyParams2);

    //cutilCheckMsg("Failed to allocate second 3D image");

    //texGm.filterMode = cudaFilterModeLinear;

    //texGm.normalized = false;

    //cudaBindTextureToArray(texGm, arrGm);

float *gpuL, *gpuH;

    cudaMalloc((void**)&gpuL, byteSize);

    cudaMalloc((void**)&gpuH, byteSize);

    kernel<<< ceil(size/256.0), 256 >>>(size, size3, gpuL, gpuH);

    cudaMemcpy(l, gpuL, byteSize, cudaMemcpyDeviceToHost);

    cudaMemcpy(h, gpuH, byteSize, cudaMemcpyDeviceToHost);

//verify

    float maxDif=0;

    for (unsigned i=0; i<size; i++)

        if (fabs(in[i]-l[i])>maxDif)

            maxDif=fabs(in[i]-l[i]);

    //maxDif=0;

    //for (unsigned i=0; i<size; i++)

    //	if (fabs(gm[i]-h[i])>maxDif)

    //		maxDif=fabs(gm[i]-h[i]);

cudaFree(gpuL);

    cudaFree(gpuH);

    cudaFreeArray(arrIn);

    //cudaFreeArray(arrGm);

}

At this point verification (near the end of function) succeeds. “h” image is filled with zeros. However, if I uncomment the code related to second texture, the app still crashes on [font=“Courier New”]cudaMemcpy3D(&copyParams2);[/font]

Am I doing something wrong, or is this just a plain old bug?

Regards,

DÅ¾enan

Dittoaway · November 29, 2010, 6:45pm

You’re not unbinding your texture.

DA_enan · November 29, 2010, 7:25pm

Thank you for taking interest, but if I unbind the first texture, will it still be accessible from within the kernel? Or did you mean at the end, together with frees? Because the execution doesn’t reach frees if the code is uncommented.

Dittoaway · November 29, 2010, 7:57pm

Sorry, should have looked a little closer.

Not sure what the effect is of your calling

cudaBindTextureToArray(texIn, arrIn);

instead of the usual

cudaBindTextureToArray(texIn, arrIn, channelDesc);

You’ve pulled out the error checking. I assume the 2nd malloc is successful before cudaMemcpy3D(&copyParams2);

DA_enan · November 29, 2010, 8:02pm

If I remember correctly, channelDesc is taken from array if it is not provided in to call to bind. But I will try your idea tomorrow when I get back to work.

DA_enan · November 30, 2010, 8:19am

Incredible! That solved the problem. Thank you DittoAway!

Edit: copyParams variable can be reused without problem (as given in the initial example).

Topic		Replies	Views
Using Textures CUDA Programming and Performance	10	21841	March 29, 2007
Matlab crashes when I use 3d texture in CUDA code CUDA Programming and Performance	4	1281	January 10, 2011
Matlab crashes when I use 3d texture in CUDA code CUDA Programming and Performance	21	10741	November 17, 2010
Reading R8G8B8A8 texture using tex2D() causes strange result. CUDA Programming and Performance	27	2907	April 28, 2018
cudaMemcpy problem problem with using cudaMemcpy CUDA Programming and Performance	12	5909	January 5, 2010
CudaAPI calls in functions, compiler/linking bug? CUDA Programming and Performance	6	374	August 17, 2023
Error handling in CUDA when using cudaMemcpy3D CUDA Programming and Performance	12	11929	December 22, 2010
Troubles converting float array to float2 array CUDA Programming and Performance	11	3550	July 3, 2017
2D Texture access How can I access pixels from 2D texture CUDA Programming and Performance	24	18493	June 5, 2010
Simplest texture 2D examples CUDA Programming and Performance	11	11514	March 26, 2019

Second call to cudaMemcpy3D crashes my app First call is successful

Related topics