Problem with loop and Shared memory

Hi!

I want to describe two implementations of a simple function that copies the elements of one array into another. I have run into something strange: one of the implementations works fine, but with the second one I get “unspecified launch failure”.

The problem is that these two functions (ProcessSuccess() and ProcessFail()) look equivalent, and I can’t understand why the second one doesn’t work. Gurus, please help a newbie!

Thanks a lot!

#include <stdio.h>

#include <string.h>

#include <cutil.h>

// Size in bytes of one PixelType element (also its alignment).
#define ALIGN 8

// Threads per block used for the kernel launch and for kernel indexing.
#define THREAD_N 128

// Number of blocks in the launch grid.
#define BLOCK_N 16

// Total buffer size in bytes: every thread handles 2 elements of ALIGN bytes.
// The expansion is parenthesized so the macro behaves as a single value in any
// expression (e.g. `x % DATASIZE` or `x / DATASIZE`).
#define DATASIZE (2 * BLOCK_N * THREAD_N * ALIGN)

// PixelType: one pixel stored as ALIGN raw bytes, with the type itself
// aligned to ALIGN bytes. The union tag `un` is kept for existing users.
typedef union __align__(ALIGN) un
{
	unsigned char c[ALIGN];
} PixelType;

// Produces one pseudo-random byte: rand() is divided by RAND_MAX in float
// arithmetic, scaled by 255 and truncated to unsigned char.
unsigned char RandChar()
{
    return static_cast<unsigned char>(
        static_cast<float>(rand()) / static_cast<float>(RAND_MAX) * 255);
}

/*
 * ProcessSuccess: every thread copies two consecutive PixelType elements from
 * d_frame1 to d_frame2, staging them in dynamic shared memory on the way.
 * This is the loop-based variant that the post reports as working.
 *
 *   d_frame2  destination array (written)
 *   d_frame1  source array (read)
 *
 * Launch requirements visible from the code:
 *   - The global offset is computed with the THREAD_N macro, not blockDim.x,
 *     so blocks only cover disjoint, contiguous ranges when the kernel is
 *     launched with THREAD_N threads per block.
 *   - `shared` is an extern (dynamically sized) array; the launch must provide
 *     room for at least 2 * (threads per block) PixelType elements.
 *   - There is no bounds check: both arrays must hold at least
 *     2 * gridDim.x * THREAD_N elements.
 */
__global__ void ProcessSuccess(PixelType* d_frame2, PixelType* d_frame1)

{

	extern __shared__ PixelType shared[];

	// Index of this thread's first element in the global arrays.
	int num = 2*((blockIdx.x*THREAD_N)+threadIdx.x);

	// Index of this thread's first slot in the block's shared array.
	int tid = 2*(threadIdx.x);

	// Stage this thread's two elements: global -> shared.
	for (int i = 0; i<2; i++)

  shared[tid+i] = d_frame1[num+i];

	// Write them out: shared -> global. Each thread reads back only the two
	// slots it wrote itself, so no barrier separates the two loops.
	for (int i = 0; i<2; i++)

  d_frame2[num+i] = shared[tid+i];

}

/*
 * ProcessFail: every thread copies two consecutive PixelType elements from
 * d_frame1 to d_frame2 through dynamic shared memory, with the two copies
 * written out as explicit statements instead of loops. This is the variant
 * the post reports as ending in "unspecified launch failure".
 *
 *   d_frame2  destination array (written)
 *   d_frame1  source array (read)
 *
 * Launch requirements visible from the code:
 *   - The global offset is computed with the THREAD_N macro, not blockDim.x,
 *     so blocks only cover disjoint, contiguous ranges when the kernel is
 *     launched with THREAD_N threads per block.
 *   - `shared` is an extern (dynamically sized) array; the launch must provide
 *     room for at least 2 * (threads per block) PixelType elements.
 *   - There is no bounds check: both arrays must hold at least
 *     2 * gridDim.x * THREAD_N elements.
 *
 * NOTE(review): the statements below touch exactly the same indices as the
 * loop-based variant, so the reported failure is not explained by the source
 * alone. Presumably it comes from how the compiler lowers the adjacent 8-byte
 * copies (e.g. merging them into wider accesses with stricter alignment) --
 * confirm by comparing the generated PTX of both kernels.
 */
__global__ void ProcessFail(PixelType* d_frame2, PixelType* d_frame1)

{

	extern __shared__ PixelType shared[];

	// Index of this thread's first element in the global arrays.
	int num = 2*((blockIdx.x*THREAD_N)+threadIdx.x);

	// Index of this thread's first slot in the block's shared array.
	int tid = 2*(threadIdx.x);

	// Stage this thread's two elements: global -> shared.
	shared[tid] = d_frame1[num];

	shared[tid+1] = d_frame1[num+1];

	// Write them out: shared -> global. Each thread reads back only the two
	// slots it wrote itself.
	d_frame2[num] = shared[tid];

	d_frame2[num+1] = shared[tid+1];	

}

/*
 * Host driver for the shared-memory copy test.
 *
 * Fills two host buffers with random bytes, uploads them, runs the copy kernel
 * (d_buffcur -> d_buffnext), downloads both buffers and checks that they are
 * byte-for-byte identical.
 *
 * Prints the kernel status followed by "TEST PASSED" or "TEST FAILED".
 * Returns 0 after a completed run and 1 if the host buffers cannot be
 * allocated.
 */
int main(int argc, char *argv[])
{
	(void)argc;
	(void)argv;

	const size_t bytes = DATASIZE;
	// Dynamic shared memory per block: two PixelType slots per thread.
	const size_t sharedBytes = 2 * THREAD_N * sizeof(PixelType);

	unsigned char *d_buffcur = NULL;
	unsigned char *d_buffnext = NULL;

	CUDA_SAFE_CALL( cudaMalloc((void **)&d_buffcur, bytes) );
	CUDA_SAFE_CALL( cudaMalloc((void **)&d_buffnext, bytes) );

	unsigned char *h_buffcur = (unsigned char *)malloc(bytes);
	unsigned char *h_buffnext = (unsigned char *)malloc(bytes);

	if (h_buffcur == NULL || h_buffnext == NULL)
	{
		fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
		free(h_buffcur);
		free(h_buffnext);
		cudaFree(d_buffnext);
		cudaFree(d_buffcur);
		return 1;
	}

	// Both buffers start out with different random content, so a kernel that
	// copies nothing cannot pass the comparison below by accident.
	for (size_t i = 0; i < bytes; i++)
	{
		h_buffcur[i] = RandChar();
		h_buffnext[i] = RandChar();
	}

	CUDA_SAFE_CALL( cudaMemcpy(d_buffnext, h_buffnext, bytes, cudaMemcpyHostToDevice) );
	CUDA_SAFE_CALL( cudaMemcpy(d_buffcur, h_buffcur, bytes, cudaMemcpyHostToDevice) );

	CUT_CHECK_DEVICE();

	printf("Executing GPU kernel...\n");

//	ProcessSuccess<<<BLOCK_N, THREAD_N, sharedBytes>>>((PixelType*)d_buffnext, (PixelType*)d_buffcur);
	ProcessFail<<<BLOCK_N, THREAD_N, sharedBytes>>>((PixelType*)d_buffnext, (PixelType*)d_buffcur);

	// cudaGetLastError() right after the launch only reports launch-time
	// problems (bad configuration). Faults raised while the kernel runs are
	// reported by the next synchronizing call, so wait for completion before
	// deciding whether the kernel succeeded. (cudaThreadSynchronize matches the
	// cutil-era toolkit this file targets; its successor on CUDA 4.0+ is
	// cudaDeviceSynchronize.)
	cudaError_t lasterror = cudaGetLastError();
	if (lasterror == cudaSuccess)
		lasterror = cudaThreadSynchronize();

	const char *lech = cudaGetErrorString(lasterror);

	if (lasterror == cudaSuccess)
		printf("Kernel executed successfully!\n""\tlast_error: %i \n""\terror_string: %s\n", (int)lasterror, lech);
	else
		printf("***Kernel execution failed!!!***\n""\tlast_error: %i \n""\terror_string: %s\n", (int)lasterror, lech);

	bool fal = (lasterror != cudaSuccess);

	if (!fal)
	{
		CUDA_SAFE_CALL( cudaMemcpy(h_buffnext, d_buffnext, bytes, cudaMemcpyDeviceToHost) );
		CUDA_SAFE_CALL( cudaMemcpy(h_buffcur, d_buffcur, bytes, cudaMemcpyDeviceToHost) );

		// The kernel copies d_buffcur into d_buffnext, so after a correct run
		// the two downloaded buffers must match exactly.
		fal = (memcmp(h_buffcur, h_buffnext, bytes) != 0);
	}

	if (fal)
		printf("TEST FAILED\n");
	else
		printf("TEST PASSED\n");

	CUDA_SAFE_CALL( cudaFree(d_buffnext) );
	CUDA_SAFE_CALL( cudaFree(d_buffcur)  );

	free(h_buffnext);
	free(h_buffcur);

	printf("Shutdown done.\n");

	return 0;
}

I would like to explain the problem I have run into. If I write the copy like this:

// Loop form (as in ProcessSuccess): stage two elements in shared memory,
// then write them to the destination array.
for (int i = 0; i<2; i++)

    shared[tid+i] = d_frame1[num+i];

for (int i = 0; i<2; i++)

    d_frame2[num+i] = shared[tid+i];

everything works perfectly. But if I change the code to the following:

// Unrolled form (as in ProcessFail): the same four copies written out
// explicitly, touching the same indices as the loop form.
shared[tid] = d_frame1[num];

shared[tid+1] = d_frame1[num+1];

d_frame2[num] = shared[tid];

d_frame2[num+1] = shared[tid+1];

I expected that writing it this way would improve performance, but I always get an “unspecified launch failure” error.

Does anybody know what the cause of the problem is? Where am I going wrong?

Many Thanks.

I have the same problem. I managed to work around it by using texfetch to load the data instead of loading it directly from global memory.