Hi!
I want to describe two implementations of a simple function that copies the elements of one array into a second array. I have run into something strange: one of the examples works fine, but for the second one I get “unspecified launch failure”.
The problem is that these functions (ProcessSuccess() and ProcessFail()) look like they do exactly the same thing, and I can’t understand why the second function doesn’t work. Gurus, please help a newbie!
Thanks a lot!
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cutil.h>
// Launch geometry and buffer sizing shared by the kernels and main().
#define ALIGN 8 // bytes in PixelType
#define THREAD_N 128 // threads per block used for the kernel launch
#define BLOCK_N 16 // blocks in the grid used for the kernel launch
// Size in bytes of one frame buffer: each thread handles 2 PixelType elements.
// The expansion is parenthesized so the macro behaves as a single value inside
// larger expressions (e.g. `x / DATASIZE`), which the bare product did not.
#define DATASIZE (2*BLOCK_N*THREAD_N*ALIGN)
// Pixel record: ALIGN raw bytes, with the union itself declared
// __align__(ALIGN). Both the tag `un` and the alias `PixelType` name the type.
typedef union __align__(ALIGN) un
{
    unsigned char c[ALIGN];
} PixelType;
// Returns a pseudo-random byte: rand() is scaled from [0, RAND_MAX] to
// [0, 255] in float arithmetic and then truncated to unsigned char.
unsigned char RandChar()
{
    const float unit = (float)rand() / (float)(RAND_MAX);
    return (unsigned char)(unit * 255);
}
// Copies d_frame1 into d_frame2, staging every element through dynamic
// shared memory. Loop-based variant.
//
// Launch layout: 1D grid of 1D blocks. Each thread handles two consecutive
// PixelType elements, so both buffers must hold at least
// 2 * gridDim.x * blockDim.x elements (the kernel performs no bounds check).
// Dynamic shared memory required: 2 * blockDim.x * sizeof(PixelType) bytes.
//
// d_frame2: destination device pointer.
// d_frame1: source device pointer.
__global__ void ProcessSuccess(PixelType* d_frame2, PixelType* d_frame1)
{
    extern __shared__ PixelType shared[];

    // Index with the real block size rather than the THREAD_N constant so the
    // per-thread element ranges stay contiguous for any block size; this is
    // identical to the old formula when the block has THREAD_N threads.
    const int num = 2 * (blockIdx.x * blockDim.x + threadIdx.x); // first global element of this thread
    const int tid = 2 * threadIdx.x;                             // first shared slot of this thread

    // A thread only ever reads the two shared slots it wrote itself, so no
    // __syncthreads() is required between the two phases.
    for (int i = 0; i < 2; i++)
        shared[tid + i] = d_frame1[num + i];
    for (int i = 0; i < 2; i++)
        d_frame2[num + i] = shared[tid + i];
}
// Copies d_frame1 into d_frame2, staging every element through dynamic
// shared memory. Same element accesses, in the same order, as ProcessSuccess,
// but with the two-iteration loops written out by hand.
// NOTE(review): this is the variant reported to end in "unspecified launch
// failure"; the cause is not visible in this file -- TODO confirm against the
// toolkit/compiler version in use.
//
// Launch layout: 1D grid of 1D blocks. Each thread handles two consecutive
// PixelType elements, so both buffers must hold at least
// 2 * gridDim.x * blockDim.x elements (the kernel performs no bounds check).
// Dynamic shared memory required: 2 * blockDim.x * sizeof(PixelType) bytes.
//
// d_frame2: destination device pointer.
// d_frame1: source device pointer.
__global__ void ProcessFail(PixelType* d_frame2, PixelType* d_frame1)
{
    extern __shared__ PixelType shared[];

    // Index with the real block size rather than the THREAD_N constant so the
    // per-thread element ranges stay contiguous for any block size; this is
    // identical to the old formula when the block has THREAD_N threads.
    const int num = 2 * (blockIdx.x * blockDim.x + threadIdx.x); // first global element of this thread
    const int tid = 2 * threadIdx.x;                             // first shared slot of this thread

    // Stage both elements in shared memory, then write them out. A thread only
    // reads the slots it wrote itself, so no __syncthreads() is required.
    shared[tid]     = d_frame1[num];
    shared[tid + 1] = d_frame1[num + 1];
    d_frame2[num]     = shared[tid];
    d_frame2[num + 1] = shared[tid + 1];
}
// Test driver: fills two host buffers with random bytes, uploads them, runs
// the copy kernel (d_buffcur -> d_buffnext), downloads both buffers and checks
// that they are now byte-for-byte equal.
// Returns 0 when the test passes, 1 on allocation failure, kernel failure or
// data mismatch.
int main(int argc, char *argv[])
{
    const size_t nbytes = DATASIZE; // size of each frame buffer in bytes
    unsigned char *d_buffcur = NULL, *d_buffnext = NULL;

    // NOTE(review): CUT_CHECK_DEVICE comes from cutil.h; presumably it verifies
    // (and may select) a usable device, so it now runs before the first CUDA
    // allocation instead of after the uploads -- confirm against cutil.h.
    CUT_CHECK_DEVICE();

    unsigned char *h_buffcur = (unsigned char *)malloc(nbytes);
    unsigned char *h_buffnext = (unsigned char *)malloc(nbytes);
    if (h_buffcur == NULL || h_buffnext == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)nbytes);
        free(h_buffcur);  // free(NULL) is a no-op
        free(h_buffnext);
        return 1;
    }

    // Error checking uses the same CUDA_SAFE_CALL helper as the cleanup code.
    CUDA_SAFE_CALL( cudaMalloc((void **)&d_buffcur, nbytes) );
    CUDA_SAFE_CALL( cudaMalloc((void **)&d_buffnext, nbytes) );

    // Different random content in both buffers, so the final comparison can
    // only succeed if the kernel really copied the data.
    for (size_t i = 0; i < nbytes; i++)
    {
        h_buffcur[i] = RandChar();
        h_buffnext[i] = RandChar();
    }
    CUDA_SAFE_CALL( cudaMemcpy(d_buffnext, h_buffnext, nbytes, cudaMemcpyHostToDevice) );
    CUDA_SAFE_CALL( cudaMemcpy(d_buffcur, h_buffcur, nbytes, cudaMemcpyHostToDevice) );

    printf("Executing GPU kernel...\n");
    // Two PixelType shared-memory slots per thread.
    const size_t sharedBytes = 2 * THREAD_N * sizeof(PixelType);
    // ProcessSuccess<<<BLOCK_N, THREAD_N, sharedBytes>>>((PixelType*)d_buffnext, (PixelType*)d_buffcur);
    ProcessFail<<<BLOCK_N, THREAD_N, sharedBytes>>>((PixelType*)d_buffnext, (PixelType*)d_buffcur);

    // cudaGetLastError() only reports launch-time problems. Faults that occur
    // while the kernel is running surface at the next synchronizing call, so
    // wait for the kernel before declaring success.
    // (cudaThreadSynchronize is the cutil-era name; newer toolkits call it
    // cudaDeviceSynchronize.)
    cudaError_t lasterror = cudaGetLastError();
    if (lasterror == cudaSuccess)
        lasterror = cudaThreadSynchronize();
    const char *lech = cudaGetErrorString(lasterror);
    if (lasterror == cudaSuccess)
        printf("Kernel executed successfully!\n""\tlast_error: %i \n""\terror_string: %s\n", lasterror, lech);
    else
        printf("***Kernel execution failed!!!***\n""\tlast_error: %i \n""\terror_string: %s\n", lasterror, lech);

    bool fal = (lasterror != cudaSuccess);
    if (!fal)
    {
        CUDA_SAFE_CALL( cudaMemcpy(h_buffnext, d_buffnext, nbytes, cudaMemcpyDeviceToHost) );
        CUDA_SAFE_CALL( cudaMemcpy(h_buffcur, d_buffcur, nbytes, cudaMemcpyDeviceToHost) );
        // The destination must now match the source byte for byte.
        fal = (memcmp(h_buffcur, h_buffnext, nbytes) != 0);
    }
    if (fal)
        printf("TEST FAILED\n");
    else
        printf("TEST PASSED\n");

    // Release host memory first: it cannot fail, whereas the CUDA calls below
    // may report an error left behind by a failed kernel.
    free(h_buffnext);
    free(h_buffcur);
    CUDA_SAFE_CALL( cudaFree(d_buffnext) );
    CUDA_SAFE_CALL( cudaFree(d_buffcur) );
    printf("Shutdown done.\n");
    return fal ? 1 : 0;
}