Parallel DCT algo. Need help.

Hello everybody, I'm an Italian Computer Engineering student and I need some help with my non-working implementation of the DCT algorithm in CUDA.

Here is the main host part…

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cutil.h>

float h_COSENI[64] = {
	// 8x8 DCT coefficient table: row u = basis index, column x = sample index
	 0.353553390593273730f,  0.353553390593273730f,  0.353553390593273730f,  0.353553390593273730f,  0.353553390593273730f,  0.353553390593273730f,  0.353553390593273730f,  0.353553390593273730f, // u = 0
	 0.490392640201615220f,  0.415734806151272620f,  0.277785116509801140f,  0.097545161008064166f, -0.097545161008064096f, -0.277785116509800980f, -0.415734806151272670f, -0.490392640201615220f, // u = 1
	 0.461939766255643370f,  0.191341716182544920f, -0.191341716182544860f, -0.461939766255643370f, -0.461939766255643420f, -0.191341716182545170f,  0.191341716182545000f,  0.461939766255643260f, // u = 2
	 0.415734806151272620f, -0.097545161008064096f, -0.490392640201615220f, -0.277785116509801090f,  0.277785116509800920f,  0.490392640201615220f,  0.097545161008064388f, -0.415734806151272560f, // u = 3
	 0.353553390593273790f, -0.353553390593273730f, -0.353553390593273840f,  0.353553390593273680f,  0.353553390593273840f, -0.353553390593273340f, -0.353553390593273560f,  0.353553390593273290f, // u = 4
	 0.277785116509801140f, -0.490392640201615220f,  0.097545161008064152f,  0.415734806151272730f, -0.415734806151272560f, -0.097545161008064013f,  0.490392640201615330f, -0.277785116509800760f, // u = 5
	 0.191341716182544920f, -0.461939766255643420f,  0.461939766255643260f, -0.191341716182544950f, -0.191341716182545280f,  0.461939766255643370f, -0.461939766255643200f,  0.191341716182544780f, // u = 6
	 0.097545161008064166f, -0.277785116509801090f,  0.415734806151272730f, -0.490392640201615330f,  0.490392640201615220f, -0.415734806151272510f,  0.277785116509800760f, -0.097545161008064291f  // u = 7
	};
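// Note: the table above is just C[u][x] = alpha(u) * cos((2*x+1)*u*PI/16), with
// alpha(0) = sqrt(1/8) and alpha(u>0) = 1/2. A minimal sketch of a helper that would
// fill an equivalent table at runtime (fillCosTable is a made-up name, not part of my
// project) could look like:
//
//   void fillCosTable(float *t){
//       for(int u = 0; u < 8; u++)
//           for(int x = 0; x < 8; x++)
//               t[u*8 + x] = (u == 0 ? sqrtf(0.125f) : 0.5f)
//                          * cosf((2.0f*x + 1.0f) * u * 3.14159265358979f / 16.0f);
//   }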

__constant__ float d_COSENI[64];

///////////////////////////////////////////////////////////////////////////////
// Executing DCT on CPU
///////////////////////////////////////////////////////////////////////////////
extern "C"
void computeDCTCPU(
     short  *, //PIC OUT
     float  *, //PIC TEMP
     __int8 *, //PIC IN
     int,      //WIDTH
     int,      //HEIGHT
     int,      //NRIGHE (rows per datablock)
     int       //BLOCCHITOTALI (total datablocks)
   );

///////////////////////////////////////////////////////////////////////////////
// Helper functions
///////////////////////////////////////////////////////////////////////////////
extern "C" int isPICTURE_DCTcorrect(short *, short *, int);
extern "C" void printResToFilePICT(__int8 *, short *, short *, int, int);
extern "C" __int8 RandInt8(__int8, __int8);

///////////////////////////////////////////////////////////////////////////////
// Executing monodimensional DCT on GPU
///////////////////////////////////////////////////////////////////////////////
#include "test_kernel.cu"

int main(int argc, char **argv){

	CUT_DEVICE_INIT();

	//DEVICE
	short  *d_PICTURE_OUT;
	float  *d_PICTURE_TMP;
	__int8 *d_PICTURE_IN8;

	//HOST
	__int8 *h_PICTURE_IN8;
	float  *h_PICTURE_TMP;
	int    *h_PICTURE_TMP_INT;
	short  *h_PICTURE_OUT_CPU;
	short  *h_PICTURE_OUT_GPU; //host buffer to download the GPU results into

	//WORK VARIABLES
	int PICTURE_WIDTH, PICTURE_HEIGHT, PICTURE_DATA_N, PICTURE_SZ_I8;
	int PICTURE_SZ_FL, PICTURE_SZ_SH, PICTURE_SZ_32;
	int TOT_DBLOCKS, DBLOCKS_PER_ROW, TOT_FRAGMENTS;
	float MBtotalGPUallocatedMEM;

	const int DCT_ORDER      = 8;
	const int NUMERO_COSENI  = 64;
	const int RIGHE_CONC     = 8;
	const int DATABLOCK_CONC = 16;
	const int SH_MEM_SZ      = NUMERO_COSENI * sizeof(float);
	int loop = 3;

	//INIT
	dim3 grid(16, 8, 1);
	dim3 block(64, 1, 1);
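	// Launch geometry (just a summary of the values above): grid(16,8) gives 16*8 = 128
	// thread blocks, block(64) gives 64 threads per block, and SH_MEM_SZ reserves dynamic
	// shared memory for the 64 temporary products of one 8x8 datablock.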

	PICTURE_WIDTH   = (loop*8);
	PICTURE_HEIGHT  = (loop*8);
	PICTURE_DATA_N  = PICTURE_WIDTH * PICTURE_HEIGHT;
	PICTURE_SZ_I8   = PICTURE_DATA_N * sizeof(__int8);
	PICTURE_SZ_FL   = PICTURE_DATA_N * sizeof(float);
	PICTURE_SZ_SH   = PICTURE_DATA_N * sizeof(short);
	PICTURE_SZ_32   = PICTURE_DATA_N * sizeof(int);
	TOT_DBLOCKS     = (PICTURE_WIDTH*PICTURE_HEIGHT)/NUMERO_COSENI;
	DBLOCKS_PER_ROW = PICTURE_WIDTH / RIGHE_CONC;
	TOT_FRAGMENTS   = (unsigned int)(ceilf((float)TOT_DBLOCKS/DATABLOCK_CONC));
	MBtotalGPUallocatedMEM = (PICTURE_SZ_SH+PICTURE_SZ_I8+PICTURE_SZ_FL)/(1048576.0f);

	h_PICTURE_TMP     = (float *)malloc(PICTURE_SZ_FL);
	h_PICTURE_TMP_INT = (int   *)malloc(PICTURE_SZ_32);
	h_PICTURE_OUT_CPU = (short *)malloc(PICTURE_SZ_SH);

	//Pinned (page-locked) host memory for the buffers that are copied to/from the device
	CUDA_SAFE_CALL( cudaMallocHost((void **)&h_PICTURE_IN8,     PICTURE_SZ_I8) );
	CUDA_SAFE_CALL( cudaMallocHost((void **)&h_PICTURE_OUT_GPU, PICTURE_SZ_SH) );
	CUDA_SAFE_CALL( cudaMalloc((void **)&d_PICTURE_IN8, PICTURE_SZ_I8) );
	CUDA_SAFE_CALL( cudaMalloc((void **)&d_PICTURE_TMP, PICTURE_SZ_FL) );
	CUDA_SAFE_CALL( cudaMalloc((void **)&d_PICTURE_OUT, PICTURE_SZ_SH) );

	//FILL
	srand(123);
	srand( (unsigned)time(NULL) );
	for(int i=0; i<PICTURE_DATA_N; i++){
		h_PICTURE_IN8[i] = (__int8)127; //TEST
	}

	//DEVICE CALLS
	CUDA_SAFE_CALL( cudaMemcpy(d_PICTURE_IN8, h_PICTURE_IN8, PICTURE_SZ_I8, cudaMemcpyHostToDevice) );
	CUDA_SAFE_CALL( cudaMemcpyToSymbol(d_COSENI, &h_COSENI, NUMERO_COSENI*sizeof(float), 0, cudaMemcpyHostToDevice) );
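	// After this cudaMemcpyToSymbol, the __constant__ array d_COSENI holds the same
	// 64 coefficients as h_COSENI; the kernel only ever reads it.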

	CUDA_SAFE_CALL( cudaThreadSynchronize() );
	computeDCTGPU<<<grid, block, SH_MEM_SZ>>>(d_PICTURE_OUT, d_PICTURE_TMP, d_PICTURE_IN8, PICTURE_WIDTH, PICTURE_HEIGHT, RIGHE_CONC, TOT_DBLOCKS);
	CUT_CHECK_ERROR("Kernel execution failed!\n");
	CUDA_SAFE_CALL( cudaThreadSynchronize() );
	CUDA_SAFE_CALL( cudaMemcpy(h_PICTURE_OUT_GPU, d_PICTURE_OUT, PICTURE_DATA_N * sizeof(short), cudaMemcpyDeviceToHost) );

	//HOST REFERENCE CALL
	computeDCTCPU(h_PICTURE_OUT_CPU, h_PICTURE_TMP, h_PICTURE_IN8, PICTURE_WIDTH, PICTURE_HEIGHT, RIGHE_CONC, TOT_DBLOCKS);

	//CHECK
	printf((isPICTURE_DCTcorrect(h_PICTURE_OUT_CPU, h_PICTURE_OUT_GPU, PICTURE_DATA_N)) ? "\n\t\t\t\t\t\t ===> TEST PASSED <===\n" : "\n\t\t\t\t\t\t ===> TEST FAILED <===\n");
	printResToFilePICT(h_PICTURE_IN8, h_PICTURE_OUT_CPU, h_PICTURE_OUT_GPU, PICTURE_DATA_N, PICTURE_WIDTH);

	//DEALLOC
	free(h_PICTURE_TMP);
	free(h_PICTURE_TMP_INT);
	free(h_PICTURE_OUT_CPU);
	CUDA_SAFE_CALL( cudaFreeHost(h_PICTURE_IN8)     );
	CUDA_SAFE_CALL( cudaFreeHost(h_PICTURE_OUT_GPU) );
	CUDA_SAFE_CALL( cudaFree(d_PICTURE_IN8) );
	CUDA_SAFE_CALL( cudaFree(d_PICTURE_TMP) );
	CUDA_SAFE_CALL( cudaFree(d_PICTURE_OUT) );

	CUT_EXIT(argc, argv);
}

And here is the kernel…

#define IMUL(a, b) __mul24(a, b)
#define IMOD(i, n) ((i)%(n))
#define MUL(c, d) ((c)*(d))
#define NCOSENI 64
#define ACCUM_N 64
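// Note on IMUL: __mul24 is CUDA's 24-bit integer multiply, which is faster than a full
// 32-bit multiply on compute capability 1.x GPUs; with the sizes used here the operands
// stay well below 2^24.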

__global__
void computeDCTGPU(
     short  *d_PICTUREOUT, //PIC OUT
     float  *d_PICTURETMP, //PIC TEMP
     __int8 *d_PICTUREIN8, //PIC IN
     int WIDTH,            //WIDTH
     int HEIGHT,           //HEIGHT
     int NRIGHE,           //NRIGHE (rows per datablock)
     int TOT_DBLOCKS       //BLOCCHITOTALI (total datablocks)
){
	extern __constant__ float d_COSENI[];
	extern __shared__ float tempMatSum[]; //64 elements, sized by the launch parameter SH_MEM_SZ
	//blockIdx.x  <=> ID CUDA block          [0-15]
	//blockIdx.y  <=> ID row (of CUDA block) [0-7]
	//threadIdx.x <=> ID thread              [0-7]
	const unsigned int tid = threadIdx.x;
	__shared__ int DBLOCKS_PER_ROW; //computed once by thread 0, shared by the whole block
	if (tid == 0) DBLOCKS_PER_ROW = WIDTH / NRIGHE;

	for(int iBlocco = blockIdx.x; iBlocco < TOT_DBLOCKS; iBlocco += gridDim.x){
		__syncthreads();
		//Datablock coordinates (x,y)
		unsigned int dblk_Y = (iBlocco / DBLOCKS_PER_ROW);
		unsigned int dblk_X = (iBlocco % DBLOCKS_PER_ROW);
		//Address of element (0,0) of the current datablock
		__shared__ unsigned int vBase;
		if (tid == 0)
			vBase = IMUL(dblk_X, NRIGHE) + IMUL((dblk_Y*DBLOCKS_PER_ROW), NCOSENI);
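		// Worked example, just to check the addressing (24x24 test picture, so DBLOCKS_PER_ROW = 3):
		// datablock (dblk_X=1, dblk_Y=2) gives vBase = 1*8 + 2*3*64 = 392, which is indeed
		// element (row 16, col 8), since 16*24 + 8 = 392.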

		//1D DCT along the rows
		for(unsigned int rigaDCT = blockIdx.y; rigaDCT < NRIGHE; rigaDCT += gridDim.y){
			__syncthreads();
			//Address of the first element of the current row
			unsigned int relAddress = vBase + IMUL(WIDTH, rigaDCT);
			for(unsigned int pos = tid; pos < NCOSENI; pos += blockDim.x){
				tempMatSum[pos] = (float)(d_PICTUREIN8[relAddress+(pos%NRIGHE)] * d_COSENI[pos]);
				//tempMatSum[pos] = 1.0f;//TEST
			}
			//tree-like reduction
			for(unsigned int stride = (NRIGHE/2); stride > 0; stride >>= 1){
				__syncthreads();
				for(unsigned int iAcc = tid; iAcc < (stride*NRIGHE); iAcc += blockDim.x){
					unsigned int pos = NRIGHE*(iAcc/stride)+(iAcc%stride);
					tempMatSum[pos] += tempMatSum[pos + stride];
				}
			}
			//write back using only as many threads as needed
			if(tid < NRIGHE) d_PICTURETMP[vBase+((tid*WIDTH)+rigaDCT)] = tempMatSum[tid*NRIGHE];

		}//End of the row-wise 1D DCT

		__syncthreads();
		//the other 1D DCT
		for(unsigned int rigaDCT = blockIdx.y; rigaDCT < NRIGHE; rigaDCT += gridDim.y){
			__syncthreads();
			unsigned int relAddress = vBase + IMUL(WIDTH, rigaDCT);
			for(unsigned int pos = tid; pos < NCOSENI; pos += blockDim.x){
				tempMatSum[pos] = (float)(d_PICTURETMP[relAddress+(pos%NRIGHE)] * d_COSENI[pos]);
				//tempMatSum[pos] = 1.0f;//TEST
			}
			//tree-like reduction
			for(unsigned int stride = (NRIGHE/2); stride > 0; stride >>= 1){
				__syncthreads();
				for(unsigned int iAcc = tid; iAcc < (stride*NRIGHE); iAcc += blockDim.x){
					unsigned int pos = NRIGHE*(iAcc/stride)+(iAcc%stride);
					tempMatSum[pos] += tempMatSum[pos + stride];
				}
			}
			if(tid < NRIGHE) d_PICTUREOUT[vBase+((tid*WIDTH)+rigaDCT)] = (short)(floor(tempMatSum[tid*NRIGHE]+0.5f));

		}//Second DCT end
	}//Current block end
}

And finally the working CPU reference function…

#include <stdlib.h>
#include <math.h>

extern "C"
void computeDCTCPU(
     short  *h_PICTUREOUT, //PIC OUT
     float  *h_PICTURETMP, //PIC TEMP
     __int8 *h_PICTUREIN8, //PIC IN
     int WIDTH,            //WIDTH
     int HEIGHT,           //HEIGHT
     int NRIGHE,           //NRIGHE (rows per datablock)
     int TOT_DBLOCKS       //BLOCCHITOTALI (total datablocks)
){
	extern float h_COSENI[];
	//CONSTANTS
	const unsigned int ConcBlksN = 16; //concurrent 8x8 DCTs
	const unsigned int latoDCT   = 8;  //elements per DCT row
	//datablock size
	const unsigned int COSDATA_N = latoDCT*latoDCT;
	//Temporary matrix
	float tempMatSum[COSDATA_N];
	//Number of datablocks in the picture
	unsigned int N_DBLOCKS_TOT = (WIDTH*HEIGHT)/COSDATA_N;
	//datablocks per row
	unsigned int DBLK_PER_ROW = WIDTH / latoDCT;
	//Iterations needed to transform the whole picture
	unsigned int TOT_FRAGMENTS = (unsigned int)(ceilf((float)N_DBLOCKS_TOT/ConcBlksN));

	//Split the original picture into fragments
	for(unsigned int fragment = 0; fragment < TOT_FRAGMENTS; fragment++){
		//each iteration processes ConcBlksN datablocks
		for(unsigned int dblk_ID = 0; dblk_ID < ConcBlksN; dblk_ID++){
			//Global ID of the datablock
			unsigned int dblk_GlobalID = dblk_ID + (fragment*ConcBlksN);
			if (dblk_GlobalID > (N_DBLOCKS_TOT-1)) break;
			//Datablock coordinates (x,y)
			unsigned int dblk_Y = (dblk_GlobalID / DBLK_PER_ROW);
			unsigned int dblk_X = (dblk_GlobalID % DBLK_PER_ROW);
			//Address of element (0,0) of the current datablock
			unsigned int vBase = (dblk_X*latoDCT) + (dblk_Y*DBLK_PER_ROW*COSDATA_N);
			//First 1D DCT
			for(unsigned int rigaDCT = 0; rigaDCT < latoDCT; rigaDCT++){
				//Address of the first element of the current row in the datablock
				unsigned int relAddress = vBase + (WIDTH*rigaDCT);
				for(unsigned int pos = 0; pos < COSDATA_N; pos++){
					tempMatSum[pos] = (float)(h_PICTUREIN8[relAddress+(pos%latoDCT)]*h_COSENI[pos]);
				}
				for(unsigned int stride = (latoDCT>>1); stride > 0; stride >>= 1){
					for(unsigned int iAcc = 0; iAcc < (stride*latoDCT); iAcc++){
						unsigned int pos = (latoDCT*(iAcc/stride))+(iAcc%stride);
						tempMatSum[pos] += tempMatSum[pos + stride];
					}
				}
				for(unsigned int tid=0; tid < latoDCT; tid++){
					h_PICTURETMP[vBase+((tid*WIDTH)+rigaDCT)] = tempMatSum[tid*latoDCT];
				}
			}//First DCT end
			//Second 1D DCT
			for(unsigned int rigaDCT = 0; rigaDCT < latoDCT; rigaDCT++){
				//Address of the first element of the current row in the datablock
				unsigned int relAddress = vBase + (WIDTH*rigaDCT);
				for(unsigned int pos = 0; pos < COSDATA_N; pos++){
					tempMatSum[pos] = (float)((h_PICTURETMP[relAddress+(pos%latoDCT)]*h_COSENI[pos]));
				}
				for(unsigned int stride = (latoDCT>>1); stride > 0; stride >>= 1){
					for(unsigned int iAcc = 0; iAcc < (stride*latoDCT); iAcc++){
						unsigned int pos = (latoDCT*(iAcc/stride))+(iAcc%stride);
						tempMatSum[pos] += tempMatSum[pos + stride];
					}
				}
				for(unsigned int tid=0; tid < latoDCT; tid++){
					h_PICTUREOUT[vBase+((tid*WIDTH)+rigaDCT)] = (short)(floorf(tempMatSum[tid*latoDCT]+0.5f));
				}
			}//End of the second DCT
		}//end of current datablock
	}//End of iteration
}

Anyway, I think the kernel does not manage the shared resources correctly, and I'm pretty sure the threads get serialized during the tree-like reduction, even though that part doesn't worry me for now.
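For reference, the generic shared-memory tree reduction pattern I was trying to adapt looks roughly like this, as a minimal self-contained sketch (one 64-float sum per block, assuming blockDim.x == 64; reduceSum64, in and out are made-up names, not part of my project):

__global__ void reduceSum64(const float *in, float *out){
	__shared__ float buf[64];              //one partial value per thread
	unsigned int tid = threadIdx.x;
	buf[tid] = in[blockIdx.x * 64 + tid];  //each thread loads one element
	__syncthreads();
	//halve the number of active threads at every step
	for(unsigned int stride = 32; stride > 0; stride >>= 1){
		if(tid < stride) buf[tid] += buf[tid + stride];
		__syncthreads();                   //every thread must reach this barrier
	}
	if(tid == 0) out[blockIdx.x] = buf[0]; //thread 0 writes the block result
}

In my kernel I instead run eight of these 8-element reductions side by side inside the same 64-float buffer, which is why the stride indexing looks more involved.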

If anyone could help me I'd be very grateful…

PS: I'm sorry if the code is not very readable!