Hello everybody, I’m an italian student in Computer Engineering and I need some help about my unworking implementation of the DCT algorithm with CUDA libraries.
Here is the main host part…
#include <stdio.h>
#include <math.h>
#include <cutil.h>
float h_COSENI[64] = {
0.353553390593273730f, // COSENOSCLDBL_0_0
0.353553390593273730f, // COSENOSCLDBL_0_1
0.353553390593273730f, // COSENOSCLDBL_0_2
0.353553390593273730f, // COSENOSCLDBL_0_3
0.353553390593273730f, // COSENOSCLDBL_0_4
0.353553390593273730f, // COSENOSCLDBL_0_5
0.353553390593273730f, // COSENOSCLDBL_0_6
0.353553390593273730f, // COSENOSCLDBL_0_7
0.490392640201615220f, // COSENOSCLDBL_1_0
0.415734806151272620f, // COSENOSCLDBL_1_1
0.277785116509801140f, // COSENOSCLDBL_1_2
0.097545161008064166f, // COSENOSCLDBL_1_3
-0.097545161008064096f, // COSENOSCLDBL_1_4
-0.277785116509800980f, // COSENOSCLDBL_1_5
-0.415734806151272670f, // COSENOSCLDBL_1_6
-0.490392640201615220f, // COSENOSCLDBL_1_7
0.461939766255643370f, // COSENOSCLDBL_2_0
0.191341716182544920f, // COSENOSCLDBL_2_1
-0.191341716182544860f, // COSENOSCLDBL_2_2
-0.461939766255643370f, // COSENOSCLDBL_2_3
-0.461939766255643420f, // COSENOSCLDBL_2_4
-0.191341716182545170f, // COSENOSCLDBL_2_5
0.191341716182545000f, // COSENOSCLDBL_2_6
0.461939766255643260f, // COSENOSCLDBL_2_7
0.415734806151272620f, // COSENOSCLDBL_3_0
-0.097545161008064096f, // COSENOSCLDBL_3_1
-0.490392640201615220f, // COSENOSCLDBL_3_2
-0.277785116509801090f, // COSENOSCLDBL_3_3
0.277785116509800920f, // COSENOSCLDBL_3_4
0.490392640201615220f, // COSENOSCLDBL_3_5
0.097545161008064388f, // COSENOSCLDBL_3_6
-0.415734806151272560f, // COSENOSCLDBL_3_7
0.353553390593273790f, // COSENOSCLDBL_4_0
-0.353553390593273730f, // COSENOSCLDBL_4_1
-0.353553390593273840f, // COSENOSCLDBL_4_2
0.353553390593273680f, // COSENOSCLDBL_4_3
0.353553390593273840f, // COSENOSCLDBL_4_4
-0.353553390593273340f, // COSENOSCLDBL_4_5
-0.353553390593273560f, // COSENOSCLDBL_4_6
0.353553390593273290f, // COSENOSCLDBL_4_7
0.277785116509801140f, // COSENOSCLDBL_5_0
-0.490392640201615220f, // COSENOSCLDBL_5_1
0.097545161008064152f, // COSENOSCLDBL_5_2
0.415734806151272730f, // COSENOSCLDBL_5_3
-0.415734806151272560f, // COSENOSCLDBL_5_4
-0.097545161008064013f, // COSENOSCLDBL_5_5
0.490392640201615330f, // COSENOSCLDBL_5_6
-0.277785116509800760f, // COSENOSCLDBL_5_7
0.191341716182544920f, // COSENOSCLDBL_6_0
-0.461939766255643420f, // COSENOSCLDBL_6_1
0.461939766255643260f, // COSENOSCLDBL_6_2
-0.191341716182544950f, // COSENOSCLDBL_6_3
-0.191341716182545280f, // COSENOSCLDBL_6_4
0.461939766255643370f, // COSENOSCLDBL_6_5
-0.461939766255643200f, // COSENOSCLDBL_6_6
0.191341716182544780f, // COSENOSCLDBL_6_7
0.097545161008064166f, // COSENOSCLDBL_7_0
-0.277785116509801090f, // COSENOSCLDBL_7_1
0.415734806151272730f, // COSENOSCLDBL_7_2
-0.490392640201615330f, // COSENOSCLDBL_7_3
0.490392640201615220f, // COSENOSCLDBL_7_4
-0.415734806151272510f, // COSENOSCLDBL_7_5
0.277785116509800760f, // COSENOSCLDBL_7_6
-0.097545161008064291f // COSENOSCLDBL_7_7
};
__constant__ float d_COSENI[64];
///////////////////////////////////////////////////////////////////////////////
// Executing DCT on CPU
///////////////////////////////////////////////////////////////////////////////
extern "C"
void computeDCTCPU(
short *,//PIC OUT
float *,//PIC TEMP
__int8 *,//PIC IN
int, //WIDTH
int, //HEIGHT
int, //NRIGHE
int //BLOCCHITOTALI
);
///////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
extern "C"
int isPICTURE_DCTcorrect(short *, short *, int);
extern "C"
void printResToFilePICT(__int8 *, short *, short *, int, int);
extern "C"
__int8 RandInt8(__int8, __int8);
///////////////////////////////////////////////////////////////////////////////
// Executing monodimensional DCT on GPU
///////////////////////////////////////////////////////////////////////////////
#include "test_kernel.cu"
int main(int argc, char **argv){
CUT_DEVICE_INIT();
//DEVICE
short *d_PICTURE_OUT;
float *d_PICTURE_TMP;
__int8 *d_PICTURE_IN8;
//HOST
__int8 *h_PICTURE_IN8;
float *h_PICTURE_TMP;
int *h_PICTURE_TMP_INT;
short *h_PICTURE_OUT_CPU;
short *h_PICTURE_OUT_GPU;//To download host results
//WORK_VAR
int PICTURE_WIDTH, PICTURE_HEIGHT, PICTURE_DATA_N, PICTURE_SZ_I8;
int PICTURE_SZ_FL, PICTURE_SZ_SH, PICTURE_SZ_32;
int TOT_DBLOCKS, DBLOCKS_PER_ROW, TOT_FRAGMENTS;
float MBtotalGPUallocatedMEM;
const int DCT_ORDER = 8;
const int NUMERO_COSENI = 64;
const int RIGHE_CONC = 8;
const int DATABLOCK_CONC = 16;
const int SH_MEM_SZ = NUMERO_COSENI * sizeof(float);
int loop = 3;
//INIT
dim3 grid(16, 8, 1);
dim3 block(64, 1, 1);
PICTURE_WIDTH = (loop*8);
PICTURE_HEIGHT = (loop*8);
PICTURE_DATA_N = PICTURE_WIDTH * PICTURE_HEIGHT;
PICTURE_SZ_I8 = PICTURE_DATA_N * sizeof(__int8);
PICTURE_SZ_FL = PICTURE_DATA_N * sizeof(float);
PICTURE_SZ_SH = PICTURE_DATA_N * sizeof(short);
PICTURE_SZ_32 = PICTURE_DATA_N * sizeof(int);
TOT_DBLOCKS = (PICTURE_WIDTH*PICTURE_HEIGHT)/NUMERO_COSENI;
DBLOCKS_PER_ROW = PICTURE_WIDTH / RIGHE_CONC;
TOT_FRAGMENTS =
(unsigned int)(ceilf((float)TOT_DBLOCKS/DATABLOCK_CONC));
MBtotalGPUallocatedMEM =
(PICTURE_SZ_SH+PICTURE_SZ_I8+PICTURE_SZ_FL)/(1048576.0f);
h_PICTURE_TMP = (float *)malloc(PICTURE_SZ_FL);
h_PICTURE_TMP_INT = (int *)malloc(PICTURE_SZ_32);
h_PICTURE_OUT_CPU = (short *)malloc(PICTURE_SZ_SH);
//Pinned memory mode - Use custom function to get OS-pinned memory
CUDA_SAFE_CALL( cudaMallocHost((void **)&h_PICTURE_IN8, PICTURE_SZ_I8) );
CUDA_SAFE_CALL( cudaMallocHost((void **)&h_PICTURE_OUT_GPU, PICTURE_SZ_SH) );
CUDA_SAFE_CALL( cudaMalloc((void **)&d_PICTURE_IN8, PICTURE_SZ_I8) );
CUDA_SAFE_CALL( cudaMalloc((void **)&d_PICTURE_TMP, PICTURE_SZ_FL) );
CUDA_SAFE_CALL( cudaMalloc((void **)&d_PICTURE_OUT, PICTURE_SZ_SH) );
//FILL
srand(123);
srand( (unsigned)time(NULL) );
for(int i=0; i<PICTURE_DATA_N; i++){
h_PICTURE_IN8[i] = (__int8)127;//TEST
}
//DEVICE CALLS
CUDA_SAFE_CALL( cudaMemcpy(d_PICTURE_IN8, h_PICTURE_IN8, PICTURE_SZ_I8, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaMemcpyToSymbol(d_COSENI, &h_COSENI, NUMERO_COSENI*sizeof(float), 0, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL( cudaThreadSynchronize() );
computeDCTGPU<<<grid, block, SH_MEM_SZ>>>(d_PICTURE_OUT, d_PICTURE_TMP, d_PICTURE_IN8, PICTURE_WIDTH, PICTURE_HEIGHT, RIGHE_CONC, TOT_DBLOCKS);
CUT_CHECK_ERROR("Kernel execution failed!\n");
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUDA_SAFE_CALL( cudaMemcpy(h_PICTURE_OUT_GPU, d_PICTURE_OUT, PICTURE_DATA_N * sizeof(short), cudaMemcpyDeviceToHost) );
//HOST REF CALL
computeDCTCPU(h_PICTURE_OUT_CPU, h_PICTURE_TMP, h_PICTURE_IN8, PICTURE_WIDTH, PICTURE_HEIGHT, RIGHE_CONC, TOT_DBLOCKS);
//CHECK
printf((isPICTURE_DCTcorrect(h_PICTURE_OUT_CPU, h_PICTURE_OUT_GPU, PICTURE_DATA_N)) ? "\n\t\t\t\t\t\t ===> TEST PASSED <===\n" : "\n\t\t\t\t\t\t ===> TEST FAILED <===\n");
printResToFilePICT(h_PICTURE_IN8, h_PICTURE_OUT_CPU, h_PICTURE_OUT_GPU, PICTURE_DATA_N, PICTURE_WIDTH);
//DEALLOC
free(h_PICTURE_TMP);
free(h_PICTURE_TMP_INT);
free(h_PICTURE_OUT_CPU);
CUDA_SAFE_CALL( cudaFreeHost(h_PICTURE_IN8) );
CUDA_SAFE_CALL( cudaFreeHost(h_PICTURE_OUT_GPU) );
CUDA_SAFE_CALL( cudaFree(d_PICTURE_IN8) );
CUDA_SAFE_CALL( cudaFree(d_PICTURE_TMP) );
CUDA_SAFE_CALL( cudaFree(d_PICTURE_OUT) );
CUT_EXIT(argc, argv);
}
And Here is the Kernel…
#define IMUL(a, b) __mul24(a, b)
#define IMOD(i, n) ((i)%(n))
#define MUL(c, d) ((c)*(d))
#define NCOSENI 64
#define ACCUM_N 64
__global__
void computeDCTGPU(
short *d_PICTUREOUT,//PIC OUT
float *d_PICTURETMP,//PIC TEMP
__int8 *d_PICTUREIN8,//PIC IN
int WIDTH, //WIDTH
int HEIGHT, //HEIGHT
int NRIGHE, //NRIGHE
int TOT_DBLOCKS //BLOCCHITOTALI
){
extern __constant__ float d_COSENI[];
extern __shared__ float tempMatSum[];//64 elements
//blockIdx.x <=> ID CUDA block [0-15]
//blockIdx.y <=> ID row (of CUDA block) [0-7]
//threadIdx.x <=> ID thread [0-7]
const unsigned int tid = threadIdx.x;
__shared__ int DBLOCKS_PER_ROW;
if (tid == 0) DBLOCKS_PER_ROW = WIDTH / NRIGHE;
for(int iBlocco = blockIdx.x; iBlocco < TOT_DBLOCKS; iBlocco += gridDim.x){
__syncthreads();
//Coordinate del datablock (x,y)
unsigned int dblk_Y = (iBlocco / DBLOCKS_PER_ROW);
unsigned int dblk_X = (iBlocco % DBLOCKS_PER_ROW);
//Indirizzo dell'elemento (0,0) del datablock corrente
__shared__ unsigned int vBase;
if (tid == 0)
vBase = IMUL(dblk_X, NRIGHE) + IMUL((dblk_Y*DBLOCKS_PER_ROW), NCOSENI);
//monodim DCT
for(unsigned int rigaDCT = blockIdx.y; rigaDCT < NRIGHE; rigaDCT += gridDim.y){
__syncthreads();
//First element (of row) address unsigned int relAddress = vBase + IMUL(WIDTH, rigaDCT);
for(unsigned int pos = tid; pos < NCOSENI; pos += blockDim.x){
tempMatSum[pos] = (float)(d_PICTUREIN8[relAddress+(pos%NRIGHE)] * d_COSENI[pos]);
//tempMatSum[pos] = 1.0f;//TEST
}
//tree-like reduction
for(unsigned int stride = (NRIGHE/2); stride > 0; stride >>= 1){
__syncthreads();
for(unsigned int iAcc = tid; iAcc < (stride*NRIGHE); iAcc += blockDim.x){
unsigned int pos = NRIGHE*(iAcc/stride)+(iAcc%stride);
tempMatSum[pos] += tempMatSum[pos + stride];
}
}
//utilizzando solo il numero di thread necessario.
if(tid < NRIGHE) d_PICTURETMP[vBase+((tid*WIDTH)+rigaDCT)] = tempMatSum[tid*NRIGHE];
}//Fine DCT monodimensionale "righe"
__syncthreads();
//the other mono DCT
for(unsigned int rigaDCT = blockIdx.y; rigaDCT < NRIGHE; rigaDCT += gridDim.y){
__syncthreads();
unsigned int relAddress = vBase + IMUL(WIDTH, rigaDCT);
for(unsigned int pos = tid; pos < NCOSENI; pos += blockDim.x){
tempMatSum[pos] = (float)(d_PICTURETMP[relAddress+(pos%NRIGHE)] * d_COSENI[pos]);
//tempMatSum[pos] = 1.0f;//TEST
}
//tree-like reduction
for(unsigned int stride = (NRIGHE/2); stride > 0; stride >>= 1){
__syncthreads();
for(unsigned int iAcc = tid; iAcc < (stride*NRIGHE); iAcc += blockDim.x){
unsigned int pos = NRIGHE*(iAcc/stride)+(iAcc%stride);
tempMatSum[pos] += tempMatSum[pos + stride];
}
}
if(tid < NRIGHE) d_PICTUREOUT[vBase+((tid*WIDTH)+rigaDCT)] = (short)(floor(tempMatSum[tid*NRIGHE]+0.5f));
}//Second DCT end
}//Current block end
}
And finally the Working CPU reference function…
#include <stdlib.h>
#include <math.h>
extern "C"
void computeDCTCPU(
short *h_PICTUREOUT,//PIC OUT
float *h_PICTURETMP,//PIC TEMP
__int8 *h_PICTUREIN8,//PIC IN
int WIDTH, //WIDTH
int HEIGHT, //HEIGHT
int NRIGHE, //NRIGHE
int TOT_DBLOCKS //BLOCCHITOTALI
){
extern float h_COSENI[];
//COSTANTI
const unsigned int ConcBlksN = 16; //concurrents 8x8 DCT
const unsigned int latoDCT = 8; //elements per DCT row
//datablock size
const unsigned int COSDATA_N = latoDCT*latoDCT;
//Temporary matrix
float tempMatSum[COSDATA_N];
//Number of datablocks for the picture
unsigned int N_DBLOCKS_TOT = (WIDTH*HEIGHT)/COSDATA_N;
//datablock per row
unsigned int DBLK_PER_ROW = WIDTH / latoDCT;
//Needed Iteration to transform the whole picture
unsigned int TOT_FRAGMENTS = (unsigned int)(ceilf((float)N_DBLOCKS_TOT/ConcBlksN));
//Split the original picture in fragments
for(unsigned int fragment = 0; fragment < TOT_FRAGMENTS; fragment++){
//for each iteration it process ConcBlksN datablock
for(unsigned int dblk_ID = 0; dblk_ID < ConcBlksN; dblk_ID++){
//Global Id of datablock
unsigned int dblk_GlobalID = dblk_ID + (fragment*ConcBlksN);
if (dblk_GlobalID > (N_DBLOCKS_TOT-1)) break;
//Coordinates of datablock (x,y)
unsigned int dblk_Y = (dblk_GlobalID / DBLK_PER_ROW);
unsigned int dblk_X = (dblk_GlobalID % DBLK_PER_ROW);
//Indirizzo dell'elemento (0,0) del datablock corrente
unsigned int vBase =
(dblk_X*latoDCT) + (dblk_Y*DBLK_PER_ROW*COSDATA_N);
//First mono DCT
for(unsigned int rigaDCT = 0; rigaDCT < latoDCT; rigaDCT++){
//Indirizzo del primo elemento della riga corrente nel datablock
unsigned int relAddress = vBase + (WIDTH*rigaDCT);
for(unsigned int pos = 0; pos < COSDATA_N; pos++){
tempMatSum[pos] = (float)(h_PICTUREIN8[relAddress+(pos%latoDCT)]*h_COSENI[pos]);
}
for(unsigned int stride = (latoDCT>>1); stride > 0; stride >>= 1){
for(unsigned int iAcc = 0; iAcc < (stride*latoDCT); iAcc++){
unsigned int pos = (latoDCT*(iAcc/stride))+(iAcc%stride);
tempMatSum[pos] += tempMatSum[pos + stride];
}
}
for(unsigned int tid=0;tid < latoDCT;tid++){
h_PICTURETMP[vBase+((tid*WIDTH)+rigaDCT)] = tempMatSum[tid*latoDCT];
}
}//First DCT end
//second DCT mono
for(unsigned int rigaDCT = 0; rigaDCT < latoDCT; rigaDCT++){
//Indirizzo del primo elemento della riga corrente nel datablock
unsigned int relAddress = vBase + (WIDTH*rigaDCT);
for(unsigned int pos = 0; pos < COSDATA_N; pos++){
tempMatSum[pos] = (float)((h_PICTURETMP[relAddress+(pos%latoDCT)]*h_COSENI[pos]));
}
for(unsigned int stride = (latoDCT>>1); stride > 0; stride >>= 1){
for(unsigned int iAcc = 0; iAcc < (stride*latoDCT); iAcc++){
unsigned int pos = (latoDCT*(iAcc/stride))+(iAcc%stride);
tempMatSum[pos] += tempMatSum[pos + stride];
}
}
for(unsigned int tid=0;tid < latoDCT;tid++){
h_PICTUREOUT[vBase+((tid*WIDTH)+rigaDCT)] = (short)(floorf(tempMatSum[tid*latoDCT]+0.5f));
}
}//End of the second DCT
}//end of current datablock
}//End of iteration
}
Anyway I think the kernel not correctly manages the shared resources…and I am sure the threads are serialized when doing tree-like reduction even if I don’t mind.
If anyone could help me I’ll be very grateful…
PS.I’m sorry if the code is not too much readable!