32x32 Block-based 2D-DCT on image coding/concurrency problem

Hi, i’m beginner with CUDA (downloaded this morning)

i try to implement a CUDA program to compute each 32x32 DCT of image (ex 1920x1080 => 60x33 block of 32x32 pels and i want all their DCT)

So i have write a cuda program which work well in Emul mode, but failed in Device Mode , i suppose that’s come from a concurrency problem, but i d’on’t know how find it ?

If someone as an idea, it will be welcome…please found in the following my source code.

  1. init a memory buffer of uiWuiH32*32

  2. init transfrom coefficient

  3. transfer buffer and coefficient to device

  4. define block&thead and call the kernel

  5. get back the computed data

  6. compare GPU computation with CPU computation

/* 

*	Image Y float frame / DCT 32x32.

*/

// includes, system

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <memory.h>

#include <time.h>

// includes, project

#include "cutil.h"

////////////////////////////////////////////////////////////////////////////////

// declaration, forward

void vfRunTest(int argc, char** argv);

void vfRandomInit(float*, int);

void vfPrintDiff(float *data1, float *data2, int iBW, int iBH, int iStride, int iW, int iH);

void vfPrintBlock(float* , int, int);

//! 32x32 dct C/C++ - CPU Reference

extern "C" int init32x32();

extern "C" int dct32x32(float* block, int s);

extern "C" float* pf_internal_coeff;

#include "DCT32x32_Kernel.cu"

#define ACCURACY_FLOAT 1e-6f

//#define ACCURACY_FLOAT 1e-2f

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int main(int argc, char** argv)

{

    vfRunTest(argc, argv);

   CUT_EXIT(argc, argv);

}

////////////////////////////////////////////////////////////////////////////////

//! Run a simple test for CUDA

////////////////////////////////////////////////////////////////////////////////

void vfRunTest(int argc, char** argv)

{

    CUT_DEVICE_INIT();

   // set seed for rand()

    srand(2006);

   // allocate host memory for matrices A and B

    unsigned int uiN = 32;

	unsigned int uiW = 2;//1920/32

	unsigned int uiH = 1;//1080/32 = 33.75 => 33

	unsigned int uiStride = uiN*uiW;

	unsigned int uiSize = uiN*uiN*uiW*uiH;

	unsigned int uiMemSize = sizeof(float) * uiSize;

    float* pfBlock__Src = (float*) malloc(uiMemSize);

	

	float* pfBlock__CPU = (float*) malloc(uiMemSize);

	float* pfBlock__GPU = (float*) malloc(uiMemSize);

    

	// initialize float multiplier

	init32x32();

    // initialize host memory

    vfRandomInit(pfBlock__Src, uiSize);

    

	{//Start CUDA Processing Section 

 // allocate device memory

  float* pfBlock__CUDA; CUDA_SAFE_CALL(cudaMalloc((void**) &pfBlock__CUDA, uiMemSize));

  float* pfCoeff__CUDA; CUDA_SAFE_CALL(cudaMalloc((void**) &pfCoeff__CUDA, uiN*sizeof(float)));

      

  // create and start timer

  unsigned int timer = 0;

  CUT_SAFE_CALL(cutCreateTimer(&timer));

  CUT_SAFE_CALL(cutStartTimer(timer));

 // copy host memory to device

  CUDA_SAFE_CALL(cudaMemcpy(pfBlock__CUDA, pfBlock__Src, uiMemSize, cudaMemcpyHostToDevice) );

  CUDA_SAFE_CALL(cudaMemcpy(pfCoeff__CUDA, pf_internal_coeff,  uiN*sizeof(float), cudaMemcpyHostToDevice) );  

 printf("GPU start\n");

 /*// setup execution parameters - 1 thread 1 block

  dim3 threads(1, 1);

  dim3 grid(1 / threads.x, 1/ threads.y);

 for(unsigned int uiby = 0; uiby < uiH; uiby++){

  	for(unsigned int uibx = 0; uibx < uiW; uibx++){

  	// execute the kernel

  	CuImageDCT32x32<<< grid, threads >>>(pfBlock__CUDA+uiby*uiN*uiStride+uibx*uiN, uiStride, pfCoeff__CUDA);

  	}

  }//*/

 // setup execution parameters

  dim3 threads(uiW, 1);

  dim3 grid(uiW / threads.x, uiH/ threads.y);

  CuImageDCT32x32<<< grid, threads >>>(pfBlock__CUDA, uiStride, pfCoeff__CUDA);//*/

 // check if kernel execution generated and error

  CUT_CHECK_ERROR("Kernel execution failed");

 // copy result from device to host

  CUDA_SAFE_CALL(cudaMemcpy(pfBlock__GPU, pfBlock__CUDA, uiMemSize, cudaMemcpyDeviceToHost) );

  

  // stop and destroy timer

  CUT_SAFE_CALL(cutStopTimer(timer));

  printf("GPU Processing time: %f (ms) \n", cutGetTimerValue(timer));

  CUT_SAFE_CALL(cutDeleteTimer(timer));

 vfPrintBlock(pfBlock__GPU, uiN*uiW, uiN*uiH);

  fprintf(stdout, "\n"); getchar();//*/

 CUDA_SAFE_CALL(cudaFree(pfBlock__CUDA));

  CUDA_SAFE_CALL(cudaFree(pfCoeff__CUDA));

	}//End Cuda Processing

	{// compute reference solution

  memcpy(pfBlock__CPU, pfBlock__Src, uiMemSize);

 //float f_start_clock = (float)clock();

 unsigned int timer = 0;

  CUT_SAFE_CALL(cutCreateTimer(&timer));

  CUT_SAFE_CALL(cutStartTimer(timer));//*/

 for(unsigned int uiby = 0; uiby < uiH; uiby++){

  	for(unsigned int uibx = 0; uibx < uiW; uibx++){

  	dct32x32(pfBlock__CPU+uiby*uiN*uiStride+uibx*uiN, uiStride);

  	}

  }

 // stop and destroy timer

  CUT_SAFE_CALL(cutStopTimer(timer));

  printf("CPU based reference - Processing time: %f (ms) \n", cutGetTimerValue(timer));

  CUT_SAFE_CALL(cutDeleteTimer(timer));//*/

 /*float  f_finish_clock = (float)clock();

  float  f_duration = (float)(f_finish_clock - f_start_clock)/(float)CLOCKS_PER_SEC;

  printf("CPU based reference : %2.2f seconds\n", f_duration);//*/

 vfPrintBlock(pfBlock__CPU, uiN*uiW, uiN*uiH);

  fprintf(stdout, "\n"); getchar();//*/

	}

   // check result

    CUTBoolean res = cutCompareL2fe(pfBlock__CPU, pfBlock__GPU, uiSize, ACCURACY_FLOAT);

	printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED");//*/

    //if (res!=1) vfPrintDiff(pfBlock__CPU, pfBlock__GPU, uiN, uiN, uiStride, uiW, uiH);

   // clean up memory

    free(pfBlock__Src);

    free(pfBlock__CPU);

    free(pfBlock__GPU);

}

// Allocates a matrix with random float entries.

void vfRandomInit(float* data, int size)

{

    for (int i = 0; i < size; ++i)

        data[i] = rand() / (float)RAND_MAX;

}

void vfPrintBlock(float* data, int w, int h){

	for(unsigned int i = 0; i < h; i++){

  for(unsigned int j = 0; j < w; j++) {

  	fprintf(stdout, "%.2f ", data[i*w+j]);

  }

  fprintf(stdout, "\n");

	}

	return;

}

#define SCALER(v) (int)(v*10000.0f)

void vfPrintDiff(float *data1, float *data2, int iBW, int iBH, int iStride, int iW, int iH)

{

  int i,j,k,n,m;

  int error_count=0;

  for (j=0; j<iH; j++) {

    for (i=0; i<iW; i++) {

  for(m=0; m<iBH; m++) {

  	for(n=0; n<iBW; n++) {

    k = j*iBH*iStride + i*iBW + m*iStride + n;

    if (SCALER(data1[k]) != SCALER(data2[k])) {

    	printf("diff(%d,%d):(%d,%d) CPU=%4.4f, GPU=%4.4f \n", j, i, m, n, data1[k], data2[k]);

    	error_count++;

    	getchar();

    }

  	}

  }

    }

  }

  printf(" nTotal Errors = %d n", error_count);

}

and the kernel :

/* 

*	DCT32x32 with Cuda

*	Device code.

*/

#ifndef _DCT32x32_KERNEL_H_

#define _DCT32x32_KERNEL_H_

__device__ int uiBlock = 0;

__global__ void CuImageDCT32x32(float *pf_iodata, int s, float* pf_Coeff)

{	

	float pfTempBlock[1024];

	float fDC_Scaler;

	float fdonMonroScaler;

	fDC_Scaler = 0.7071067814f;

	fdonMonroScaler = 0.0625f;

	int u;

	float* c = pf_Coeff;

	float *f, *o;

	float T, Cfac;

	// Block index

    int bx = blockIdx.x;

    int by = blockIdx.y;

   // Thread index

    int tx = threadIdx.x;

    int ty = threadIdx.y;

#ifdef __DEVICE_EMULATION__

	printf("Block %d - %d : Thread %d - %d => %d \n", bx, by, tx, ty, uiBlock);

	uiBlock++;//*/

#endif

	float* pf_curr = pf_iodata + (by*32)*s /*pels based index*/ + bx*32 + ty*32*s + tx*32;

	for (u = 0; u < 32; u++){//1st dct pass line based

  f = &(pf_curr[u*s]);

 T = f[16]; 

  f[16] = f[1]; 

  f[1] = T; 

  T = f[8]; 

  f[8] = f[2]; 

  f[2] = T; 

  T = f[24]; 

  f[24] = f[3]; 

  f[3] = T; 

  T = f[20]; 

  f[20] = f[5]; 

  f[5] = T; 

  T = f[12]; 

  f[12] = f[6]; 

  f[6] = T; 

  T = f[28]; 

  f[28] = f[7]; 

  f[7] = T; 

  T = f[18]; 

  f[18] = f[9]; 

  f[9] = T; 

  T = f[26]; 

  f[26] = f[11]; 

  f[11] = T; 

  T = f[22]; 

  f[22] = f[13]; 

  f[13] = T; 

  T = f[30]; 

  f[30] = f[15]; 

  f[15] = T; 

  T = f[25]; 

  f[25] = f[19]; 

  f[19] = T; 

  T = f[29]; 

  f[29] = f[23]; 

  f[23] = T; 

  T = f[8]; 

  f[8] = f[1]; 

  f[1] = T; 

  T = f[4]; 

  f[4] = f[2]; 

  f[2] = T; 

  T = f[12]; 

  f[12] = f[3]; 

  f[3] = T; 

  T = f[10]; 

  f[10] = f[5]; 

  f[5] = T; 

  T = f[14]; 

  f[14] = f[7]; 

  f[7] = T; 

  T = f[13]; 

  f[13] = f[11]; 

  f[11] = T; 

  f += 16;

  T = f[8]; 

  f[8] = f[1]; 

  f[1] = T; 

  T = f[4]; 

  f[4] = f[2]; 

  f[2] = T; 

  T = f[12]; 

  f[12] = f[3]; 

  f[3] = T; 

  T = f[10]; 

  f[10] = f[5]; 

  f[5] = T; 

  T = f[14]; 

  f[14] = f[7]; 

  f[7] = T; 

  T = f[13]; 

  f[13] = f[11]; 

  f[11] = T; 

  f -= 16;

  T = f[31];

  f[31] = f[16];

  f[16] = T;

  T = f[30];

  f[30] = f[17];

  f[17] = T;

  T = f[29];

  f[29] = f[18];

  f[18] = T;

  T = f[28];

  f[28] = f[19];

  f[19] = T;

  T = f[27];

  f[27] = f[20];

  f[20] = T;

  T = f[26];

  f[26] = f[21];

  f[21] = T;

  T = f[25];

  f[25] = f[22];

  f[22] = T;

  T = f[24];

  f[24] = f[23];

  f[23] = T;

  Cfac = c[16];

  T = f[16];

  f[16] = Cfac *(f[0]-T);

  f[0] += T;

  Cfac = c[17];

  T = f[17];

  f[17] = Cfac *(f[1]-T);

  f[1] += T;

  Cfac = c[18];

  T = f[18];

  f[18] = Cfac *(f[2]-T);

  f[2] += T;

  Cfac = c[19];

  T = f[19];

  f[19] = Cfac *(f[3]-T);

  f[3] += T;

  Cfac = c[20];

  T = f[20];

  f[20] = Cfac *(f[4]-T);

  f[4] += T;

  Cfac = c[21];

  T = f[21];

  f[21] = Cfac *(f[5]-T);

  f[5] += T;

  Cfac = c[22];

  T = f[22];

  f[22] = Cfac *(f[6]-T);

  f[6] += T;

  Cfac = c[23];

  T = f[23];

  f[23] = Cfac *(f[7]-T);

  f[7] += T;

  Cfac = c[24];

  T = f[24];

  f[24] = Cfac *(f[8]-T);

  f[8] += T;

  Cfac = c[25];

  T = f[25];

  f[25] = Cfac *(f[9]-T);

  f[9] += T;

  Cfac = c[26];

  T = f[26];

  f[26] = Cfac *(f[10]-T);

  f[10] += T;

  Cfac = c[27];

  T = f[27];

  f[27] = Cfac *(f[11]-T);

  f[11] += T;

  Cfac = c[28];

  T = f[28];

  f[28] = Cfac *(f[12]-T);

  f[12] += T;

  Cfac = c[29];

  T = f[29];

  f[29] = Cfac *(f[13]-T);

  f[13] += T;

  Cfac = c[30];

  T = f[30];

  f[30] = Cfac *(f[14]-T);

  f[14] += T;

  Cfac = c[31];

  T = f[31];

  f[31] = Cfac *(f[15]-T);

  f[15] += T;

  Cfac = c[8];

  T = f[8];

  f[8] = Cfac *(f[0]-T);

  f[0] += T;

  T = f[24];

  f[24] = Cfac *(f[16]-T);

  f[16] += T;

  Cfac = c[9];

  T = f[9];

  f[9] = Cfac *(f[1]-T);

  f[1] += T;

  T = f[25];

  f[25] = Cfac *(f[17]-T);

  f[17] += T;

  Cfac = c[10];

  T = f[10];

  f[10] = Cfac *(f[2]-T);

  f[2] += T;

  T = f[26];

  f[26] = Cfac *(f[18]-T);

  f[18] += T;

  Cfac = c[11];

  T = f[11];

  f[11] = Cfac *(f[3]-T);

  f[3] += T;

  T = f[27];

  f[27] = Cfac *(f[19]-T);

  f[19] += T;

  Cfac = c[12];

  T = f[12];

  f[12] = Cfac *(f[4]-T);

  f[4] += T;

  T = f[28];

  f[28] = Cfac *(f[20]-T);

  f[20] += T;

  Cfac = c[13];

  T = f[13];

  f[13] = Cfac *(f[5]-T);

  f[5] += T;

  T = f[29];

  f[29] = Cfac *(f[21]-T);

  f[21] += T;

  Cfac = c[14];

  T = f[14];

  f[14] = Cfac *(f[6]-T);

  f[6] += T;

  T = f[30];

  f[30] = Cfac *(f[22]-T);

  f[22] += T;

  Cfac = c[15];

  T = f[15];

  f[15] = Cfac *(f[7]-T);

  f[7] += T;

  T = f[31];

  f[31] = Cfac *(f[23]-T);

  f[23] += T;

  Cfac = c[4];

  T = f[4];

  f[4] = Cfac *(f[0]-T);

  f[0] += T;

  T = f[12];

  f[12] = Cfac *(f[8]-T);

  f[8] += T;

  T = f[20];

  f[20] = Cfac *(f[16]-T);

  f[16] += T;

  T = f[28];

  f[28] = Cfac *(f[24]-T);

  f[24] += T;

  Cfac = c[5];

  T = f[5];

  f[5] = Cfac *(f[1]-T);

  f[1] += T;

  T = f[13];

  f[13] = Cfac *(f[9]-T);

  f[9] += T;

  T = f[21];

  f[21] = Cfac *(f[17]-T);

  f[17] += T;

  T = f[29];

  f[29] = Cfac *(f[25]-T);

  f[25] += T;

  Cfac = c[6];

  T = f[6];

  f[6] = Cfac *(f[2]-T);

  f[2] += T;

  T = f[14];

  f[14] = Cfac *(f[10]-T);

  f[10] += T;

  T = f[22];

  f[22] = Cfac *(f[18]-T);

  f[18] += T;

  T = f[30];

  f[30] = Cfac *(f[26]-T);

  f[26] += T;

  Cfac = c[7];

  T = f[7];

  f[7] = Cfac *(f[3]-T);

  f[3] += T;

  T = f[15];

  f[15] = Cfac *(f[11]-T);

  f[11] += T;

  T = f[23];

  f[23] = Cfac *(f[19]-T);

  f[19] += T;

  T = f[31];

  f[31] = Cfac *(f[27]-T);

  f[27] += T;

  Cfac = c[2];

  T = f[2];

  f[2] = Cfac *(f[0]-T);

  f[0] += T;

  T = f[6];

  f[6] = Cfac *(f[4]-T);

  f[4] += T;

  T = f[10];

  f[10] = Cfac *(f[8]-T);

  f[8] += T;

  T = f[14];

  f[14] = Cfac *(f[12]-T);

  f[12] += T;

  T = f[18];

  f[18] = Cfac *(f[16]-T);

  f[16] += T;

  T = f[22];

  f[22] = Cfac *(f[20]-T);

  f[20] += T;

  T = f[26];

  f[26] = Cfac *(f[24]-T);

  f[24] += T;

  T = f[30];

  f[30] = Cfac *(f[28]-T);

  f[28] += T;

  Cfac = c[3];

  T = f[3];

  f[3] = Cfac *(f[1]-T);

  f[1] += T;

  T = f[7];

  f[7] = Cfac *(f[5]-T);

  f[5] += T;

  T = f[11];

  f[11] = Cfac *(f[9]-T);

  f[9] += T;

  T = f[15];

  f[15] = Cfac *(f[13]-T);

  f[13] += T;

  T = f[19];

  f[19] = Cfac *(f[17]-T);

  f[17] += T;

  T = f[23];

  f[23] = Cfac *(f[21]-T);

  f[21] += T;

  T = f[27];

  f[27] = Cfac *(f[25]-T);

  f[25] += T;

  T = f[31];

  f[31] = Cfac *(f[29]-T);

  f[29] += T;

  Cfac = c[1];

  T = f[1];

  f[1] = Cfac *(f[0]-T);

  f[0] += T;

  T = f[3];

  f[3] = Cfac *(f[2]-T);

  f[2] += T;

  T = f[5];

  f[5] = Cfac *(f[4]-T);

  f[4] += T;

  T = f[7];

  f[7] = Cfac *(f[6]-T);

  f[6] += T;

  T = f[9];

  f[9] = Cfac *(f[8]-T);

  f[8] += T;

  T = f[11];

  f[11] = Cfac *(f[10]-T);

  f[10] += T;

  T = f[13];

  f[13] = Cfac *(f[12]-T);

  f[12] += T;

  T = f[15];

  f[15] = Cfac *(f[14]-T);

  f[14] += T;

  T = f[17];

  f[17] = Cfac *(f[16]-T);

  f[16] += T;

  T = f[19];

  f[19] = Cfac *(f[18]-T);

  f[18] += T;

  T = f[21];

  f[21] = Cfac *(f[20]-T);

  f[20] += T;

  T = f[23];

  f[23] = Cfac *(f[22]-T);

  f[22] += T;

  T = f[25];

  f[25] = Cfac *(f[24]-T);

  f[24] += T;

  T = f[27];

  f[27] = Cfac *(f[26]-T);

  f[26] += T;

  T = f[29];

  f[29] = Cfac *(f[28]-T);

  f[28] += T;

  T = f[31];

  f[31] = Cfac *(f[30]-T);

  f[30] += T;

  T = f[16]; 

  f[16] = f[1]; 

  f[1] = T; 

  T = f[8]; 

  f[8] = f[2]; 

  f[2] = T; 

  T = f[24]; 

  f[24] = f[3]; 

  f[3] = T; 

  T = f[20]; 

  f[20] = f[5]; 

  f[5] = T; 

  T = f[12]; 

  f[12] = f[6]; 

  f[6] = T; 

  T = f[28]; 

  f[28] = f[7]; 

  f[7] = T; 

  T = f[18]; 

  f[18] = f[9]; 

  f[9] = T; 

  T = f[26]; 

  f[26] = f[11]; 

  f[11] = T; 

  T = f[22]; 

  f[22] = f[13]; 

  f[13] = T; 

  T = f[30]; 

  f[30] = f[15]; 

  f[15] = T; 

  T = f[25]; 

  f[25] = f[19]; 

  f[19] = T; 

  T = f[29]; 

  f[29] = f[23]; 

  f[23] = T; 

 pfTempBlock[8*32+u] = f[8] + f[24];

  pfTempBlock[9*32+u] = f[9] + f[25];

  pfTempBlock[10*32+u] = f[10] + f[26];

  pfTempBlock[11*32+u] = f[11] + f[27];

  pfTempBlock[12*32+u] = f[12] + f[28];

  pfTempBlock[13*32+u] = f[13] + f[29];

  pfTempBlock[14*32+u] = f[14] + f[30];

  pfTempBlock[15*32+u] = f[15] + f[31];

  pfTempBlock[4*32+u] = f[4] + pfTempBlock[12*32+u];

  pfTempBlock[12*32+u] += f[20];

  pfTempBlock[20*32+u] = f[20] + f[28];

  pfTempBlock[5*32+u] = f[5] + pfTempBlock[13*32+u];

  pfTempBlock[13*32+u] += f[21];

  pfTempBlock[21*32+u] = f[21] + f[29];

  pfTempBlock[6*32+u] = f[6] + pfTempBlock[14*32+u];

  pfTempBlock[14*32+u] += f[22];

  pfTempBlock[22*32+u] = f[22] + f[30];

  pfTempBlock[7*32+u] = f[7] + pfTempBlock[15*32+u];

  pfTempBlock[15*32+u] += f[23];

  pfTempBlock[23*32+u] = f[23] + f[31];

  pfTempBlock[2*32+u] = f[2] + pfTempBlock[6*32+u];

  pfTempBlock[6*32+u] += pfTempBlock[10*32+u];

  pfTempBlock[10*32+u] += pfTempBlock[14*32+u];

  pfTempBlock[14*32+u] += f[18];

  pfTempBlock[18*32+u] = f[18] + pfTempBlock[22*32+u];

  pfTempBlock[22*32+u] += f[26];

  pfTempBlock[26*32+u] = f[26] + f[30];

  pfTempBlock[3*32+u] = f[3] + pfTempBlock[7*32+u];

  pfTempBlock[7*32+u] += pfTempBlock[11*32+u];

  pfTempBlock[11*32+u] += pfTempBlock[15*32+u];

  pfTempBlock[15*32+u] += f[19];

  pfTempBlock[16*32+u] = f[16];

  pfTempBlock[19*32+u] = f[19] + pfTempBlock[23*32+u];

  pfTempBlock[23*32+u] += f[27];

  pfTempBlock[27*32+u] = f[27] + f[31];

  pfTempBlock[32+u] = f[1] + pfTempBlock[3*32+u];

  pfTempBlock[3*32+u] += pfTempBlock[5*32+u];

  pfTempBlock[5*32+u] += pfTempBlock[7*32+u];

  pfTempBlock[7*32+u] += pfTempBlock[9*32+u];

  pfTempBlock[9*32+u] += pfTempBlock[11*32+u];

  pfTempBlock[11*32+u] += pfTempBlock[13*32+u];

  pfTempBlock[13*32+u] +=pfTempBlock[15*32+u];

  pfTempBlock[15*32+u] += f[17];

  pfTempBlock[17*32+u] = f[17] + pfTempBlock[19*32+u];

  pfTempBlock[19*32+u] += pfTempBlock[21*32+u];

  pfTempBlock[21*32+u] += pfTempBlock[23*32+u];

  pfTempBlock[23*32+u] += f[25];

  pfTempBlock[24*32+u] = f[24];

  pfTempBlock[25*32+u] = f[25] + pfTempBlock[27*32+u];

  pfTempBlock[27*32+u] += f[29];

  pfTempBlock[28*32+u] = f[28];

  pfTempBlock[29*32+u] = f[29] + f[31];

  pfTempBlock[30*32+u] = f[30];

  pfTempBlock[31*32+u] = f[31];

 //transpose dc & scaling

  pfTempBlock[u] = f[0] * fDC_Scaler;  

	}

	for (u = 0; u < 32; u++){//2nd dct pass row based on internal buffer, transpose output and post-scale

  f = &(pfTempBlock[u*32]);

  o = pf_curr;

 T = f[16]; 

  f[16] = f[1]; 

  f[1] = T; 

  T = f[8]; 

  f[8] = f[2]; 

  f[2] = T; 

  T = f[24]; 

  f[24] = f[3]; 

  f[3] = T; 

  T = f[20]; 

  f[20] = f[5]; 

  f[5] = T; 

  T = f[12]; 

  f[12] = f[6]; 

  f[6] = T; 

  T = f[28]; 

  f[28] = f[7]; 

  f[7] = T; 

  T = f[18]; 

  f[18] = f[9]; 

  f[9] = T; 

  T = f[26]; 

  f[26] = f[11]; 

  f[11] = T; 

  T = f[22]; 

  f[22] = f[13]; 

  f[13] = T; 

  T = f[30]; 

  f[30] = f[15]; 

  f[15] = T; 

  T = f[25]; 

  f[25] = f[19]; 

  f[19] = T; 

  T = f[29]; 

  f[29] = f[23]; 

  f[23] = T; 

  T = f[8]; 

  f[8] = f[1]; 

  f[1] = T; 

  T = f[4]; 

  f[4] = f[2]; 

  f[2] = T; 

  T = f[12]; 

  f[12] = f[3]; 

  f[3] = T; 

  T = f[10]; 

  f[10] = f[5]; 

  f[5] = T; 

  T = f[14]; 

  f[14] = f[7]; 

  f[7] = T; 

  T = f[13]; 

  f[13] = f[11]; 

  f[11] = T; 

  f += 16;

  T = f[8]; 

  f[8] = f[1]; 

  f[1] = T; 

  T = f[4]; 

  f[4] = f[2]; 

  f[2] = T; 

  T = f[12]; 

  f[12] = f[3]; 

  f[3] = T; 

  T = f[10]; 

  f[10] = f[5]; 

  f[5] = T; 

  T = f[14]; 

  f[14] = f[7]; 

  f[7] = T; 

  T = f[13]; 

  f[13] = f[11]; 

  f[11] = T; 

  f -= 16;

  T = f[31];

  f[31] = f[16];

  f[16] = T;

  T = f[30];

  f[30] = f[17];

  f[17] = T;

  T = f[29];

  f[29] = f[18];

  f[18] = T;

  T = f[28];

  f[28] = f[19];

  f[19] = T;

  T = f[27];

  f[27] = f[20];

  f[20] = T;

  T = f[26];

  f[26] = f[21];

  f[21] = T;

  T = f[25];

  f[25] = f[22];

  f[22] = T;

  T = f[24];

  f[24] = f[23];

  f[23] = T;

  Cfac = c[16];

  T = f[16];

  f[16] = Cfac *(f[0]-T);

  f[0] += T;

  Cfac = c[17];

  T = f[17];

  f[17] = Cfac *(f[1]-T);

  f[1] += T;

  Cfac = c[18];

  T = f[18];

  f[18] = Cfac *(f[2]-T);

  f[2] += T;

  Cfac = c[19];

  T = f[19];

  f[19] = Cfac *(f[3]-T);

  f[3] += T;

  Cfac = c[20];

  T = f[20];

  f[20] = Cfac *(f[4]-T);

  f[4] += T;

  Cfac = c[21];

  T = f[21];

  f[21] = Cfac *(f[5]-T);

  f[5] += T;

  Cfac = c[22];

  T = f[22];

  f[22] = Cfac *(f[6]-T);

  f[6] += T;

  Cfac = c[23];

  T = f[23];

  f[23] = Cfac *(f[7]-T);

  f[7] += T;

  Cfac = c[24];

  T = f[24];

  f[24] = Cfac *(f[8]-T);

  f[8] += T;

  Cfac = c[25];

  T = f[25];

  f[25] = Cfac *(f[9]-T);

  f[9] += T;

  Cfac = c[26];

  T = f[26];

  f[26] = Cfac *(f[10]-T);

  f[10] += T;

  Cfac = c[27];

  T = f[27];

  f[27] = Cfac *(f[11]-T);

  f[11] += T;

  Cfac = c[28];

  T = f[28];

  f[28] = Cfac *(f[12]-T);

  f[12] += T;

  Cfac = c[29];

  T = f[29];

  f[29] = Cfac *(f[13]-T);

  f[13] += T;

  Cfac = c[30];

  T = f[30];

  f[30] = Cfac *(f[14]-T);

  f[14] += T;

  Cfac = c[31];

  T = f[31];

  f[31] = Cfac *(f[15]-T);

  f[15] += T;

  Cfac = c[8];

  T = f[8];

  f[8] = Cfac *(f[0]-T);

  f[0] += T;

  T = f[24];

  f[24] = Cfac *(f[16]-T);

  f[16] += T;

  Cfac = c[9];

  T = f[9];

  f[9] = Cfac *(f[1]-T);

  f[1] += T;

  T = f[25];

  f[25] = Cfac *(f[17]-T);

  f[17] += T;

  Cfac = c[10];

  T = f[10];

  f[10] = Cfac *(f[2]-T);

  f[2] += T;

  T = f[26];

  f[26] = Cfac *(f[18]-T);

  f[18] += T;

  Cfac = c[11];

  T = f[11];

  f[11] = Cfac *(f[3]-T);

  f[3] += T;

  T = f[27];

  f[27] = Cfac *(f[19]-T);

  f[19] += T;

  Cfac = c[12];

  T = f[12];

  f[12] = Cfac *(f[4]-T);

  f[4] += T;

  T = f[28];

  f[28] = Cfac *(f[20]-T);

  f[20] += T;

  Cfac = c[13];

  T = f[13];

  f[13] = Cfac *(f[5]-T);

  f[5] += T;

  T = f[29];

  f[29] = Cfac *(f[21]-T);

  f[21] += T;

  Cfac = c[14];

  T = f[14];

  f[14] = Cfac *(f[6]-T);

  f[6] += T;

  T = f[30];

  f[30] = Cfac *(f[22]-T);

  f[22] += T;

  Cfac = c[15];

  T = f[15];

  f[15] = Cfac *(f[7]-T);

  f[7] += T;

  T = f[31];

  f[31] = Cfac *(f[23]-T);

  f[23] += T;

  Cfac = c[4];

  T = f[4];

  f[4] = Cfac *(f[0]-T);

  f[0] += T;

  T = f[12];

  f[12] = Cfac *(f[8]-T);

  f[8] += T;

  T = f[20];

  f[20] = Cfac *(f[16]-T);

  f[16] += T;

  T = f[28];

  f[28] = Cfac *(f[24]-T);

  f[24] += T;

  Cfac = c[5];

  T = f[5];

  f[5] = Cfac *(f[1]-T);

  f[1] += T;

  T = f[13];

  f[13] = Cfac *(f[9]-T);

  f[9] += T;

  T = f[21];

  f[21] = Cfac *(f[17]-T);

  f[17] += T;

  T = f[29];

  f[29] = Cfac *(f[25]-T);

  f[25] += T;

  Cfac = c[6];

  T = f[6];

  f[6] = Cfac *(f[2]-T);

  f[2] += T;

  T = f[14];

  f[14] = Cfac *(f[10]-T);

  f[10] += T;

  T = f[22];

  f[22] = Cfac *(f[18]-T);

  f[18] += T;

  T = f[30];

  f[30] = Cfac *(f[26]-T);

  f[26] += T;

  Cfac = c[7];

  T = f[7];

  f[7] = Cfac *(f[3]-T);

  f[3] += T;

  T = f[15];

  f[15] = Cfac *(f[11]-T);

  f[11] += T;

  T = f[23];

  f[23] = Cfac *(f[19]-T);

  f[19] += T;

  T = f[31];

  f[31] = Cfac *(f[27]-T);

  f[27] += T;

  Cfac = c[2];

  T = f[2];

  f[2] = Cfac *(f[0]-T);

  f[0] += T;

  T = f[6];

  f[6] = Cfac *(f[4]-T);

  f[4] += T;

  T = f[10];

  f[10] = Cfac *(f[8]-T);

  f[8] += T;

  T = f[14];

  f[14] = Cfac *(f[12]-T);

  f[12] += T;

  T = f[18];

  f[18] = Cfac *(f[16]-T);

  f[16] += T;

  T = f[22];

  f[22] = Cfac *(f[20]-T);

  f[20] += T;

  T = f[26];

  f[26] = Cfac *(f[24]-T);

  f[24] += T;

  T = f[30];

  f[30] = Cfac *(f[28]-T);

  f[28] += T;

  Cfac = c[3];

  T = f[3];

  f[3] = Cfac *(f[1]-T);

  f[1] += T;

  T = f[7];

  f[7] = Cfac *(f[5]-T);

  f[5] += T;

  T = f[11];

  f[11] = Cfac *(f[9]-T);

  f[9] += T;

  T = f[15];

  f[15] = Cfac *(f[13]-T);

  f[13] += T;

  T = f[19];

  f[19] = Cfac *(f[17]-T);

  f[17] += T;

  T = f[23];

  f[23] = Cfac *(f[21]-T);

  f[21] += T;

  T = f[27];

  f[27] = Cfac *(f[25]-T);

  f[25] += T;

  T = f[31];

  f[31] = Cfac *(f[29]-T);

  f[29] += T;

  Cfac = c[1];

  T = f[1];

  f[1] = Cfac *(f[0]-T);

  f[0] += T;

  T = f[3];

  f[3] = Cfac *(f[2]-T);

  f[2] += T;

  T = f[5];

  f[5] = Cfac *(f[4]-T);

  f[4] += T;

  T = f[7];

  f[7] = Cfac *(f[6]-T);

  f[6] += T;

  T = f[9];

  f[9] = Cfac *(f[8]-T);

  f[8] += T;

  T = f[11];

  f[11] = Cfac *(f[10]-T);

  f[10] += T;

  T = f[13];

  f[13] = Cfac *(f[12]-T);

  f[12] += T;

  T = f[15];

  f[15] = Cfac *(f[14]-T);

  f[14] += T;

  T = f[17];

  f[17] = Cfac *(f[16]-T);

  f[16] += T;

  T = f[19];

  f[19] = Cfac *(f[18]-T);

  f[18] += T;

  T = f[21];

  f[21] = Cfac *(f[20]-T);

  f[20] += T;

  T = f[23];

  f[23] = Cfac *(f[22]-T);

  f[22] += T;

  T = f[25];

  f[25] = Cfac *(f[24]-T);

  f[24] += T;

  T = f[27];

  f[27] = Cfac *(f[26]-T);

  f[26] += T;

  T = f[29];

  f[29] = Cfac *(f[28]-T);

  f[28] += T;

  T = f[31];

  f[31] = Cfac *(f[30]-T);

  f[30] += T;

  T = f[16]; 

  f[16] = f[1]; 

  f[1] = T; 

  T = f[8]; 

  f[8] = f[2]; 

  f[2] = T; 

  T = f[24]; 

  f[24] = f[3]; 

  f[3] = T; 

  T = f[20]; 

  f[20] = f[5]; 

  f[5] = T; 

  T = f[12]; 

  f[12] = f[6]; 

  f[6] = T; 

  T = f[28]; 

  f[28] = f[7]; 

  f[7] = T; 

  T = f[18]; 

  f[18] = f[9]; 

  f[9] = T; 

  T = f[26]; 

  f[26] = f[11]; 

  f[11] = T; 

  T = f[22]; 

  f[22] = f[13]; 

  f[13] = T; 

  T = f[30]; 

  f[30] = f[15]; 

  f[15] = T; 

  T = f[25]; 

  f[25] = f[19]; 

  f[19] = T; 

  T = f[29]; 

  f[29] = f[23]; 

  f[23] = T; 

 //fwd_sum

  o[8*s+u] = (f[8] + f[24])*fdonMonroScaler;

  o[9*s+u] = f[9] + f[25];

  o[10*s+u] = f[10] + f[26];

  o[11*s+u] = f[11] + f[27];

  o[12*s+u] = f[12] + f[28];

  o[13*s+u] = f[13] + f[29];

  o[14*s+u] = f[14] + f[30];

  o[15*s+u] = f[15] + f[31];

  o[4*s+u] = (f[4] + o[12*s+u])*fdonMonroScaler;

  o[12*s+u] += f[20]; o[12*s+u]*=fdonMonroScaler;

  o[20*s+u] = (f[20] + f[28])*fdonMonroScaler;

  o[5*s+u] = f[5] + o[13*s+u];

  o[13*s+u] += f[21];

  o[21*s+u] = f[21] + f[29];

  o[6*s+u] = f[6] + o[14*s+u];

  o[14*s+u] += f[22];

  o[22*s+u] = f[22] + f[30];

  o[7*s+u] = f[7] + o[15*s+u];

  o[15*s+u] += f[23];

  o[23*s+u] = f[23] + f[31];

  o[2*s+u] = (f[2] + o[6*s+u])*fdonMonroScaler; 

  o[6*s+u] += o[10*s+u]; o[6*s+u]*=fdonMonroScaler;

  o[10*s+u] += o[14*s+u]; o[10*s+u]*=fdonMonroScaler;

  o[14*s+u] += f[18]; o[14*s+u]*=fdonMonroScaler;

  o[18*s+u] = (f[18] + o[22*s+u])*fdonMonroScaler;

  o[22*s+u] += f[26]; o[22*s+u]*=fdonMonroScaler;

  o[26*s+u] = (f[26] + f[30])*fdonMonroScaler; 

  o[3*s+u] = f[3] + o[7*s+u];

  o[7*s+u] += o[11*s+u];

  o[11*s+u] += o[15*s+u];

  o[15*s+u] += f[19];

  o[16*s+u] = f[16] * fdonMonroScaler;

  o[19*s+u] = f[19] + o[23*s+u];

  o[23*s+u] += f[27];

  o[27*s+u] = f[27] + f[31];

  o[32+u] = (f[1] + o[3*s+u])*fdonMonroScaler; 

  o[3*s+u] += o[5*s+u]; o[3*s+u]*=fdonMonroScaler;

  o[5*s+u] += o[7*s+u]; o[5*s+u]*=fdonMonroScaler;

  o[7*s+u] += o[9*s+u]; o[7*s+u]*=fdonMonroScaler;

  o[9*s+u] += o[11*s+u]; o[9*s+u]*=fdonMonroScaler;

  o[11*s+u] += o[13*s+u]; o[11*s+u]*=fdonMonroScaler;

  o[13*s+u] +=o[15*s+u]; o[13*s+u]*=fdonMonroScaler;

  o[15*s+u] += f[17]; o[15*s+u]*=fdonMonroScaler;

  o[17*s+u] = f[17] + o[19*s+u]; o[17*s+u]*=fdonMonroScaler;

  o[19*s+u] += o[21*s+u]; o[19*s+u]*=fdonMonroScaler;

  o[21*s+u] += o[23*s+u]; o[21*s+u]*=fdonMonroScaler;

  o[23*s+u] += f[25]; o[23*s+u]*=fdonMonroScaler;

  o[24*s+u] = f[24]*fdonMonroScaler;

  o[25*s+u] = f[25] + o[27*s+u]; o[25*s+u]*=fdonMonroScaler;

  o[27*s+u] += f[29]; o[27*s+u]*=fdonMonroScaler;

  o[28*s+u] = f[28]* fdonMonroScaler;

  o[29*s+u] = (f[29] + f[31])* fdonMonroScaler;

  o[30*s+u] = f[30]* fdonMonroScaler;

  o[31*s+u] = f[31]* fdonMonroScaler;

 //transpose dc & scaling

  o[u] = f[0] * fDC_Scaler * fdonMonroScaler;

	}//*/

}

#endif // _DCT32x32_KERNEL_H_

I haven’t examined your code in detail, but here are some pointers:

The first thing I notice is that your blocks are only 2 x 1 threads - you should use at least 32 threads per block for performance.

Every thread seems to be writing to an address which is a function of the variable u, but u is not dependent on the thread index, so they will be overwriting each other.

Have you looked at the DCT example in the CUDA 2.0 SDK?

It’s seems that i have a bad understanding of some CUDA concept, i have just try to done the same thing done with OpenMP.

This 2.0 version was not available on the .fr web site, but i have found and setup. The result of dct8x8 program is disturbing (see below)…

Thank to your help.

Using device 0: GeForce 8800 GTX

CUDA sample DCT/IDCT implementation

===================================

Loading test image: barbara.bmp... [512 x 512]... Success

Running Gold 1 (CPU) version... Success

Running Gold 2 (CPU) version... Success

Running CUDA 1 (GPU) version... Success

Running CUDA 2 (GPU) version... Success

Dumping result to barbara_gold1.bmp... Success

Dumping result to barbara_gold2.bmp... Success

Dumping result to barbara_cuda1.bmp... Success

Dumping result to barbara_cuda2.bmp... Success

Processing time (CUDA 1) : 0.385775 ms

Processing time (CUDA 2) : 0.268302 ms

PSNR Original    <---> CPU(Gold 1) : 32.777126

PSNR Original    <---> CPU(Gold 2) : 32.777016

PSNR Original    <---> GPU(CUDA 1) : 14.204351

PSNR Original    <---> GPU(CUDA 2) : 14.204351

PSNR CPU(Gold 1) <---> GPU(CUDA 1) : 14.177763

PSNR CPU(Gold 2) <---> GPU(CUDA 2) : 14.178191

TEST FAILED! (CPU and GPU results differ too much)

Press ENTER to exit...