Hi, i’m beginner with CUDA (downloaded this morning)
i try to implement a CUDA program to compute each 32x32 DCT of image (ex 1920x1080 => 60x33 block of 32x32 pels and i want all their DCT)
So i have write a cuda program which work well in Emul mode, but failed in Device Mode , i suppose that’s come from a concurrency problem, but i d’on’t know how find it ?
If someone as an idea, it will be welcome…please found in the following my source code.
-
init a memory buffer of uiWuiH32*32
-
init transfrom coefficient
-
transfer buffer and coefficient to device
-
define block&thead and call the kernel
-
get back the computed data
-
compare GPU computation with CPU computation
/*
* Image Y float frame / DCT 32x32.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <memory.h>
#include <time.h>
// includes, project
#include "cutil.h"
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void vfRunTest(int argc, char** argv);
void vfRandomInit(float*, int);
void vfPrintDiff(float *data1, float *data2, int iBW, int iBH, int iStride, int iW, int iH);
void vfPrintBlock(float* , int, int);
//! 32x32 dct C/C++ - CPU Reference
extern "C" int init32x32();
extern "C" int dct32x32(float* block, int s);
extern "C" float* pf_internal_coeff;
#include "DCT32x32_Kernel.cu"
#define ACCURACY_FLOAT 1e-6f
//#define ACCURACY_FLOAT 1e-2f
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
vfRunTest(argc, argv);
CUT_EXIT(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void vfRunTest(int argc, char** argv)
{
CUT_DEVICE_INIT();
// set seed for rand()
srand(2006);
// allocate host memory for matrices A and B
unsigned int uiN = 32;
unsigned int uiW = 2;//1920/32
unsigned int uiH = 1;//1080/32 = 33.75 => 33
unsigned int uiStride = uiN*uiW;
unsigned int uiSize = uiN*uiN*uiW*uiH;
unsigned int uiMemSize = sizeof(float) * uiSize;
float* pfBlock__Src = (float*) malloc(uiMemSize);
float* pfBlock__CPU = (float*) malloc(uiMemSize);
float* pfBlock__GPU = (float*) malloc(uiMemSize);
// initialize float multiplier
init32x32();
// initialize host memory
vfRandomInit(pfBlock__Src, uiSize);
{//Start CUDA Processing Section
// allocate device memory
float* pfBlock__CUDA; CUDA_SAFE_CALL(cudaMalloc((void**) &pfBlock__CUDA, uiMemSize));
float* pfCoeff__CUDA; CUDA_SAFE_CALL(cudaMalloc((void**) &pfCoeff__CUDA, uiN*sizeof(float)));
// create and start timer
unsigned int timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));
// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(pfBlock__CUDA, pfBlock__Src, uiMemSize, cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(pfCoeff__CUDA, pf_internal_coeff, uiN*sizeof(float), cudaMemcpyHostToDevice) );
printf("GPU start\n");
/*// setup execution parameters - 1 thread 1 block
dim3 threads(1, 1);
dim3 grid(1 / threads.x, 1/ threads.y);
for(unsigned int uiby = 0; uiby < uiH; uiby++){
for(unsigned int uibx = 0; uibx < uiW; uibx++){
// execute the kernel
CuImageDCT32x32<<< grid, threads >>>(pfBlock__CUDA+uiby*uiN*uiStride+uibx*uiN, uiStride, pfCoeff__CUDA);
}
}//*/
// setup execution parameters
dim3 threads(uiW, 1);
dim3 grid(uiW / threads.x, uiH/ threads.y);
CuImageDCT32x32<<< grid, threads >>>(pfBlock__CUDA, uiStride, pfCoeff__CUDA);//*/
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
// copy result from device to host
CUDA_SAFE_CALL(cudaMemcpy(pfBlock__GPU, pfBlock__CUDA, uiMemSize, cudaMemcpyDeviceToHost) );
// stop and destroy timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf("GPU Processing time: %f (ms) \n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));
vfPrintBlock(pfBlock__GPU, uiN*uiW, uiN*uiH);
fprintf(stdout, "\n"); getchar();//*/
CUDA_SAFE_CALL(cudaFree(pfBlock__CUDA));
CUDA_SAFE_CALL(cudaFree(pfCoeff__CUDA));
}//End Cuda Processing
{// compute reference solution
memcpy(pfBlock__CPU, pfBlock__Src, uiMemSize);
//float f_start_clock = (float)clock();
unsigned int timer = 0;
CUT_SAFE_CALL(cutCreateTimer(&timer));
CUT_SAFE_CALL(cutStartTimer(timer));//*/
for(unsigned int uiby = 0; uiby < uiH; uiby++){
for(unsigned int uibx = 0; uibx < uiW; uibx++){
dct32x32(pfBlock__CPU+uiby*uiN*uiStride+uibx*uiN, uiStride);
}
}
// stop and destroy timer
CUT_SAFE_CALL(cutStopTimer(timer));
printf("CPU based reference - Processing time: %f (ms) \n", cutGetTimerValue(timer));
CUT_SAFE_CALL(cutDeleteTimer(timer));//*/
/*float f_finish_clock = (float)clock();
float f_duration = (float)(f_finish_clock - f_start_clock)/(float)CLOCKS_PER_SEC;
printf("CPU based reference : %2.2f seconds\n", f_duration);//*/
vfPrintBlock(pfBlock__CPU, uiN*uiW, uiN*uiH);
fprintf(stdout, "\n"); getchar();//*/
}
// check result
CUTBoolean res = cutCompareL2fe(pfBlock__CPU, pfBlock__GPU, uiSize, ACCURACY_FLOAT);
printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED");//*/
//if (res!=1) vfPrintDiff(pfBlock__CPU, pfBlock__GPU, uiN, uiN, uiStride, uiW, uiH);
// clean up memory
free(pfBlock__Src);
free(pfBlock__CPU);
free(pfBlock__GPU);
}
// Allocates a matrix with random float entries.
void vfRandomInit(float* data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
void vfPrintBlock(float* data, int w, int h){
for(unsigned int i = 0; i < h; i++){
for(unsigned int j = 0; j < w; j++) {
fprintf(stdout, "%.2f ", data[i*w+j]);
}
fprintf(stdout, "\n");
}
return;
}
#define SCALER(v) (int)(v*10000.0f)
void vfPrintDiff(float *data1, float *data2, int iBW, int iBH, int iStride, int iW, int iH)
{
int i,j,k,n,m;
int error_count=0;
for (j=0; j<iH; j++) {
for (i=0; i<iW; i++) {
for(m=0; m<iBH; m++) {
for(n=0; n<iBW; n++) {
k = j*iBH*iStride + i*iBW + m*iStride + n;
if (SCALER(data1[k]) != SCALER(data2[k])) {
printf("diff(%d,%d):(%d,%d) CPU=%4.4f, GPU=%4.4f \n", j, i, m, n, data1[k], data2[k]);
error_count++;
getchar();
}
}
}
}
}
printf(" nTotal Errors = %d n", error_count);
}
and the kernel :
/*
* DCT32x32 with Cuda
* Device code.
*/
#ifndef _DCT32x32_KERNEL_H_
#define _DCT32x32_KERNEL_H_
__device__ int uiBlock = 0;
__global__ void CuImageDCT32x32(float *pf_iodata, int s, float* pf_Coeff)
{
float pfTempBlock[1024];
float fDC_Scaler;
float fdonMonroScaler;
fDC_Scaler = 0.7071067814f;
fdonMonroScaler = 0.0625f;
int u;
float* c = pf_Coeff;
float *f, *o;
float T, Cfac;
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
#ifdef __DEVICE_EMULATION__
printf("Block %d - %d : Thread %d - %d => %d \n", bx, by, tx, ty, uiBlock);
uiBlock++;//*/
#endif
float* pf_curr = pf_iodata + (by*32)*s /*pels based index*/ + bx*32 + ty*32*s + tx*32;
for (u = 0; u < 32; u++){//1st dct pass line based
f = &(pf_curr[u*s]);
T = f[16];
f[16] = f[1];
f[1] = T;
T = f[8];
f[8] = f[2];
f[2] = T;
T = f[24];
f[24] = f[3];
f[3] = T;
T = f[20];
f[20] = f[5];
f[5] = T;
T = f[12];
f[12] = f[6];
f[6] = T;
T = f[28];
f[28] = f[7];
f[7] = T;
T = f[18];
f[18] = f[9];
f[9] = T;
T = f[26];
f[26] = f[11];
f[11] = T;
T = f[22];
f[22] = f[13];
f[13] = T;
T = f[30];
f[30] = f[15];
f[15] = T;
T = f[25];
f[25] = f[19];
f[19] = T;
T = f[29];
f[29] = f[23];
f[23] = T;
T = f[8];
f[8] = f[1];
f[1] = T;
T = f[4];
f[4] = f[2];
f[2] = T;
T = f[12];
f[12] = f[3];
f[3] = T;
T = f[10];
f[10] = f[5];
f[5] = T;
T = f[14];
f[14] = f[7];
f[7] = T;
T = f[13];
f[13] = f[11];
f[11] = T;
f += 16;
T = f[8];
f[8] = f[1];
f[1] = T;
T = f[4];
f[4] = f[2];
f[2] = T;
T = f[12];
f[12] = f[3];
f[3] = T;
T = f[10];
f[10] = f[5];
f[5] = T;
T = f[14];
f[14] = f[7];
f[7] = T;
T = f[13];
f[13] = f[11];
f[11] = T;
f -= 16;
T = f[31];
f[31] = f[16];
f[16] = T;
T = f[30];
f[30] = f[17];
f[17] = T;
T = f[29];
f[29] = f[18];
f[18] = T;
T = f[28];
f[28] = f[19];
f[19] = T;
T = f[27];
f[27] = f[20];
f[20] = T;
T = f[26];
f[26] = f[21];
f[21] = T;
T = f[25];
f[25] = f[22];
f[22] = T;
T = f[24];
f[24] = f[23];
f[23] = T;
Cfac = c[16];
T = f[16];
f[16] = Cfac *(f[0]-T);
f[0] += T;
Cfac = c[17];
T = f[17];
f[17] = Cfac *(f[1]-T);
f[1] += T;
Cfac = c[18];
T = f[18];
f[18] = Cfac *(f[2]-T);
f[2] += T;
Cfac = c[19];
T = f[19];
f[19] = Cfac *(f[3]-T);
f[3] += T;
Cfac = c[20];
T = f[20];
f[20] = Cfac *(f[4]-T);
f[4] += T;
Cfac = c[21];
T = f[21];
f[21] = Cfac *(f[5]-T);
f[5] += T;
Cfac = c[22];
T = f[22];
f[22] = Cfac *(f[6]-T);
f[6] += T;
Cfac = c[23];
T = f[23];
f[23] = Cfac *(f[7]-T);
f[7] += T;
Cfac = c[24];
T = f[24];
f[24] = Cfac *(f[8]-T);
f[8] += T;
Cfac = c[25];
T = f[25];
f[25] = Cfac *(f[9]-T);
f[9] += T;
Cfac = c[26];
T = f[26];
f[26] = Cfac *(f[10]-T);
f[10] += T;
Cfac = c[27];
T = f[27];
f[27] = Cfac *(f[11]-T);
f[11] += T;
Cfac = c[28];
T = f[28];
f[28] = Cfac *(f[12]-T);
f[12] += T;
Cfac = c[29];
T = f[29];
f[29] = Cfac *(f[13]-T);
f[13] += T;
Cfac = c[30];
T = f[30];
f[30] = Cfac *(f[14]-T);
f[14] += T;
Cfac = c[31];
T = f[31];
f[31] = Cfac *(f[15]-T);
f[15] += T;
Cfac = c[8];
T = f[8];
f[8] = Cfac *(f[0]-T);
f[0] += T;
T = f[24];
f[24] = Cfac *(f[16]-T);
f[16] += T;
Cfac = c[9];
T = f[9];
f[9] = Cfac *(f[1]-T);
f[1] += T;
T = f[25];
f[25] = Cfac *(f[17]-T);
f[17] += T;
Cfac = c[10];
T = f[10];
f[10] = Cfac *(f[2]-T);
f[2] += T;
T = f[26];
f[26] = Cfac *(f[18]-T);
f[18] += T;
Cfac = c[11];
T = f[11];
f[11] = Cfac *(f[3]-T);
f[3] += T;
T = f[27];
f[27] = Cfac *(f[19]-T);
f[19] += T;
Cfac = c[12];
T = f[12];
f[12] = Cfac *(f[4]-T);
f[4] += T;
T = f[28];
f[28] = Cfac *(f[20]-T);
f[20] += T;
Cfac = c[13];
T = f[13];
f[13] = Cfac *(f[5]-T);
f[5] += T;
T = f[29];
f[29] = Cfac *(f[21]-T);
f[21] += T;
Cfac = c[14];
T = f[14];
f[14] = Cfac *(f[6]-T);
f[6] += T;
T = f[30];
f[30] = Cfac *(f[22]-T);
f[22] += T;
Cfac = c[15];
T = f[15];
f[15] = Cfac *(f[7]-T);
f[7] += T;
T = f[31];
f[31] = Cfac *(f[23]-T);
f[23] += T;
Cfac = c[4];
T = f[4];
f[4] = Cfac *(f[0]-T);
f[0] += T;
T = f[12];
f[12] = Cfac *(f[8]-T);
f[8] += T;
T = f[20];
f[20] = Cfac *(f[16]-T);
f[16] += T;
T = f[28];
f[28] = Cfac *(f[24]-T);
f[24] += T;
Cfac = c[5];
T = f[5];
f[5] = Cfac *(f[1]-T);
f[1] += T;
T = f[13];
f[13] = Cfac *(f[9]-T);
f[9] += T;
T = f[21];
f[21] = Cfac *(f[17]-T);
f[17] += T;
T = f[29];
f[29] = Cfac *(f[25]-T);
f[25] += T;
Cfac = c[6];
T = f[6];
f[6] = Cfac *(f[2]-T);
f[2] += T;
T = f[14];
f[14] = Cfac *(f[10]-T);
f[10] += T;
T = f[22];
f[22] = Cfac *(f[18]-T);
f[18] += T;
T = f[30];
f[30] = Cfac *(f[26]-T);
f[26] += T;
Cfac = c[7];
T = f[7];
f[7] = Cfac *(f[3]-T);
f[3] += T;
T = f[15];
f[15] = Cfac *(f[11]-T);
f[11] += T;
T = f[23];
f[23] = Cfac *(f[19]-T);
f[19] += T;
T = f[31];
f[31] = Cfac *(f[27]-T);
f[27] += T;
Cfac = c[2];
T = f[2];
f[2] = Cfac *(f[0]-T);
f[0] += T;
T = f[6];
f[6] = Cfac *(f[4]-T);
f[4] += T;
T = f[10];
f[10] = Cfac *(f[8]-T);
f[8] += T;
T = f[14];
f[14] = Cfac *(f[12]-T);
f[12] += T;
T = f[18];
f[18] = Cfac *(f[16]-T);
f[16] += T;
T = f[22];
f[22] = Cfac *(f[20]-T);
f[20] += T;
T = f[26];
f[26] = Cfac *(f[24]-T);
f[24] += T;
T = f[30];
f[30] = Cfac *(f[28]-T);
f[28] += T;
Cfac = c[3];
T = f[3];
f[3] = Cfac *(f[1]-T);
f[1] += T;
T = f[7];
f[7] = Cfac *(f[5]-T);
f[5] += T;
T = f[11];
f[11] = Cfac *(f[9]-T);
f[9] += T;
T = f[15];
f[15] = Cfac *(f[13]-T);
f[13] += T;
T = f[19];
f[19] = Cfac *(f[17]-T);
f[17] += T;
T = f[23];
f[23] = Cfac *(f[21]-T);
f[21] += T;
T = f[27];
f[27] = Cfac *(f[25]-T);
f[25] += T;
T = f[31];
f[31] = Cfac *(f[29]-T);
f[29] += T;
Cfac = c[1];
T = f[1];
f[1] = Cfac *(f[0]-T);
f[0] += T;
T = f[3];
f[3] = Cfac *(f[2]-T);
f[2] += T;
T = f[5];
f[5] = Cfac *(f[4]-T);
f[4] += T;
T = f[7];
f[7] = Cfac *(f[6]-T);
f[6] += T;
T = f[9];
f[9] = Cfac *(f[8]-T);
f[8] += T;
T = f[11];
f[11] = Cfac *(f[10]-T);
f[10] += T;
T = f[13];
f[13] = Cfac *(f[12]-T);
f[12] += T;
T = f[15];
f[15] = Cfac *(f[14]-T);
f[14] += T;
T = f[17];
f[17] = Cfac *(f[16]-T);
f[16] += T;
T = f[19];
f[19] = Cfac *(f[18]-T);
f[18] += T;
T = f[21];
f[21] = Cfac *(f[20]-T);
f[20] += T;
T = f[23];
f[23] = Cfac *(f[22]-T);
f[22] += T;
T = f[25];
f[25] = Cfac *(f[24]-T);
f[24] += T;
T = f[27];
f[27] = Cfac *(f[26]-T);
f[26] += T;
T = f[29];
f[29] = Cfac *(f[28]-T);
f[28] += T;
T = f[31];
f[31] = Cfac *(f[30]-T);
f[30] += T;
T = f[16];
f[16] = f[1];
f[1] = T;
T = f[8];
f[8] = f[2];
f[2] = T;
T = f[24];
f[24] = f[3];
f[3] = T;
T = f[20];
f[20] = f[5];
f[5] = T;
T = f[12];
f[12] = f[6];
f[6] = T;
T = f[28];
f[28] = f[7];
f[7] = T;
T = f[18];
f[18] = f[9];
f[9] = T;
T = f[26];
f[26] = f[11];
f[11] = T;
T = f[22];
f[22] = f[13];
f[13] = T;
T = f[30];
f[30] = f[15];
f[15] = T;
T = f[25];
f[25] = f[19];
f[19] = T;
T = f[29];
f[29] = f[23];
f[23] = T;
pfTempBlock[8*32+u] = f[8] + f[24];
pfTempBlock[9*32+u] = f[9] + f[25];
pfTempBlock[10*32+u] = f[10] + f[26];
pfTempBlock[11*32+u] = f[11] + f[27];
pfTempBlock[12*32+u] = f[12] + f[28];
pfTempBlock[13*32+u] = f[13] + f[29];
pfTempBlock[14*32+u] = f[14] + f[30];
pfTempBlock[15*32+u] = f[15] + f[31];
pfTempBlock[4*32+u] = f[4] + pfTempBlock[12*32+u];
pfTempBlock[12*32+u] += f[20];
pfTempBlock[20*32+u] = f[20] + f[28];
pfTempBlock[5*32+u] = f[5] + pfTempBlock[13*32+u];
pfTempBlock[13*32+u] += f[21];
pfTempBlock[21*32+u] = f[21] + f[29];
pfTempBlock[6*32+u] = f[6] + pfTempBlock[14*32+u];
pfTempBlock[14*32+u] += f[22];
pfTempBlock[22*32+u] = f[22] + f[30];
pfTempBlock[7*32+u] = f[7] + pfTempBlock[15*32+u];
pfTempBlock[15*32+u] += f[23];
pfTempBlock[23*32+u] = f[23] + f[31];
pfTempBlock[2*32+u] = f[2] + pfTempBlock[6*32+u];
pfTempBlock[6*32+u] += pfTempBlock[10*32+u];
pfTempBlock[10*32+u] += pfTempBlock[14*32+u];
pfTempBlock[14*32+u] += f[18];
pfTempBlock[18*32+u] = f[18] + pfTempBlock[22*32+u];
pfTempBlock[22*32+u] += f[26];
pfTempBlock[26*32+u] = f[26] + f[30];
pfTempBlock[3*32+u] = f[3] + pfTempBlock[7*32+u];
pfTempBlock[7*32+u] += pfTempBlock[11*32+u];
pfTempBlock[11*32+u] += pfTempBlock[15*32+u];
pfTempBlock[15*32+u] += f[19];
pfTempBlock[16*32+u] = f[16];
pfTempBlock[19*32+u] = f[19] + pfTempBlock[23*32+u];
pfTempBlock[23*32+u] += f[27];
pfTempBlock[27*32+u] = f[27] + f[31];
pfTempBlock[32+u] = f[1] + pfTempBlock[3*32+u];
pfTempBlock[3*32+u] += pfTempBlock[5*32+u];
pfTempBlock[5*32+u] += pfTempBlock[7*32+u];
pfTempBlock[7*32+u] += pfTempBlock[9*32+u];
pfTempBlock[9*32+u] += pfTempBlock[11*32+u];
pfTempBlock[11*32+u] += pfTempBlock[13*32+u];
pfTempBlock[13*32+u] +=pfTempBlock[15*32+u];
pfTempBlock[15*32+u] += f[17];
pfTempBlock[17*32+u] = f[17] + pfTempBlock[19*32+u];
pfTempBlock[19*32+u] += pfTempBlock[21*32+u];
pfTempBlock[21*32+u] += pfTempBlock[23*32+u];
pfTempBlock[23*32+u] += f[25];
pfTempBlock[24*32+u] = f[24];
pfTempBlock[25*32+u] = f[25] + pfTempBlock[27*32+u];
pfTempBlock[27*32+u] += f[29];
pfTempBlock[28*32+u] = f[28];
pfTempBlock[29*32+u] = f[29] + f[31];
pfTempBlock[30*32+u] = f[30];
pfTempBlock[31*32+u] = f[31];
//transpose dc & scaling
pfTempBlock[u] = f[0] * fDC_Scaler;
}
for (u = 0; u < 32; u++){//2nd dct pass row based on internal buffer, transpose output and post-scale
f = &(pfTempBlock[u*32]);
o = pf_curr;
T = f[16];
f[16] = f[1];
f[1] = T;
T = f[8];
f[8] = f[2];
f[2] = T;
T = f[24];
f[24] = f[3];
f[3] = T;
T = f[20];
f[20] = f[5];
f[5] = T;
T = f[12];
f[12] = f[6];
f[6] = T;
T = f[28];
f[28] = f[7];
f[7] = T;
T = f[18];
f[18] = f[9];
f[9] = T;
T = f[26];
f[26] = f[11];
f[11] = T;
T = f[22];
f[22] = f[13];
f[13] = T;
T = f[30];
f[30] = f[15];
f[15] = T;
T = f[25];
f[25] = f[19];
f[19] = T;
T = f[29];
f[29] = f[23];
f[23] = T;
T = f[8];
f[8] = f[1];
f[1] = T;
T = f[4];
f[4] = f[2];
f[2] = T;
T = f[12];
f[12] = f[3];
f[3] = T;
T = f[10];
f[10] = f[5];
f[5] = T;
T = f[14];
f[14] = f[7];
f[7] = T;
T = f[13];
f[13] = f[11];
f[11] = T;
f += 16;
T = f[8];
f[8] = f[1];
f[1] = T;
T = f[4];
f[4] = f[2];
f[2] = T;
T = f[12];
f[12] = f[3];
f[3] = T;
T = f[10];
f[10] = f[5];
f[5] = T;
T = f[14];
f[14] = f[7];
f[7] = T;
T = f[13];
f[13] = f[11];
f[11] = T;
f -= 16;
T = f[31];
f[31] = f[16];
f[16] = T;
T = f[30];
f[30] = f[17];
f[17] = T;
T = f[29];
f[29] = f[18];
f[18] = T;
T = f[28];
f[28] = f[19];
f[19] = T;
T = f[27];
f[27] = f[20];
f[20] = T;
T = f[26];
f[26] = f[21];
f[21] = T;
T = f[25];
f[25] = f[22];
f[22] = T;
T = f[24];
f[24] = f[23];
f[23] = T;
Cfac = c[16];
T = f[16];
f[16] = Cfac *(f[0]-T);
f[0] += T;
Cfac = c[17];
T = f[17];
f[17] = Cfac *(f[1]-T);
f[1] += T;
Cfac = c[18];
T = f[18];
f[18] = Cfac *(f[2]-T);
f[2] += T;
Cfac = c[19];
T = f[19];
f[19] = Cfac *(f[3]-T);
f[3] += T;
Cfac = c[20];
T = f[20];
f[20] = Cfac *(f[4]-T);
f[4] += T;
Cfac = c[21];
T = f[21];
f[21] = Cfac *(f[5]-T);
f[5] += T;
Cfac = c[22];
T = f[22];
f[22] = Cfac *(f[6]-T);
f[6] += T;
Cfac = c[23];
T = f[23];
f[23] = Cfac *(f[7]-T);
f[7] += T;
Cfac = c[24];
T = f[24];
f[24] = Cfac *(f[8]-T);
f[8] += T;
Cfac = c[25];
T = f[25];
f[25] = Cfac *(f[9]-T);
f[9] += T;
Cfac = c[26];
T = f[26];
f[26] = Cfac *(f[10]-T);
f[10] += T;
Cfac = c[27];
T = f[27];
f[27] = Cfac *(f[11]-T);
f[11] += T;
Cfac = c[28];
T = f[28];
f[28] = Cfac *(f[12]-T);
f[12] += T;
Cfac = c[29];
T = f[29];
f[29] = Cfac *(f[13]-T);
f[13] += T;
Cfac = c[30];
T = f[30];
f[30] = Cfac *(f[14]-T);
f[14] += T;
Cfac = c[31];
T = f[31];
f[31] = Cfac *(f[15]-T);
f[15] += T;
Cfac = c[8];
T = f[8];
f[8] = Cfac *(f[0]-T);
f[0] += T;
T = f[24];
f[24] = Cfac *(f[16]-T);
f[16] += T;
Cfac = c[9];
T = f[9];
f[9] = Cfac *(f[1]-T);
f[1] += T;
T = f[25];
f[25] = Cfac *(f[17]-T);
f[17] += T;
Cfac = c[10];
T = f[10];
f[10] = Cfac *(f[2]-T);
f[2] += T;
T = f[26];
f[26] = Cfac *(f[18]-T);
f[18] += T;
Cfac = c[11];
T = f[11];
f[11] = Cfac *(f[3]-T);
f[3] += T;
T = f[27];
f[27] = Cfac *(f[19]-T);
f[19] += T;
Cfac = c[12];
T = f[12];
f[12] = Cfac *(f[4]-T);
f[4] += T;
T = f[28];
f[28] = Cfac *(f[20]-T);
f[20] += T;
Cfac = c[13];
T = f[13];
f[13] = Cfac *(f[5]-T);
f[5] += T;
T = f[29];
f[29] = Cfac *(f[21]-T);
f[21] += T;
Cfac = c[14];
T = f[14];
f[14] = Cfac *(f[6]-T);
f[6] += T;
T = f[30];
f[30] = Cfac *(f[22]-T);
f[22] += T;
Cfac = c[15];
T = f[15];
f[15] = Cfac *(f[7]-T);
f[7] += T;
T = f[31];
f[31] = Cfac *(f[23]-T);
f[23] += T;
Cfac = c[4];
T = f[4];
f[4] = Cfac *(f[0]-T);
f[0] += T;
T = f[12];
f[12] = Cfac *(f[8]-T);
f[8] += T;
T = f[20];
f[20] = Cfac *(f[16]-T);
f[16] += T;
T = f[28];
f[28] = Cfac *(f[24]-T);
f[24] += T;
Cfac = c[5];
T = f[5];
f[5] = Cfac *(f[1]-T);
f[1] += T;
T = f[13];
f[13] = Cfac *(f[9]-T);
f[9] += T;
T = f[21];
f[21] = Cfac *(f[17]-T);
f[17] += T;
T = f[29];
f[29] = Cfac *(f[25]-T);
f[25] += T;
Cfac = c[6];
T = f[6];
f[6] = Cfac *(f[2]-T);
f[2] += T;
T = f[14];
f[14] = Cfac *(f[10]-T);
f[10] += T;
T = f[22];
f[22] = Cfac *(f[18]-T);
f[18] += T;
T = f[30];
f[30] = Cfac *(f[26]-T);
f[26] += T;
Cfac = c[7];
T = f[7];
f[7] = Cfac *(f[3]-T);
f[3] += T;
T = f[15];
f[15] = Cfac *(f[11]-T);
f[11] += T;
T = f[23];
f[23] = Cfac *(f[19]-T);
f[19] += T;
T = f[31];
f[31] = Cfac *(f[27]-T);
f[27] += T;
Cfac = c[2];
T = f[2];
f[2] = Cfac *(f[0]-T);
f[0] += T;
T = f[6];
f[6] = Cfac *(f[4]-T);
f[4] += T;
T = f[10];
f[10] = Cfac *(f[8]-T);
f[8] += T;
T = f[14];
f[14] = Cfac *(f[12]-T);
f[12] += T;
T = f[18];
f[18] = Cfac *(f[16]-T);
f[16] += T;
T = f[22];
f[22] = Cfac *(f[20]-T);
f[20] += T;
T = f[26];
f[26] = Cfac *(f[24]-T);
f[24] += T;
T = f[30];
f[30] = Cfac *(f[28]-T);
f[28] += T;
Cfac = c[3];
T = f[3];
f[3] = Cfac *(f[1]-T);
f[1] += T;
T = f[7];
f[7] = Cfac *(f[5]-T);
f[5] += T;
T = f[11];
f[11] = Cfac *(f[9]-T);
f[9] += T;
T = f[15];
f[15] = Cfac *(f[13]-T);
f[13] += T;
T = f[19];
f[19] = Cfac *(f[17]-T);
f[17] += T;
T = f[23];
f[23] = Cfac *(f[21]-T);
f[21] += T;
T = f[27];
f[27] = Cfac *(f[25]-T);
f[25] += T;
T = f[31];
f[31] = Cfac *(f[29]-T);
f[29] += T;
Cfac = c[1];
T = f[1];
f[1] = Cfac *(f[0]-T);
f[0] += T;
T = f[3];
f[3] = Cfac *(f[2]-T);
f[2] += T;
T = f[5];
f[5] = Cfac *(f[4]-T);
f[4] += T;
T = f[7];
f[7] = Cfac *(f[6]-T);
f[6] += T;
T = f[9];
f[9] = Cfac *(f[8]-T);
f[8] += T;
T = f[11];
f[11] = Cfac *(f[10]-T);
f[10] += T;
T = f[13];
f[13] = Cfac *(f[12]-T);
f[12] += T;
T = f[15];
f[15] = Cfac *(f[14]-T);
f[14] += T;
T = f[17];
f[17] = Cfac *(f[16]-T);
f[16] += T;
T = f[19];
f[19] = Cfac *(f[18]-T);
f[18] += T;
T = f[21];
f[21] = Cfac *(f[20]-T);
f[20] += T;
T = f[23];
f[23] = Cfac *(f[22]-T);
f[22] += T;
T = f[25];
f[25] = Cfac *(f[24]-T);
f[24] += T;
T = f[27];
f[27] = Cfac *(f[26]-T);
f[26] += T;
T = f[29];
f[29] = Cfac *(f[28]-T);
f[28] += T;
T = f[31];
f[31] = Cfac *(f[30]-T);
f[30] += T;
T = f[16];
f[16] = f[1];
f[1] = T;
T = f[8];
f[8] = f[2];
f[2] = T;
T = f[24];
f[24] = f[3];
f[3] = T;
T = f[20];
f[20] = f[5];
f[5] = T;
T = f[12];
f[12] = f[6];
f[6] = T;
T = f[28];
f[28] = f[7];
f[7] = T;
T = f[18];
f[18] = f[9];
f[9] = T;
T = f[26];
f[26] = f[11];
f[11] = T;
T = f[22];
f[22] = f[13];
f[13] = T;
T = f[30];
f[30] = f[15];
f[15] = T;
T = f[25];
f[25] = f[19];
f[19] = T;
T = f[29];
f[29] = f[23];
f[23] = T;
//fwd_sum
o[8*s+u] = (f[8] + f[24])*fdonMonroScaler;
o[9*s+u] = f[9] + f[25];
o[10*s+u] = f[10] + f[26];
o[11*s+u] = f[11] + f[27];
o[12*s+u] = f[12] + f[28];
o[13*s+u] = f[13] + f[29];
o[14*s+u] = f[14] + f[30];
o[15*s+u] = f[15] + f[31];
o[4*s+u] = (f[4] + o[12*s+u])*fdonMonroScaler;
o[12*s+u] += f[20]; o[12*s+u]*=fdonMonroScaler;
o[20*s+u] = (f[20] + f[28])*fdonMonroScaler;
o[5*s+u] = f[5] + o[13*s+u];
o[13*s+u] += f[21];
o[21*s+u] = f[21] + f[29];
o[6*s+u] = f[6] + o[14*s+u];
o[14*s+u] += f[22];
o[22*s+u] = f[22] + f[30];
o[7*s+u] = f[7] + o[15*s+u];
o[15*s+u] += f[23];
o[23*s+u] = f[23] + f[31];
o[2*s+u] = (f[2] + o[6*s+u])*fdonMonroScaler;
o[6*s+u] += o[10*s+u]; o[6*s+u]*=fdonMonroScaler;
o[10*s+u] += o[14*s+u]; o[10*s+u]*=fdonMonroScaler;
o[14*s+u] += f[18]; o[14*s+u]*=fdonMonroScaler;
o[18*s+u] = (f[18] + o[22*s+u])*fdonMonroScaler;
o[22*s+u] += f[26]; o[22*s+u]*=fdonMonroScaler;
o[26*s+u] = (f[26] + f[30])*fdonMonroScaler;
o[3*s+u] = f[3] + o[7*s+u];
o[7*s+u] += o[11*s+u];
o[11*s+u] += o[15*s+u];
o[15*s+u] += f[19];
o[16*s+u] = f[16] * fdonMonroScaler;
o[19*s+u] = f[19] + o[23*s+u];
o[23*s+u] += f[27];
o[27*s+u] = f[27] + f[31];
o[32+u] = (f[1] + o[3*s+u])*fdonMonroScaler;
o[3*s+u] += o[5*s+u]; o[3*s+u]*=fdonMonroScaler;
o[5*s+u] += o[7*s+u]; o[5*s+u]*=fdonMonroScaler;
o[7*s+u] += o[9*s+u]; o[7*s+u]*=fdonMonroScaler;
o[9*s+u] += o[11*s+u]; o[9*s+u]*=fdonMonroScaler;
o[11*s+u] += o[13*s+u]; o[11*s+u]*=fdonMonroScaler;
o[13*s+u] +=o[15*s+u]; o[13*s+u]*=fdonMonroScaler;
o[15*s+u] += f[17]; o[15*s+u]*=fdonMonroScaler;
o[17*s+u] = f[17] + o[19*s+u]; o[17*s+u]*=fdonMonroScaler;
o[19*s+u] += o[21*s+u]; o[19*s+u]*=fdonMonroScaler;
o[21*s+u] += o[23*s+u]; o[21*s+u]*=fdonMonroScaler;
o[23*s+u] += f[25]; o[23*s+u]*=fdonMonroScaler;
o[24*s+u] = f[24]*fdonMonroScaler;
o[25*s+u] = f[25] + o[27*s+u]; o[25*s+u]*=fdonMonroScaler;
o[27*s+u] += f[29]; o[27*s+u]*=fdonMonroScaler;
o[28*s+u] = f[28]* fdonMonroScaler;
o[29*s+u] = (f[29] + f[31])* fdonMonroScaler;
o[30*s+u] = f[30]* fdonMonroScaler;
o[31*s+u] = f[31]* fdonMonroScaler;
//transpose dc & scaling
o[u] = f[0] * fDC_Scaler * fdonMonroScaler;
}//*/
}
#endif // _DCT32x32_KERNEL_H_