Sorry, here is some more information.
Yes, my measurement includes the data copy. I will try measuring without it.
The first kernel is a reduction based on Mark Harris's algorithm (Reduction #6: Completely Unrolled).
The second kernel is just an access to a texture in linear filter mode, to perform a bilinear interpolation on a picture.
Here is the code of the 1st kernel :
[codebox]
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
/******************************************************
******************************************************/
/**
 * Block-wide tree reduction (sum) of the first blockSize elements of sdata.
 *
 * Must be called by every thread of the block: each step ends with
 * __syncthreads(), and the conditions guarding those barriers depend only on
 * blockSize, which therefore has to be the same value for all threads.
 *
 * The original version dropped the barriers once tid < 32 and relied on the
 * threads of a warp running in lockstep. Without a barrier (or volatile
 * accesses) the compiler may keep partial sums in registers, and newer
 * hardware does not guarantee lockstep execution, so every step is now
 * followed by a barrier. Each step is also guarded by tid < stride, so no
 * element at or beyond blockSize is ever read.
 *
 * @param blockSize number of elements to reduce; expected to be a power of
 *                  two no larger than 1024 (other values drop elements)
 * @param sdata     array of at least blockSize floats shared by the block
 * @param tid       index of the calling thread within the block
 *
 * On return sdata[0] holds the sum; the other elements hold partial sums.
 */
__device__ void reduction(unsigned int blockSize, float *sdata, int tid)
{
    if (blockSize >= 1024) {
        if (tid < 512)
            sdata[tid] += sdata[tid + 512];
        __syncthreads();
    }
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    if (blockSize >= 64) {
        if (tid < 32)
            sdata[tid] += sdata[tid + 32];
        __syncthreads();
    }
    if (blockSize >= 32) {
        if (tid < 16)
            sdata[tid] += sdata[tid + 16];
        __syncthreads();
    }
    if (blockSize >= 16) {
        if (tid < 8)
            sdata[tid] += sdata[tid + 8];
        __syncthreads();
    }
    if (blockSize >= 8) {
        if (tid < 4)
            sdata[tid] += sdata[tid + 4];
        __syncthreads();
    }
    if (blockSize >= 4) {
        if (tid < 2)
            sdata[tid] += sdata[tid + 2];
        __syncthreads();
    }
    if (blockSize >= 2) {
        if (tid < 1)
            sdata[tid] += sdata[tid + 1];
        // Final barrier so that any thread, not only thread 0, may read sdata[0].
        __syncthreads();
    }
}
/****************************************************
 * sum - per-block partial sums of a float array
 *
 * Each block adds 2*blocksize consecutive input floats and writes one
 * partial sum to d_out[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D blocks: only threadIdx.x and blockIdx.x are used
 *   - blocksize must equal the number of threads per block and satisfy
 *     the constraints of reduction()
 *   - dynamic shared memory: blocksize * sizeof(float) bytes
 *   - d_in must hold at least gridDim.x * 2 * blocksize floats; the kernel
 *     receives no element count and performs no bounds check
 *   - d_out must hold at least gridDim.x floats
 ***************************************************/
__global__ void sum(const float *d_in, float *d_out, unsigned int blocksize)
{
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * (blocksize * 2) + tid;

    // First addition is done while loading, so each thread consumes two inputs.
    sdata[tid] = d_in[i] + d_in[i + blocksize];
    __syncthreads();

    reduction(blocksize, sdata, tid);

    if (tid == 0)
        d_out[blockIdx.x] = sdata[0];
}
/************************************************************
*********/
/**
 * Sums a device array: launches the sum kernel to produce one partial sum
 * per block, copies the partial sums to the host and adds them in double
 * precision.
 *
 * @param dimGrid   grid dimensions; dimGrid.x partial sums are read back
 * @param dimBlock  block dimensions; dimBlock.x * dimBlock.y is passed to the
 *                  kernel as its block size (the kernel itself only indexes
 *                  with threadIdx.x, so 1D blocks are expected)
 * @param smemSize  dynamic shared memory per block, in bytes
 * @param d_in      device input array
 * @param d_out     device buffer receiving the per-block partial sums
 * @param h_out     host buffer receiving a copy of d_out
 * @param size      number of bytes copied from d_out to h_out; must cover at
 *                  least dimGrid.x floats
 * @return the sum of the dimGrid.x partial sums
 *
 * Prints a message and exits the process if the launch or the copy fails.
 */
double somme(dim3 dimGrid, dim3 dimBlock, int smemSize, float* d_in, float* d_out, float* h_out, size_t size) {
    int threads = dimBlock.x * dimBlock.y;

    sum<<< dimGrid, dimBlock, smemSize >>>(d_in, d_out, threads);
    cudaError_t err = cudaGetLastError();   // reports invalid launch configurations
    if (err != cudaSuccess) {
        fprintf(stderr, "sum kernel launch failed: %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // cudaMemcpy is blocking and ordered after the kernel on the default
    // stream, so no explicit synchronization is needed before or after it.
    // It also reports errors raised while the kernel was executing.
    err = cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "sum kernel or result copy failed: %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Finish the reduction on the host, accumulating in double precision.
    double total = 0;
    for (unsigned int i = 0; i < dimGrid.x; i++)
        total += (double)h_out[i];
    return total;
}
/************************************************************
*********/
/**
 * Reads the first `count` bytes of a binary file and stores each byte,
 * converted to float, in tab[0 .. count-1].
 *
 * @param tab       destination array with room for at least count floats
 * @param count     number of bytes to read
 * @param fileName  path of the file to read
 *
 * Terminates the process with exit code 1 if the file cannot be opened,
 * 2 if the temporary buffer cannot be allocated and 3 if fewer than count
 * bytes could be read.
 *
 * NOTE(review): bytes are converted through plain `char`, whose signedness is
 * implementation-defined; where it is signed, bytes above 127 become negative
 * floats. Confirm whether the data should be read as unsigned char.
 */
void readRaw(float *tab, size_t count, const char *fileName) {
    FILE *pFile = fopen(fileName, "rb");
    if (pFile == NULL) { fputs("File error", stderr); exit(1); }

    // Nothing to read: avoid malloc(0), which may legitimately return NULL.
    if (count == 0) {
        fclose(pFile);
        return;
    }

    // Temporary buffer for the raw bytes.
    char *buffer = (char*) malloc(sizeof(char) * count);
    if (buffer == NULL) { fputs("Memory error", stderr); exit(2); }

    size_t result = fread(buffer, 1, count, pFile);
    fclose(pFile);
    if (result != count) { fputs("Reading error", stderr); exit(3); }

    // Widen every byte to a float.
    for (size_t i = 0; i < count; i++) {
        tab[i] = (float)buffer[i];
    }
    free(buffer);
}
/************************************************************
**********/
/**
 * Benchmark driver for the reduction: loads a 512x512 raw image, uploads it
 * to the device and times 10000 calls to somme().
 *
 * The timed region covers, for each call, the kernel launch, the copy of the
 * partial sums back to the host and the final summation on the host.
 */
int main(int argc, char** argv) {
    const unsigned int imageSize = 512 * 512;         // number of input floats
    const unsigned int threadsPerBlock = 128;
    // Every block consumes 2 * threadsPerBlock inputs.
    const unsigned int numBlocks = imageSize / threadsPerBlock / 2;
    const size_t dataBytes = imageSize * sizeof(float);
    const size_t resultBytes = numBlocks * sizeof(float);
    const int iterations = 10000;

    float *h_data, *d_result = NULL, *d_data = NULL, *h_result;
    unsigned int hTimer;
    cutCreateTimer(&hTimer);

    h_result = (float *)malloc(resultBytes);
    h_data = (float *)malloc(dataBytes);
    if (h_result == NULL || h_data == NULL) { fputs("Memory error", stderr); exit(2); }
    readRaw(h_data, imageSize, "../image_raw_cuda/imageREF512x512_37deg_150.raw");

    // Device setup; stop at the first failing call.
    cudaError_t err = cudaMalloc((void **)&d_result, resultBytes);
    if (err == cudaSuccess)
        err = cudaMalloc((void **)&d_data, dataBytes);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_data, h_data, dataBytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA setup failed: %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    dim3 blockSizeEXT(threadsPerBlock);
    dim3 gridSizeEXT(numBlocks);
    int smemSizeRED = threadsPerBlock * sizeof(float);

    cutResetTimer(hTimer);
    cutStartTimer(hTimer);
    for (int i = 0; i < iterations; i++)
        somme(gridSizeEXT, blockSizeEXT, smemSizeRED, d_data, d_result, h_result, resultBytes);
    cutStopTimer(hTimer);

    // A failed run would make the reported time meaningless.
    err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "CUDA error during benchmark: %s\n", cudaGetErrorString(err));

    double gpuTime = cutGetTimerValue(hTimer);
    printf("Temps de calcul : %fmsec\n", gpuTime/iterations);

    cudaFree(d_data);
    cudaFree(d_result);
    free(h_data);
    free(h_result);
    cutDeleteTimer(hTimer);

    system("PAUSE");
    return 0;
}[/codebox]
2nd Kernel :
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
// 24-bit integer multiply intrinsic, used for thread index computation.
#define IMUL(a,b ) __mul24(a,b )
// Declaration of a 2D float texture reference; fetches return the element
// type (float) unchanged.
texture<float, 2, cudaReadModeElementType> dataTest_tex;
// Host-side accessor: returns the texture reference so that main() can
// configure it and bind it to an array.
texture<float, 2, cudaReadModeElementType> &getTexture() { return dataTest_tex; }
// Device-side accessor: returns the same texture reference for use in tex2D().
static inline __device__ texture<float, 2, cudaReadModeElementType> &getDeviceTexture() { return dataTest_tex; }
/**
 * Resamples the bound 2D texture through an affine coordinate transform.
 *
 * Each thread handles one output pixel (ix, iy), computes the source position
 *     xp = ax*ix + bx*iy + cx
 *     yp = ay*ix + by*iy + cy
 * and stores tex2D(texture, xp, yp) in d_out[iy*dimOut + ix]. Any
 * interpolation is done by the texture unit according to the filter mode
 * configured on the host.
 *
 * Launch layout: 2D grid of 2D blocks covering at least dimOut x dimOut
 * threads; threads outside that range do nothing.
 *
 * @param d_out   output image, dimOut * dimOut floats, row-major
 * @param dimIn   unused by the kernel
 * @param dimOut  width and height of the output image
 * @param ax,ay   coefficients applied to ix
 * @param bx,by   coefficients applied to iy
 * @param cx,cy   constant offsets
 *
 * NOTE(review): with unnormalized coordinates and linear filtering, texel
 * centers usually sit at half-integer coordinates; confirm whether a +0.5f
 * offset is wanted on xp and yp.
 */
__global__ void bilinear(float *d_out, int dimIn, int dimOut, float ax, float ay, float bx, float by, float cx, float cy) {
    int ix = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
    int iy = IMUL(blockDim.y, blockIdx.y) + threadIdx.y;

    // Guard against grids that overshoot the output image.
    if (ix >= dimOut || iy >= dimOut)
        return;

    float xp = ax*ix + bx*iy + cx;
    float yp = ay*ix + by*iy + cy;
    d_out[iy*dimOut + ix] = tex2D(getDeviceTexture(), xp, yp);
}
/************************************************************
*********/
/**
 * Reads the first `count` bytes of a binary file and stores each byte,
 * converted to float, in tab[0 .. count-1].
 *
 * @param tab       destination array with room for at least count floats
 * @param count     number of bytes to read
 * @param fileName  path of the file to read
 *
 * Terminates the process with exit code 1 if the file cannot be opened,
 * 2 if the temporary buffer cannot be allocated and 3 if fewer than count
 * bytes could be read.
 *
 * NOTE(review): bytes are converted through plain `char`, whose signedness is
 * implementation-defined; where it is signed, bytes above 127 become negative
 * floats. Confirm whether the data should be read as unsigned char.
 */
void readRaw(float *tab, size_t count, const char *fileName) {
    FILE *pFile = fopen(fileName, "rb");
    if (pFile == NULL) { fputs("File error", stderr); exit(1); }

    // Nothing to read: avoid malloc(0), which may legitimately return NULL.
    if (count == 0) {
        fclose(pFile);
        return;
    }

    // Temporary buffer for the raw bytes.
    char *buffer = (char*) malloc(sizeof(char) * count);
    if (buffer == NULL) { fputs("Memory error", stderr); exit(2); }

    size_t result = fread(buffer, 1, count, pFile);
    fclose(pFile);
    if (result != count) { fputs("Reading error", stderr); exit(3); }

    // Widen every byte to a float.
    for (size_t i = 0; i < count; i++) {
        tab[i] = (float)buffer[i];
    }
    free(buffer);
}
/************************************************************
**********/
/**
 * Benchmark driver for the texture kernel: loads a 1024x1024 raw image into
 * a CUDA array, binds it to the texture with linear filtering and
 * unnormalized coordinates, then times 10000 launches of bilinear() that
 * each produce a 512x512 output image.
 *
 * Every launch is followed by a synchronization, so the reported value is the
 * average time of one launch including its completion.
 */
int main(int argc, char** argv) {
    const int inDim = 1024;          // input image is inDim x inDim
    const int outDim = 512;          // output image is outDim x outDim
    const int tile = 16;             // threads per block along each axis
    const size_t inBytes = (size_t)inDim * inDim * sizeof(float);
    const size_t outBytes = (size_t)outDim * outDim * sizeof(float);
    const int iterations = 10000;

    float *h_data, *d_result = NULL;
    cudaArray *d_data = NULL;
    unsigned int hTimer;
    cutCreateTimer(&hTimer);

    h_data = (float *)malloc(inBytes);
    if (h_data == NULL) { fputs("Memory error", stderr); exit(2); }
    readRaw(h_data, (inDim*inDim), "../image_raw_cuda/imageTEST1024x1024.raw");

    // Device setup; stop at the first failing call.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaError_t err = cudaMalloc((void **)&d_result, outBytes);
    if (err == cudaSuccess)
        err = cudaMallocArray(&d_data, &channelDesc, inDim, inDim);
    if (err == cudaSuccess)
        err = cudaMemcpyToArray(d_data, 0, 0, (void*)h_data, inBytes, cudaMemcpyHostToDevice);

    texture<float, 2, cudaReadModeElementType> &myTexture = getTexture();
    myTexture.filterMode = cudaFilterModeLinear; // built-in interpolation
    myTexture.normalized = false;
    if (err == cudaSuccess)
        err = cudaBindTextureToArray(myTexture, d_data, channelDesc);
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA setup failed: %s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    dim3 blockSizeEXT(tile, tile);
    dim3 gridSizeEXT(outDim/tile, outDim/tile);

    cutResetTimer(hTimer);
    cutStartTimer(hTimer);
    for (int i = 0; i < iterations && err == cudaSuccess; i++)
    {
        bilinear<<< gridSizeEXT, blockSizeEXT >>>(d_result, outDim, outDim, 0.1f, 0.1f, 0.1f, 0.1f, 256, 256);
        err = cudaThreadSynchronize();
    }
    cutStopTimer(hTimer);

    // Launch-configuration errors are only visible through cudaGetLastError.
    if (err == cudaSuccess)
        err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "CUDA error during benchmark: %s\n", cudaGetErrorString(err));

    double gpuTime = cutGetTimerValue(hTimer);
    printf("Temps de calcul : %fmsec\n", gpuTime/iterations);

    cudaUnbindTexture(myTexture);
    cudaFreeArray(d_data);
    cudaFree(d_result);
    free(h_data);
    cutDeleteTimer(hTimer);

    system("PAUSE");
    return 0;
}[/codebox]
I can only say that the PC with the GTX 285 has a better processor (Core 2 Quad Q9550) and faster RAM than the PC with the GeForce 8. The Mac has a better processor (dual quad-core Intel Xeon) and more RAM (8 GB) than the PC with the GTX 285.