Kernel time goes from 100 ms to 14 ms and I don't know why

I am trying to do an FFT, so I launch the same kernel 26 times (one launch per butterfly stage).

For the first 4 launches:

each thread uses memory and memory+1: takes 100 ms
each thread uses memory and memory+2: takes 100 ms
each thread uses memory and memory+4: takes 100 ms
each thread uses memory and memory+8: takes 100 ms

After that:

each thread uses memory and memory+16: takes 14 ms
each thread uses memory and memory+32: takes 14 ms

It seems that using memory locations that are too close together is bad??

Why?

#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <math.h>
#include "cutil_inline.h"

static unsigned __int64 start_tics;   // unused

// Kernel that executes on the CUDA device: one radix-2 butterfly per
// thread, on the pair (a[z], a[z+in]); ss is not used in this version.
__global__ void square_array(float *a, int N, int in, int ss)
{
    float Cnter = 0.0f;
    int z = 0;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    int id = idx + 65536 * idy;          // flatten the 2D grid to one index
    if (id < N)
    {
        z = int(id / in) * in + id;      // pairs element z with z + in
        Cnter = a[z] + a[z + in];
        a[z + in] = a[z] - a[z + in];
        a[z] = Cnter;
    }
}

// main routine that executes on the host
int main(void)
{
    float *memoirecpu1, *memoiregraphique1;   // pointers to host & device arrays
    cudaEvent_t start, stop;
    int S = 0;
    int N = 1;
    for (S = 1; S < 27; S = S + 1)
    {
        N = N * 2;                // N = 2^26 elements
    }
    S = S - 1;                    // S = 26 butterfly stages

    size_t size = N * sizeof(float);
    memoirecpu1 = (float *)malloc(size);      // allocate array on host

    // Initialize host array and copy it to the CUDA device
    for (int i = 0; i < N; i++)
    {
        memoirecpu1[i] = (float)i;
    }

    cutilSafeCall( cudaEventCreate(&start) );
    cutilSafeCall( cudaEventCreate(&stop) );
    unsigned int timer;
    cutilCheckError( cutCreateTimer(&timer) );
    cutilCheckError( cutResetTimer(timer) );
    cutilSafeCall( cudaThreadSynchronize() );
    float gpu_time = 0.0f;
    cutilCheckError( cutStartTimer(timer) );
//---------------------------
    cudaMalloc((void **) &memoiregraphique1, size);   // allocate array on device
    cudaMemcpy(memoiregraphique1, memoirecpu1, size, cudaMemcpyHostToDevice);

    int in = 1;
    int ss = 2;
    N = N / 2;                    // one thread per butterfly = half the elements
    for (int bou = 0; bou < S; bou = bou + 1)
    {
        cudaEventRecord(start, 0);
        if (bou < 4) square_array <<< dim3(4096,34,1), dim3(16,32,1) >>> (memoiregraphique1, N, in, ss);  // this config already gains ~10 ms: ~95 ms per launch
        if (bou > 3) square_array <<< dim3(1024,66,1), dim3(64,8,1) >>> (memoiregraphique1, N, in, ss);   // only ~14 ms per launch
        cudaEventRecord(stop, 0);

        // busy-wait on the CPU until the kernel has finished
        unsigned long int counter = 0;
        while (cudaEventQuery(stop) == cudaErrorNotReady)
        {
            counter++;
        }
        cutilSafeCall( cudaEventElapsedTime(&gpu_time, start, stop) );
        printf("time spent executing by the GPU: %.2f\n", gpu_time);
        in = in * 2;
        ss = ss * 2;
    }

    // copy the whole array back (size bytes; N was halved above)
    cudaMemcpy(memoirecpu1, memoiregraphique1, size, cudaMemcpyDeviceToHost);
//------------------------------
    cutilCheckError( cutStopTimer(timer) );

    // have the CPU spin while waiting for the GPU to finish
    unsigned long int counter = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
    {
        counter++;
    }
    cutilSafeCall( cudaEventElapsedTime(&gpu_time, start, stop) );

    // print the CPU and GPU times
    printf("time spent executing by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", cutGetTimerValue(timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    for (int i = 0; i < 20; i = i + 1)
    {
        printf("%d %f\n", i, memoirecpu1[i]);
    }

    free(memoirecpu1);
    cudaFree(memoiregraphique1);
    return 0;
}
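To see why the jump happens exactly between +8 and +16, here is a small host-side sketch (plain C, an illustration rather than code from the post) that prints which elements the first 16 threads, one half-warp, touch at each stride:

#include <stdio.h>

int main(void)
{
    int strides[2] = {1, 16};
    for (int s = 0; s < 2; s++)
    {
        int in = strides[s];
        printf("in = %d:\n", in);
        for (int id = 0; id < 16; id++)    /* threads of one half-warp */
        {
            int z = (id / in) * in + id;   /* same formula as the kernel */
            printf("  thread %2d -> a[%2d] and a[%2d]\n", id, z, z + in);
        }
    }
    return 0;
}

For in = 1 the half-warp's first reads hit a[0], a[2], ..., a[30], a stride-2 pattern that compute 1.x hardware cannot coalesce; for in = 16 they hit a[0..15] and then a[16..31], two contiguous segments, which matches the drop from 100 ms to 14 ms.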

Well, yes, if you fetch chunks of the array that are too small, it's worse.

This happens because CUDA issues every memory access in warp-sized chunks, so it's best (very basically speaking) to fetch a multiple of the warp size. You can read more about this in the programming guide (chapter 5, section 5.3 exactly).
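Very roughly, the difference looks like this (a minimal sketch with illustrative kernel names, not code from this thread):

__global__ void coalesced_copy(float *dst, const float *src)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    dst[i] = src[i];        // thread k reads src[k]: each half-warp touches one
}                           // contiguous segment, served by a single transaction

__global__ void strided_copy(float *dst, const float *src)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    dst[i] = src[2 * i];    // thread k reads src[2k]: the half-warp spans twice
}                           // the bytes, so compute 1.x hardware serializes it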

Your description is extremely vague.

But there's an effect called "partition camping" which may have an influence on performance. Google it.

OK, I will try to read more.

I still don't understand everything about processors, blocks, and threads.

I will do more tests and research.

If I can do every launch in 14 ms, it's 120x faster than the CPU.

I read a lot and tried some things for partition camping; it didn't work, and I don't understand it all.

Now with

square_array <<< dim3(2048,513,1), dim3(32,1,1) >>> (memoiregraphique1, N, in, ss);

the first 4 launches take 60 ms and the others 11 ms.

Why is it better with only 32 threads per block than with 512???

I think this has got to do with global memory coalescing. The hardware coalesces the accesses of each half-warp of 16 threads into one transaction. So an efficient pattern would be thread 0 accessing "memory" and "memory + 16", thread 1 accessing "memory + 1" and "memory + 17", and so on.
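In kernel form, that suggested pattern would be something like this (a sketch of the butterfly from earlier in the thread; butterfly16 is an illustrative name):

// Each thread of a half-warp reads two elements 16 apart, so both
// reads form one contiguous 16-element segment per half-warp.
__global__ void butterfly16(float *a)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int base = (i / 16) * 32 + (i % 16);   // thread k -> a[k] and a[k + 16]
    float lo = a[base];                    // threads 0..15 read a[0..15]
    float hi = a[base + 16];               // threads 0..15 read a[16..31]
    a[base]      = lo + hi;
    a[base + 16] = lo - hi;
}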

Yes, I think it's coalescing.

I read this (very good, I think); now I must try it:
http://gpgpu.org/static/sc2007/SC07_CUDA_5_Optimization_Harris.pdf


It WORKS, at 99%. I think I still have a little partition camping now, but I can't find it.

__global__ void square_array(float *a, int N, int in, int ss)
{
    __shared__ float s_data[512];

    int sh = threadIdx.x + 32 * threadIdx.y;                // index inside the 512-element tile
    int id = 512 * blockIdx.x + 1048576 * blockIdx.y + sh;  // global index (2048 blocks per grid row)

    // coalesced load of the tile from global into shared memory
    if (id < N)
    {
        s_data[sh] = a[id];
    }
    __syncthreads();

    float Cnter = 0;
    int z;
    // 256 butterflies per 512-element tile, done in shared memory,
    // where the strided pattern carries no coalescing penalty
    if (sh < 256 && id < N)
    {
        z = int(sh / in) * in + sh;
        Cnter = s_data[z] + s_data[z + in];
        s_data[z + in] = s_data[z] - s_data[z + in];
        s_data[z] = Cnter;
    }
    __syncthreads();

    // coalesced store of the tile back to global memory
    if (id < N)
    {
        a[id] = s_data[sh];
    }
}

with

square_array <<< dim3(2048,65,1), dim3(32,16,1) >>> (memoiregraphique1, N, in, ss);
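One caveat (my reading of the kernel above, not something stated in the thread): a 512-element shared tile can only pair elements up to 256 apart, so stages with in >= 512 still need the original global-memory kernel. A hedged sketch of how the stage loop in main() might split the work, assuming the two kernels are renamed square_array_shared and square_array_global:

int in = 1;
int ss = 2;
for (int bou = 0; bou < S; bou = bou + 1)
{
    if (in <= 256)   // pairs (z, z + in) fit inside one 512-element tile
        square_array_shared <<< dim3(2048,65,1), dim3(32,16,1) >>> (memoiregraphique1, N, in, ss);
    else             // pairs span tiles: fall back to the global kernel
        square_array_global <<< dim3(1024,66,1), dim3(64,8,1) >>> (memoiregraphique1, N, in, ss);
    in = in * 2;
    ss = ss * 2;
}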