Kernel time goes from 100 ms to 14 ms and I don't know why

I am trying to do an FFT, so I launch the same kernel 26 times (one launch per butterfly stage).

For the first 4 launches:

each thread uses memory and memory+1: takes 100 ms
each thread uses memory and memory+2: takes 100 ms
each thread uses memory and memory+4: takes 100 ms
each thread uses memory and memory+8: takes 100 ms

After that:

each thread uses memory and memory+16: takes 14 ms
each thread uses memory and memory+32: takes 14 ms

It seems that using memory locations that are too close together is bad??

Why?

#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <math.h>
#include "cutil_inline.h"

static unsigned __int64 start_tics;   // unused

// Kernel that executes on the CUDA device: one radix-2 butterfly per
// thread, on the pair (a[z], a[z+in]); ss is not used in this version.
__global__ void square_array(float *a, int N, int in, int ss)
{
    float Cnter = 0.0f;
    int z = 0;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int idy = blockIdx.y * blockDim.y + threadIdx.y;
    int id = idx + 65536 * idy;          // flatten the 2D grid to one index
    if (id < N)
    {
        z = int(id / in) * in + id;      // pairs element z with z + in
        Cnter = a[z] + a[z + in];
        a[z + in] = a[z] - a[z + in];
        a[z] = Cnter;
    }
}

// main routine that executes on the host
int main(void)
{
    float *memoirecpu1, *memoiregraphique1;   // pointers to host & device arrays
    cudaEvent_t start, stop;
    int S = 0;
    int N = 1;
    for (S = 1; S < 27; S = S + 1)
    {
        N = N * 2;                // N = 2^26 elements
    }
    S = S - 1;                    // S = 26 butterfly stages

    size_t size = N * sizeof(float);
    memoirecpu1 = (float *)malloc(size);      // allocate array on host

    // Initialize host array and copy it to the CUDA device
    for (int i = 0; i < N; i++)
    {
        memoirecpu1[i] = (float)i;
    }

    cutilSafeCall( cudaEventCreate(&start) );
    cutilSafeCall( cudaEventCreate(&stop) );
    unsigned int timer;
    cutilCheckError( cutCreateTimer(&timer) );
    cutilCheckError( cutResetTimer(timer) );
    cutilSafeCall( cudaThreadSynchronize() );
    float gpu_time = 0.0f;
    cutilCheckError( cutStartTimer(timer) );
//---------------------------
    cudaMalloc((void **) &memoiregraphique1, size);   // allocate array on device
    cudaMemcpy(memoiregraphique1, memoirecpu1, size, cudaMemcpyHostToDevice);

    int in = 1;
    int ss = 2;
    N = N / 2;                    // one thread per butterfly = half the elements
    for (int bou = 0; bou < S; bou = bou + 1)
    {
        cudaEventRecord(start, 0);
        if (bou < 4) square_array <<< dim3(4096,34,1), dim3(16,32,1) >>> (memoiregraphique1, N, in, ss);  // this config already gains ~10 ms: ~95 ms per launch
        if (bou > 3) square_array <<< dim3(1024,66,1), dim3(64,8,1) >>> (memoiregraphique1, N, in, ss);   // only ~14 ms per launch
        cudaEventRecord(stop, 0);

        // busy-wait on the CPU until the kernel has finished
        unsigned long int counter = 0;
        while (cudaEventQuery(stop) == cudaErrorNotReady)
        {
            counter++;
        }
        cutilSafeCall( cudaEventElapsedTime(&gpu_time, start, stop) );
        printf("time spent executing by the GPU: %.2f\n", gpu_time);
        in = in * 2;
        ss = ss * 2;
    }

    // copy the whole array back (size bytes; N was halved above)
    cudaMemcpy(memoirecpu1, memoiregraphique1, size, cudaMemcpyDeviceToHost);
//------------------------------
    cutilCheckError( cutStopTimer(timer) );

    // have the CPU spin while waiting for the GPU to finish
    unsigned long int counter = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
    {
        counter++;
    }
    cutilSafeCall( cudaEventElapsedTime(&gpu_time, start, stop) );

    // print the CPU and GPU times
    printf("time spent executing by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", cutGetTimerValue(timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

    for (int i = 0; i < 20; i = i + 1)
    {
        printf("%d %f\n", i, memoirecpu1[i]);
    }

    free(memoirecpu1);
    cudaFree(memoiregraphique1);
    return 0;
}
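To see why the jump happens exactly between +8 and +16, here is a small host-side sketch (plain C, an illustration rather than code from the post) that prints which elements the first 16 threads, one half-warp, touch at each stride:

#include <stdio.h>

int main(void)
{
    int strides[2] = {1, 16};
    for (int s = 0; s < 2; s++)
    {
        int in = strides[s];
        printf("in = %d:\n", in);
        for (int id = 0; id < 16; id++)    /* threads of one half-warp */
        {
            int z = (id / in) * in + id;   /* same formula as the kernel */
            printf("  thread %2d -> a[%2d] and a[%2d]\n", id, z, z + in);
        }
    }
    return 0;
}

For in = 1 the half-warp's first reads hit a[0], a[2], ..., a[30], a stride-2 pattern that compute 1.x hardware cannot coalesce; for in = 16 they hit a[0..15] and then a[16..31], two contiguous segments, which matches the drop from 100 ms to 14 ms.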

Well, yes, if you fetch chunks of the array that are too small, it's worse.

This happens because CUDA issues every memory access in warp-sized chunks, so it's best (very basically speaking) to fetch a multiple of the warp size. You can read more about this in the programming guide (chapter 5, section 5.3 exactly).
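Very roughly, the difference looks like this (a minimal sketch with illustrative kernel names, not code from this thread):

__global__ void coalesced_copy(float *dst, const float *src)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    dst[i] = src[i];        // thread k reads src[k]: each half-warp touches one
}                           // contiguous segment, served by a single transaction

__global__ void strided_copy(float *dst, const float *src)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    dst[i] = src[2 * i];    // thread k reads src[2k]: the half-warp spans twice
}                           // the bytes, so compute 1.x hardware serializes it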

Your description is extremely vague.

But there's an effect called "partition camping" which may have an influence on performance. Google it.

OK, I will try to read more.

I still don't understand everything about processors, blocks, and threads.

I will do more tests and research.

If I can do every launch in 14 ms, it's 120x faster than the CPU.

I read a lot and tried some things for partition camping; it didn't work, and I don't understand it all.

Now with

square_array <<< dim3(2048,513,1), dim3(32,1,1) >>> (memoiregraphique1, N, in, ss);

the first 4 launches take 60 ms and the others 11 ms.

Why is it better with only 32 threads per block than with 512???

I think this has got to do with global memory coalescing. The hardware coalesces the accesses of each half-warp of 16 threads into one transaction. So an efficient pattern would be thread 0 accessing "memory" and "memory + 16", thread 1 accessing "memory + 1" and "memory + 17", and so on.
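In kernel form, that suggested pattern would be something like this (a sketch of the butterfly from earlier in the thread; butterfly16 is an illustrative name):

// Each thread of a half-warp reads two elements 16 apart, so both
// reads form one contiguous 16-element segment per half-warp.
__global__ void butterfly16(float *a)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int base = (i / 16) * 32 + (i % 16);   // thread k -> a[k] and a[k + 16]
    float lo = a[base];                    // threads 0..15 read a[0..15]
    float hi = a[base + 16];               // threads 0..15 read a[16..31]
    a[base]      = lo + hi;
    a[base + 16] = lo - hi;
}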

Yes, I think it's coalescing.

I read this (very good, I think); now I must try it:
http://gpgpu.org/static/sc2007/SC07_CUDA_5_Optimization_Harris.pdf


It WORKS, at 99%. I think I still have a little partition camping now, but I can't find it.

__global__ void square_array(float *a, int N, int in, int ss)
{
    __shared__ float s_data[512];

    int sh = threadIdx.x + 32 * threadIdx.y;                // index inside the 512-element tile
    int id = 512 * blockIdx.x + 1048576 * blockIdx.y + sh;  // global index (2048 blocks per grid row)

    // coalesced load of the tile from global into shared memory
    if (id < N)
    {
        s_data[sh] = a[id];
    }
    __syncthreads();

    float Cnter = 0;
    int z;
    // 256 butterflies per 512-element tile, done in shared memory,
    // where the strided pattern carries no coalescing penalty
    if (sh < 256 && id < N)
    {
        z = int(sh / in) * in + sh;
        Cnter = s_data[z] + s_data[z + in];
        s_data[z + in] = s_data[z] - s_data[z + in];
        s_data[z] = Cnter;
    }
    __syncthreads();

    // coalesced store of the tile back to global memory
    if (id < N)
    {
        a[id] = s_data[sh];
    }
}

with

square_array <<< dim3(2048,65,1), dim3(32,16,1) >>> (memoiregraphique1, N, in, ss);
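One caveat (my reading of the kernel above, not something stated in the thread): a 512-element shared tile can only pair elements up to 256 apart, so stages with in >= 512 still need the original global-memory kernel. A hedged sketch of how the stage loop in main() might split the work, assuming the two kernels are renamed square_array_shared and square_array_global:

int in = 1;
int ss = 2;
for (int bou = 0; bou < S; bou = bou + 1)
{
    if (in <= 256)   // pairs (z, z + in) fit inside one 512-element tile
        square_array_shared <<< dim3(2048,65,1), dim3(32,16,1) >>> (memoiregraphique1, N, in, ss);
    else             // pairs span tiles: fall back to the global kernel
        square_array_global <<< dim3(1024,66,1), dim3(64,8,1) >>> (memoiregraphique1, N, in, ss);
    in = in * 2;
    ss = ss * 2;
}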