After compiling the following code and running the resulting .exe, why is the first execution time always slower than the others? (For N=512 it is slower than for N=1024, and so on…)
Result:
512
0.0888103
1024
0.000553656
2048
0.000540972
4096
0.000565986
8192
0.000561761
16384
0.000560941
32768
0.000612148
65536
0.000697729
131072
0.000837562
262144
0.00144884
//-----------Main.cpp
#include
#include “stopWatch.c”
using namespace std;
extern “C” void reduce(int *, int *, unsigned int);
int main()
{
double GPU;
for(unsigned int z=512; z<=262144; z*=2)
{
//unsigned int N = 512;
int *h_a = new int[z];
for(unsigned int i=0; i<z; i++)
*(h_a+i) = 1;
int *h_sum = new int[z/512];
watchStart();
reduce(h_a, h_sum, z);
watchStop();
GPU = elapsedTime();
cout << *h_sum << endl;
cout << GPU << endl;
delete[] h_a;
delete[] h_sum;
}
return 0;
}
//------------Reduce.cu
#include <stdlib.h>
#include <stdio.h>

__global__ void sum_kernel(int *, int *);

// Host wrapper: sums the N ints in h_a on the GPU and writes the total to
// h_sum[0].
// Preconditions: N is a multiple of 512, and N/512 <= 512 so that the
// second launch can finish the reduction within a single block.
extern "C" void reduce(int *h_a, int *h_sum, unsigned int N)
{
    unsigned int threadSize = 512;
    unsigned int blockSize = N / threadSize;  // blocks in pass 1 == partial sums

    int *d_a = 0, *d_output = 0;
    cudaMalloc((void**)&d_a, N * sizeof(int));
    cudaMalloc((void**)&d_output, blockSize * sizeof(int));
    cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);

    // Pass 1: each block reduces 512 elements into one partial sum.
    sum_kernel<<<blockSize, threadSize>>>(d_a, d_output);
    // Pass 2: a single block reduces the blockSize partial sums in place.
    sum_kernel<<<1, blockSize>>>(d_output, d_output);

    // Blocking memcpy also synchronizes with the kernels, so the caller's
    // wall-clock timing includes kernel completion.
    cudaMemcpy(h_sum, d_output, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(d_a);
    cudaFree(d_output);
}
// Tree reduction within one block:
//   d_output[blockIdx.x] = sum of d_a[blockIdx.x*blockDim.x .. +blockDim.x-1]
// Requires blockDim.x to be a power of two and <= 512 (static shared size).
__global__ void sum_kernel(int *d_a, int *d_output)
{
    __shared__ int share_a[512];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    share_a[threadIdx.x] = d_a[idx];
    __syncthreads();
    // Halve the active range each step. The barrier sits OUTSIDE the
    // divergent branch so every thread of the block reaches it.
    for (int distance = blockDim.x / 2; distance > 0; distance /= 2)
    {
        if (threadIdx.x < distance)
            share_a[threadIdx.x] += share_a[threadIdx.x + distance];
        __syncthreads();
    }
    // Thread 0 publishes the block's result.
    if (threadIdx.x == 0)
        d_output[blockIdx.x] = share_a[0];
}
Thank you very much!