Code attached.
Edit: odd, whenever I attach the file it only adds a 0 byte file. I’m including it in a code block below. Apologies for any lack of tab formatting in the resulting version.
#include <stdio.h>
#include <stdlib.h>
// Evaluates a CUDA runtime call exactly once; if it does not return
// cudaSuccess, prints the file, line and CUDA error string to stderr and
// terminates the process with EXIT_FAILURE.
// The argument is parenthesized and the status lives in a macro-private name
// so that an expression mentioning a caller variable called `err` is not
// shadowed by the macro's own local.
# define CUDA_SAFE_CALL( call) do { \
    cudaError_t cuda_safe_call_status_ = (call); \
    if( cudaSuccess != cuda_safe_call_status_) { \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString( cuda_safe_call_status_) ); \
        exit(EXIT_FAILURE); \
    } } while (0)
// Debug-only check: waits for the device to finish outstanding work, then
// reads (and clears) the last CUDA error. On failure it prints errorMessage
// with the file, line and CUDA error string to stderr and exits with
// EXIT_FAILURE. When NDEBUG is defined the macro expands to nothing.
// cudaDeviceSynchronize() is used in place of the deprecated
// cudaThreadSynchronize().
#ifdef NDEBUG
#define CUT_CHECK_ERROR(errorMessage)
#else
# define CUT_CHECK_ERROR(errorMessage) do { \
    cudaDeviceSynchronize(); \
    cudaError_t cut_check_status_ = cudaGetLastError(); \
    if( cudaSuccess != cut_check_status_) { \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( cut_check_status_) );\
        exit(EXIT_FAILURE); \
    } } while (0)
#endif
// Threads per block used by main() for the launch, and the element count of
// the shared-memory array declared inside copy_gmem.
#define BLOCK_SIZE 32
// Buffers shared by the functions in this file: d_odata and d_idata are
// device allocations (cudaMalloc in main), h_idata is the host-side staging
// copy of d_idata (malloc in main).
int *d_odata, *d_idata, *h_idata;
// Benchmark kernel: every block reads one flag from g_idata and either leaves
// immediately or runs a fixed-length accumulation loop on shared memory.
//
//   g_idata  flags indexed by blockIdx.x; a non-zero value means "skip"
//   g_odata  one result per thread, indexed by the flat global thread index
//   work     loop trip count, and also the value added on each iteration
//
// The shared accumulator has BLOCK_SIZE slots indexed by threadIdx.x, so the
// launch must use blockDim.x <= BLOCK_SIZE.
__global__ void copy_gmem(int* g_idata, int* g_odata, int work)
{
    __shared__ int skip_flag;
    __shared__ int accum[BLOCK_SIZE];

    const unsigned int lane = threadIdx.x;
    const unsigned int out_index = lane + blockIdx.x * blockDim.x;

    // Thread 0 fetches the block's flag; the barrier makes it visible to the
    // remaining threads of the block.
    if (lane == 0)
    {
        skip_flag = g_idata[blockIdx.x];
    }
    __syncthreads();

    // Flag set: the whole block quits without touching g_odata.
    if (skip_flag != 0)
    {
        return;
    }

    // Flag clear: add `work` into this thread's shared slot `work` times,
    // then store the total to global memory.
    const unsigned int iterations = work;
    accum[lane] = 0;
    for (unsigned int step = 0; step < iterations; ++step)
    {
        accum[lane] += work;
    }
    g_odata[out_index] = accum[lane];
}
// Times copy_gmem on the global d_idata/d_odata buffers and prints
// "<work> <average time per launch>" to stdout (cudaEventElapsedTime reports
// milliseconds).
//
//   grid, threads  launch configuration passed straight to the kernel
//   work           forwarded to the kernel as its `work` argument
//
// Any CUDA failure, including a rejected kernel launch, terminates the
// process through CUDA_SAFE_CALL.
void do_bmark(dim3 grid, dim3 threads, int work)
{
    const int timed_launches = 100;

    // One launch outside the timed region (presumably a warm-up). Kernel
    // launches return no status, so a bad configuration is only visible
    // through cudaGetLastError().
    copy_gmem<<< grid, threads >>>(d_idata, d_odata, work);
    CUDA_SAFE_CALL( cudaGetLastError() );

    cudaEvent_t start, end;
    CUDA_SAFE_CALL( cudaEventCreate(&start) );
    CUDA_SAFE_CALL( cudaEventCreate(&end) );

    CUDA_SAFE_CALL( cudaEventRecord(start, 0) );
    for (int i = 0; i < timed_launches; ++i)
    {
        copy_gmem<<< grid, threads >>>(d_idata, d_odata, work);
        // Host-side status read only; no device synchronization is added
        // inside the timed region.
        CUDA_SAFE_CALL( cudaGetLastError() );
    }
    CUDA_SAFE_CALL( cudaEventRecord(end, 0) );
    // Block until the end event has been reached so the elapsed time is valid.
    CUDA_SAFE_CALL( cudaEventSynchronize(end) );

    float runTime;
    CUDA_SAFE_CALL( cudaEventElapsedTime(&runTime, start, end) );
    runTime /= float(timed_launches);
    printf("%d %f\n", work, runTime);

    CUDA_SAFE_CALL( cudaEventDestroy(start) );
    CUDA_SAFE_CALL( cudaEventDestroy(end) );
}
// Returns a pseudo-random value in [0, bound) for bound > 0, built on rand().
// RAND_MAX is only guaranteed to be at least 32767, so three 15-bit draws are
// stitched together to cover bounds larger than RAND_MAX.
static unsigned int rand_below(unsigned int bound)
{
    unsigned int bits = 0;
    for (int draw = 0; draw < 3; ++draw)
    {
        bits = (bits << 15) ^ ((unsigned int)rand() & 0x7FFFu);
    }
    return bits % bound;
}

// Fills h_idata[0..len) with zeros except for `skips` randomly chosen,
// distinct positions that are set to 1, then copies the array to d_idata.
//
//   len    number of ints in h_idata / d_idata; nothing is done when <= 0
//   skips  number of entries to set to 1; clamped to the range [0, len]
//
// Uses one pass of selection sampling: entry i is chosen with probability
// (flags still needed) / (entries still unvisited). This always terminates
// and sets exactly `skips` flags, unlike retrying random indices until an
// unset one is hit, which cannot finish when skips > len or when rand()
// cannot reach enough distinct indices.
void setup_skips(int len, int skips)
{
    if (len <= 0)
    {
        return;
    }
    if (skips < 0)
    {
        skips = 0;
    }
    if (skips > len)
    {
        skips = len;
    }

    int needed = skips;
    for (int i = 0; i < len; i++)
    {
        const unsigned int remaining = (unsigned int)(len - i);
        // When needed == remaining the comparison is always true, which
        // guarantees the full count is reached by the end of the array.
        if (needed > 0 && rand_below(remaining) < (unsigned int)needed)
        {
            h_idata[i] = 1;
            needed--;
        }
        else
        {
            h_idata[i] = 0;
        }
    }

    CUDA_SAFE_CALL( cudaMemcpy(d_idata, h_idata, sizeof(int)*len, cudaMemcpyHostToDevice) );
}
// Allocates the device and host buffers, then runs the benchmark three times:
// with no skip flags set, with half of the flag array set, and with 90% of it
// set. Returns 0 on success; returns EXIT_FAILURE if the host buffer cannot be
// allocated (CUDA failures exit inside CUDA_SAFE_CALL).
int main()
{
    const int num_threads = BLOCK_SIZE;
    const int len = BLOCK_SIZE*20000;
    const int work = 30000;

    CUDA_SAFE_CALL( cudaMalloc((void**)&d_idata, sizeof(int)*(len)) );
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_odata, sizeof(int)*(len)) );

    h_idata = (int *)malloc(sizeof(int) * len);
    if (h_idata == NULL)
    {
        fprintf(stderr, "Failed to allocate host buffer of %d ints.\n", len);
        cudaFree(d_idata);
        cudaFree(d_odata);
        return EXIT_FAILURE;
    }

    // One thread per element; len is an exact multiple of num_threads.
    dim3 threads(num_threads, 1, 1);
    dim3 grid(len/num_threads, 1, 1);

    printf("no skips\n");
    setup_skips(len, 0);
    do_bmark(grid, threads, work);

    printf("\n50%% skips\n");
    setup_skips(len, (int)(len*0.5));
    do_bmark(grid, threads, work);

    printf("\n90%% skips\n");
    setup_skips(len, (int)(len*0.9));
    do_bmark(grid, threads, work);

    // Release everything that was allocated above.
    CUDA_SAFE_CALL( cudaFree(d_idata) );
    CUDA_SAFE_CALL( cudaFree(d_odata) );
    d_idata = NULL;
    d_odata = NULL;
    free(h_idata);
    h_idata = NULL;

    return 0;
}