Hi Guys,
I am facing a strange bug in my program. I recently upgraded my graphics card from an 8400 GS to a GTS 450. On the old card the timing of the following code was around 0.6 ms; on the new card it shoots up to 2.2 ms. The code is as follows:
void gpu_blur(gpu_context_t *ctx, int KERNEL_RADIUS)
{
    assert(KERNEL_RADIUS);
    gpu_error_t error = GPU_OK;
    struct timeval startTime;
    struct timeval endTime;
    float elapsedtime;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    gettimeofday(&startTime, NULL);

    int KERNEL_LENGTH = (2 * KERNEL_RADIUS + 1);
    const int imageW = ctx->width;
    const int imageH = ctx->height;
    float *tempKernel = (float *)malloc(KERNEL_LENGTH * sizeof(float));
    unsigned char *in = ctx->output_buffer_1;

    cudaArray *src;
    cudaChannelFormatDesc floatTex = cudaCreateChannelDesc<unsigned char>();
    cudaMallocArray(&src, &floatTex, imageW, imageH);
    unsigned char *tempOutput;
    cudaMalloc((void **)&tempOutput, imageW * imageH);
    error = checkCudaError();

    ////////////// calculating the Gaussian kernel //////////////
    float sum = 0;
    for (int i = 0; i < KERNEL_LENGTH; i++)
    {
        float dist = (float)(i - KERNEL_RADIUS) / (float)KERNEL_RADIUS;
        tempKernel[i] = expf(-dist * dist / 2);
        sum += tempKernel[i];
    }
    for (int i = 0; i < KERNEL_LENGTH; i++)
        tempKernel[i] /= sum;
    cudaMemcpyToSymbol(Kernel, tempKernel, KERNEL_LENGTH * sizeof(float));
    /////////////////////////////////////////////////////////////

    cudaMemcpyToArray(src, 0, 0, in, imageW * imageH, cudaMemcpyHostToDevice);
    convolutionRowsGPU(tempOutput, src, imageW, imageH, KERNEL_RADIUS, KERNEL_LENGTH, ctx->threadsX, ctx->threadsY);
    if (checkCudaError() == GPU_OK)
    {
        cudaMemcpyToArray(src, 0, 0, tempOutput, imageW * imageH, cudaMemcpyDeviceToDevice);
        convolutionColumnsGPU(tempOutput, src, imageW, imageH, KERNEL_RADIUS, KERNEL_LENGTH, ctx->threadsX, ctx->threadsY);
    }

    gettimeofday(&endTime, NULL); // As soon as I go past this line, the timing shoots up to 2.2 ms; before this line it is 0.3 ms.
    /******** Problem starts here ********/
    cudaMemcpy(in, tempOutput, imageW * imageH, cudaMemcpyDeviceToHost);
    cudaMemcpy(ctx->gpu_buffer_1, tempOutput, imageW * imageH, cudaMemcpyDeviceToDevice);
    error = checkCudaError();

    cudaFree(tempOutput);
    cudaFreeArray(src);
    free(tempKernel);

    double tS = startTime.tv_sec * 1000000 + startTime.tv_usec;
    double tE = endTime.tv_sec * 1000000 + endTime.tv_usec;
    fprintf(stderr, "Smoothing_GPU_1: %lf\n", (tE - tS) / 1000);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedtime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    fprintf(stderr, "Smoothing_GPU: %lf\n", elapsedtime);
}
Each of the kernel calls finishes within 0.3 ms. But as soon as execution reaches the cudaMemcpy line, the timing shoots up to 2.2 ms.
Is there something wrong with my approach, e.g. should I use cudaThreadSynchronize()? Would __syncthreads() work instead of cudaThreadSynchronize()? A minimal sketch of what I mean is shown below.
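To make the second question concrete, here is a small, self-contained sketch of what I mean by putting a synchronization call before the host-side timestamp. This is not my real convolution code: dummyKernel, the buffer size n, and the usecs() helper are placeholders I made up just for illustration, and I am assuming the kernel launch itself returns asynchronously.

#include <cstdio>
#include <sys/time.h>
#include <cuda_runtime.h>

// Placeholder kernel standing in for convolutionRowsGPU / convolutionColumnsGPU.
__global__ void dummyKernel(unsigned char *buf, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        buf[i] = (unsigned char)(buf[i] + 1);
}

// Convert a timeval to microseconds (same arithmetic as in gpu_blur above).
static double usecs(const struct timeval &t)
{
    return t.tv_sec * 1000000.0 + t.tv_usec;
}

int main()
{
    const int n = 640 * 480;  // placeholder image size
    unsigned char *d_buf;
    cudaMalloc((void **)&d_buf, n);

    struct timeval t0, t1, t2;

    gettimeofday(&t0, NULL);
    dummyKernel<<<(n + 255) / 256, 256>>>(d_buf, n);
    gettimeofday(&t1, NULL);   // if the launch is asynchronous, this may only capture launch overhead

    cudaDeviceSynchronize();   // or cudaThreadSynchronize() on older toolkits -- is this the right call here?
    gettimeofday(&t2, NULL);   // by now the kernel should actually have finished

    fprintf(stderr, "without sync: %lf ms\n", (usecs(t1) - usecs(t0)) / 1000.0);
    fprintf(stderr, "with sync   : %lf ms\n", (usecs(t2) - usecs(t0)) / 1000.0);

    cudaFree(d_buf);
    return 0;
}

Is this the kind of synchronization I should be adding before gettimeofday() in gpu_blur, or is __syncthreads() inside the kernel enough?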