Thanks for your reply.
The image processing will be handled in GPU.
If CPU doesn’t do anything, the processing speed can up to 80k.
However, if CPU starts detect feature, the speed is only about 40k.
The code is like below.
The caller function will be called by many CPU thread.
I have already used Async function to overlap the transition and computation.
But it is not work too much for my performance.
void Preprocessing_Caller(unsigned char* pSrc, unsigned char* pDst, int iWidth, int iHeight, int filterWidth, int filterHeight, float* factor, bool bShading, bool bFilter, bool bSharpen, int nChannels){
cudaStream_t cudaStream;
cudaStreamCreate(&cudaStream);
dim3 block(BLOCK_X, BLOCK_Y);
dim3 grid((iWidth*nChannels + block.x - 1)/block.x, (iHeight + block.y - 1)/ block.y);
unsigned char* d_pSrc = NULL;
unsigned char* d_pDst = NULL;
unsigned char* d_tmp = NULL;
float* d_factor = NULL;
size_t gpu_image_pitch = 0;
cudaResourceDesc resDesc;
cudaTextureDesc texSrcDesc;
cudaTextureObject_t texSrc=0;
cudaMallocPitch<unsigned char>(&d_pSrc,&gpu_image_pitch,iWidth*nChannels,iHeight);
cudaMemcpy2DAsync(d_pSrc,gpu_image_pitch,pSrc,iWidth*nChannels,iWidth*nChannels,iHeight,cudaMemcpyHostToDevice, cudaStream);
if(bShading){
cudaMalloc((void**)&d_factor,nChannels*iWidth*sizeof(float));
cudaMemcpyAsync(d_factor, factor,nChannels*iWidth*sizeof(float),cudaMemcpyHostToDevice, cudaStream);
if(nChannels == 1){
ShadingKernel<<<(iWidth + SHADING_THREAD - 1)/SHADING_THREAD, SHADING_THREAD, 0, cudaStream>>>(d_pSrc, d_pSrc, iWidth, iHeight, gpu_image_pitch, d_factor);
}else if(nChannels == 3){
ShadingKernel_3channels<<<(iWidth*nChannels + SHADING_THREAD - 1)/SHADING_THREAD, SHADING_THREAD, 0, cudaStream>>>(d_pSrc, d_pSrc, iWidth, iHeight, gpu_image_pitch, d_factor);
}
cudaFree(d_factor);
}
if(bFilter){
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
cudaMallocPitch<unsigned char>(&d_tmp,&gpu_image_pitch,iWidth*nChannels,iHeight);
resDesc.res.pitch2D.devPtr = d_pSrc;
resDesc.res.pitch2D.pitchInBytes = gpu_image_pitch;
resDesc.res.pitch2D.height = iHeight;
resDesc.res.pitch2D.width = iWidth*nChannels;
resDesc.res.pitch2D.desc.f = cudaChannelFormatKindUnsigned;
resDesc.res.pitch2D.desc.x = 8;
memset(&texSrcDesc, 0, sizeof(texSrcDesc));
texSrcDesc.readMode = cudaReadModeElementType;
cudaCreateTextureObject(&texSrc, &resDesc, &texSrcDesc, NULL);
if(nChannels == 1){
MeanFilterKernel<<<(iWidth+MEAN_THREAD-1)/MEAN_THREAD, MEAN_THREAD, 0, cudaStream>>>(texSrc, d_tmp, iWidth, iHeight, gpu_image_pitch, filterWidth, filterHeight, (float)1/(filterWidth*filterHeight));
}else if(nChannels == 3){
MeanFilterKernel_3channels<<<(iWidth*nChannels+MEAN_THREAD-1)/MEAN_THREAD, MEAN_THREAD, 0, cudaStream>>>(texSrc, d_tmp, iWidth, iHeight, gpu_image_pitch, filterWidth, filterHeight, (float)1/(filterWidth*filterHeight));
}
cudaDestroyTextureObject(texSrc);
cudaFree(d_pSrc);
}else{
d_tmp = d_pSrc;
}
if(bSharpen){
cudaMallocPitch<unsigned char>(&d_pDst,&gpu_image_pitch,iWidth*nChannels,iHeight);
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = d_tmp;
resDesc.res.pitch2D.pitchInBytes = gpu_image_pitch;
resDesc.res.pitch2D.height = iHeight;
resDesc.res.pitch2D.width = iWidth*nChannels;
resDesc.res.pitch2D.desc.f = cudaChannelFormatKindUnsigned;
resDesc.res.pitch2D.desc.x = 8;
memset(&texSrcDesc, 0, sizeof(texSrcDesc));
texSrcDesc.readMode = cudaReadModeElementType;
cudaCreateTextureObject(&texSrc, &resDesc, &texSrcDesc, NULL);
if(nChannels == 1){
SharpFilterKernel<<<grid, block, 0, cudaStream>>>(texSrc, d_pDst, iWidth, iHeight, gpu_image_pitch);
}else if(nChannels == 3){
SharpFilterKernel_3channels<<<grid, block, 0, cudaStream>>>(texSrc, d_pDst, iWidth, iHeight, gpu_image_pitch);
}
cudaDestroyTextureObject(texSrc);
cudaFree(d_tmp);
}else{
d_pDst = d_tmp;
}
cudaMemcpy2DAsync(pDst,iWidth*nChannels,d_pDst,gpu_image_pitch,iWidth*nChannels,iHeight,cudaMemcpyDeviceToHost, cudaStream);
cudaStreamDestroy(cudaStream);
cudaFree(d_pDst);
}