CUDA code to remove pitch from image

Hi,

This CUDA code is broke to remove pitch (dStep) from an image.

Any ideas, thanks!

Host code:

const uint64_t writesize = TextureOpt.WidthOut * TextureOpt.HeightOut * byteperpixel;
cudaMallocHost((void**)&src_imf4, writesize * sizeof(float));

const uint64_t pixels = (uint64_t)TextureOpt.WidthIn * TextureOpt.HeightIn * byteperpixel;
cudaMallocHost((void**)&cropbuff, pixels * sizeof(float));

// fills cropbuff with source image

// resizes image

IppiSize srcSizeM = { pResize->m_SrcW, pResize->m_SrcH };
IppiSize dstSizeM = { pResize->m_DstW, pResize->m_DstH };

pResize->m_srcSize = srcSizeM;
pResize->m_dstSize = dstSizeM;

pSrc = ippiMalloc_32f_C4(pResize->m_SrcW, pResize->m_SrcH, &srcStepM);
ippiCopy_32f_C4R(cropbuff, pResize->m_SrcW * pResize->m_nChannels * sizeof(Ipp32f), pSrc, srcStepM, pResize->m_srcSize);
pDst = ippiMalloc_32f_C4(pResize->m_DstW, pResize->m_DstH, &dstStepM);

Removing Pitch from resized image Works on Host:

    findex = src_imf4;
    const int dstep = dstStepM >> 2;
    for (j = 0; j < pResize->m_DstH; j++) {
        for (k = 0; k < pResize->m_DstW * pResize->m_nChannels; k++) {
            *findex++ = pDst[j * dstep + k];
        }
    }

CUDA broke :

Host:

int thread_count = 32;
pResize->m_nChannels = 4;

const int dstep = dstStepM >> 2;
dim3 dimBlockstep(thread_count, thread_count, thread_count);
dim3 dimGridstep((pResize->m_DstW + dimBlockstep.x - 1) / dimBlockstep.x, (pResize->m_DstH + dimBlockstep.y - 1) / dimBlockstep.y, ((pResize->m_DstW * pResize->m_nChannels) + dimBlockstep.z - 1) / dimBlockstep.z);

tbb_dstep << <dimGridstep, dimBlockstep >> > (pDst, src_imf4, pResize->m_DstW, pResize->m_DstH, dstep, pResize->m_nChannels);
cudaDeviceSynchronize();

Device code:

// tbb_dstep
extern global void tbb_dstep(float* input, float* output, const int p_Width, const int p_Height, const int dstep, const int bytespp)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int z = blockIdx.z * blockDim.z + threadIdx.z;

if ((x < p_Width) && (y < p_Height))
{
    const int outindex = (y * p_Width + x) * bytespp;
    const int index = (y * dstep + z);
    output[outindex] = input[index];
}

}