Say that my code is as follows
[codebox]
// Copies one texel per thread from the bound 2D texture (texFloatImg) into
// the pitched output image. Expected launch: a 2D grid/block covering the
// image, one thread per pixel.
//   fpOutImg  - device pointer to the output float image
//   iImgPitch - row pitch passed through to the IMG2D addressing macro
//               (units must match whatever IMG2D expects -- TODO confirm)
// NOTE(review): the qualifier must be __global__; the bare `global` in the
// original cannot compile (the forum formatter likely stripped underscores).
__global__ void FooKernel(float* fpOutImg, int iImgPitch)
{
// __umul24 is a 24-bit multiply: fine while gridDim*blockDim stays < 2^24.
int x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
int y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
// NOTE(review): there is no bounds guard -- if gridSz*blockSz overshoots the
// image dimensions, the IMG2D store writes past the buffer (the texture READ
// is clamped, the global-memory WRITE is not). Confirm the launch exactly
// covers the image, or add `if (x < width && y < height)` once the kernel
// receives the image dimensions.
// With unnormalized coordinates and cudaFilterModePoint, fetch at the texel
// center (x + 0.5f) -- this is the unambiguous form per the programming
// guide and avoids boundary-dependent results on real hardware.
IMG2D(fpOutImg, iImgPitch, x, y) = tex2D(texFloatImg, x + 0.5f, y + 0.5f);
}
// and the invocation code goes something like this
// Upload the host image into the pitched device buffer (row-by-row copy
// honoring the device pitch).
// NOTE(review): this compares a runtime-API result against CUDA_SUCCESS,
// which is the DRIVER-API (CUresult) constant; the runtime API returns
// cudaError_t and should be compared to cudaSuccess. Both are 0 so this
// happens to work, but mixing the two APIs is fragile -- verify cuErr's type.
cuErr = cudaMemcpy2D(fpCudaIn1Img, iCudaMaxImgPitch, fpInImg, iImgWdth * sizeof(float), iImgWdth * sizeof(float), iImgHght, cudaMemcpyHostToDevice);
if (cuErr != CUDA_SUCCESS)
{
return -11;
}
// Bind the device memory to texture memory for ease of access
// NOTE(review): cudaCreateChannelDesc() with no template argument does not
// describe a float channel; for texture<float, 2, ...> this must be
// cudaCreateChannelDesc<float>() (the <float> was probably eaten by the
// forum formatter). A wrong channel descriptor is a classic cause of garbage
// texel reads on real hardware while emulation still "works".
cudaChannelFormatDesc desc = cudaCreateChannelDesc ();
// NOTE(review): cudaBindTexture2D takes width/height in TEXELS and the pitch
// in BYTES. Verify that iImgWdth4 really is the width in elements and
// iCudaMaxImgWdth4 the row pitch in bytes -- the "4" suffixes suggest
// byte-scaled (x4) values, and passing a byte count as the texel width would
// "bend" the image dimensions exactly as described in the post.
cuErr = cudaBindTexture2D(NULL, &texFloatImg, fpCudaIn1Img, &desc, iImgWdth4, iImgHght, iCudaMaxImgWdth4);
if (cuErr != CUDA_SUCCESS)
{
return -11;
}
// NOTE(review): these texture-reference attributes are assigned AFTER the
// bind; the conventional order (and the one used in the SDK samples) is to
// set normalized/addressMode/filterMode BEFORE cudaBindTexture2D -- confirm
// the attributes are actually in effect for the launch below.
texFloatImg.normalized = false;
texFloatImg.addressMode[0] = cudaAddressModeClamp;
texFloatImg.addressMode[1] = cudaAddressModeClamp;
texFloatImg.filterMode = cudaFilterModePoint;
// Call the GPU kernel to do the core computations
// NOTE(review): the kernel's iImgPitch parameter receives iCudaMaxImgWdth,
// while the memcpy above used iCudaMaxImgPitch -- confirm which unit
// (elements vs bytes) the IMG2D macro expects, and that the two values agree.
// Also note: no cudaGetLastError() after the launch, so launch-config errors
// are silently dropped.
FooKernel <<<gridSz, blockSz>>> (fpCudaOutImg, iCudaMaxImgWdth );
[/codebox]
This is not actually what I’m trying to do. What I’m actually trying to do is make my float-image filter library use 2D texture memory instead of global memory (to save some ifs in the kernel code), but this is what I’ve reverted to, since texture floats don’t seem to work properly when I run them on the GPU.
This works fine under EMU Debug but gives very strange behavior on the actual GPU. (I think the float is represented in a different way, or the dimensions are all bent, but I am not sure.)
Has anyone encountered this? If not, does anyone have working GPU code that uses texture&lt;float, 2, …&gt; to pass an image to the kernel?
Thanks in advance,
DV