This is supposed to be a repro code ( use nvcc -o test test.cu -arch=compute_30 ) , but on my Windows laptop running a Kepler chip and CUDA 7.5RC it doesn’t exhibit the problem.
The top printout shows the texel center values, the second line shows the values interpolated
inbetween 1.0 and 2.0 (1.5), 2.0 and 4.0 (3.0) and the wraparound case 4.0 and 1.0 (2.5). If the border color was used to interpolate against, the value 2.0 would show up in place of 2.5.
Next I’ll try on the Linux machines that exhibited the problem in our production code. If the repro code still doesn’t reproduce the problem there, the problem must lie deeper (it may be some stupid texture initialization bug on our end)
proper output (expected result)
1.00000000+i* 1.00000000 2.00000000+i* 2.00000000 4.00000000+i* 4.00000000
1.50000000+i* 1.50000000 3.00000000+i* 3.00000000 2.50000000+i* 2.50000000
test.cu
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaThreadSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
__global__ void kernel(cudaTextureObject_t tex, int m, int n)
{
float2 val;
// texture access at texel center coordinatres
for (int row = 0; row < m; row++) {
for (int col = 0; col < n; col++) {
val = tex2D<float2>(tex, (col + 0.5f)/n, (row + 0.5f)/m);
printf("% 9.8f+i*% 9.8f ", val.x, val.y);
}
printf("\n");
}
printf("\n");
// texture access to the right of texel center (halfway between texels, horizontally)
for (int row = 0; row < m; row++) {
for (int col = 0; col < n; col++) {
val = tex2D<float2>(tex, (col + 0.5f + 0.499f)/n, (row + 0.5f)/m);
printf("% 9.8f+i*% 9.8f ", val.x, val.y);
}
printf("\n");
}
}
int main(void)
{
const int m = 1; // height = #rows
const int n = 3; // width = #columns
ushort2 arr[m][n] = {
{ make_ushort2(0x3c00, 0x3c00), make_ushort2(0x4000, 0x4000), make_ushort2(0x4400, 0x4400) }, // 1.0 , 2.0 , 4.0
};
cudaArray_t array;
cudaChannelFormatDesc channelDesc = cudaCreateChannelDescHalf2();
CUDA_SAFE_CALL(cudaMallocArray(&array, &channelDesc, n, m, 0));
CUDA_SAFE_CALL(cudaMemcpy2DToArray(array, 0, 0, arr, n*sizeof(arr[0][0]), n*sizeof(arr[0][0]), m, cudaMemcpyHostToDevice));
// create resource description
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = array;
// create texture description
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.normalizedCoords = 1;
// create texture object
cudaTextureObject_t tex = 0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
kernel << <1, 1 >> >(tex, m, n);
CHECK_LAUNCH_ERROR();
CUDA_SAFE_CALL(cudaDeviceSynchronize());
CUDA_SAFE_CALL(cudaFreeArray(array));
CUDA_SAFE_CALL(cudaDestroyTextureObject(tex));
return EXIT_SUCCESS;
}