Dear community,
my problem consists of of multiple data sequences with the same size that my measurement hardware returns as "unsigned short"s. The sequences are are stored sequentially as a long 1D array. Each sequence has to be resampled by interpolation for which the new coordinates (floats) are equal for all sequences and are stored in another 1D array of the same size as one sequence. Further data processing is done on the GPU after the resampling.
I have already written a working interpolation kernel, however, the interpolation remains the main bottleneck. Because this is carried out on an older GPU (Geforce 750) I thought it would be worth a try and check if texture based interpolation performs faster.
If I convert the data from unsigned short to float type on the GPU and move this to the GPU for texture interpolation, everything works fine. I have attached a minimal working example for this below.
#include <iostream>
#include <cuda.h>
#include <cstdlib>
// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaThreadSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
// attempt to interpolate linear memory
__global__
void cuda_texture_interpolate(cudaTextureObject_t tex,
int count,
int size,
float* d_map) {
const int start_idx = (blockIdx.x * size + threadIdx.x)*(count);
if (start_idx < size) {
if (count < 1) { count = 1; }
float x;
float y;
printf("count is: %d and start is %d an size is %d\n", count, start_idx, size);
for (int i = 0; i != count; i++) {
x = (d_map[i]+0.5+start_idx);
y = tex1D<float>(tex, x);
printf("ID: %d, x: %f ; y: %f\n", threadIdx.x, x, y);
}
}
}
int main() {
const int n = 10;
const int map_length = 5;
float a_host[n] = {11,14,17,13,15,11,14,17,13,15};
float m_host[map_length] = {0, 0.4288, 0.8615, 1.2980, 1.7384};
// allocate and copy to cuda array and device memory
cudaChannelFormatDesc channelDesc =
cudaCreateChannelDesc(32, 0, 0, 0,
cudaChannelFormatKindFloat);
cudaArray* cuArray;
float* d_map;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_map, sizeof(float)*map_length));
CUDA_SAFE_CALL(cudaMallocArray(&cuArray, &channelDesc, n));
// Copy to device memory
CUDA_SAFE_CALL(cudaMemcpyToArray(cuArray, 0, 0, a_host, n*sizeof(float),
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_map,m_host, sizeof(float)*map_length, cudaMemcpyHostToDevice));
// create texture object
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
// texDesc.normalizedCoords = 1;
texDesc.normalizedCoords = 0;
cudaResourceViewDesc resViewDesc;
memset(&resViewDesc, 0, sizeof(resViewDesc));
resViewDesc.format = cudaResViewFormatFloat1;
resViewDesc.width = n;
// create texture object
cudaTextureObject_t tex;
CUDA_SAFE_CALL(cudaCreateTextureObject(&tex, &resDesc, &texDesc, &resViewDesc));
// call interpolation kernel
cuda_texture_interpolate<<<2,2>>>(tex,map_length,n, d_map);
CHECK_LAUNCH_ERROR();
// clean up
CUDA_SAFE_CALL(cudaDestroyTextureObject(tex));
CUDA_SAFE_CALL(cudaFreeArray(cuArray));
CUDA_SAFE_CALL(cudaFree(d_map));
printf("end of texture_object_interpolation.\n");
}
I thought it would be really need to utilize the texture interpolation to also take care of the type conversion from unsigned short to float (as for example done here. However, for both my code, as well as when I change the copied and successfully tested 2D example implementation to a 1D case, the code does not compile due to an “invalid argument” error when I try to create the texture object. Does anyone have any idea how to fix this or am I missing an obvious limitation that prohibits my preferred use case?
Here is the unsuccessful code:
// attempt to interpolate linear memory
__global__
void cuda_texture_interpolate(cudaTextureObject_t tex,
int count,
int size,
float* d_map) {
const int start_idx = (blockIdx.x * size + threadIdx.x)*(count);
if (start_idx < size) {
if (count < 1) { count = 1; }
float x;
float y;
printf("count is: %d and start is %d an size is %d\n", count, start_idx, size);
for (int i = 0; i != count; i++) {
x = (d_map[i]+0.5+start_idx);
y = tex1D<float>(tex, x)*65536.0f;
printf("ID: %d, x: %f ; y: %f\n", threadIdx.x, x, y);
}
}
}
int main() {
const int n = 10;
const int map_length = 5;
unsigned short a_host[n] = {11,14,17,13,15,11,14,17,13,15};
float m_host[map_length] = {0, 0.4288, 0.8615, 1.2980, 1.7384};
// allocate and copy to cuda array and device memory
cudaChannelFormatDesc channelDesc =
cudaCreateChannelDesc(16, 0, 0, 0,
cudaChannelFormatKindUnsigned);
cudaArray* cuArray;
float* d_map;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_map, sizeof(float)*map_length));
CUDA_SAFE_CALL(cudaMallocArray(&cuArray, &channelDesc, n));
// Copy to device memory
CUDA_SAFE_CALL(cudaMemcpyToArray(cuArray, 0, 0, a_host, n*sizeof(unsigned short),
cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(d_map,m_host, sizeof(float)*map_length, cudaMemcpyHostToDevice));
// create texture object
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeNormalizedFloat;
// texDesc.normalizedCoords = 1;
texDesc.normalizedCoords = 0;
cudaResourceViewDesc resViewDesc;
memset(&resViewDesc, 0, sizeof(resViewDesc));
resViewDesc.format = cudaResViewFormatFloat1;
resViewDesc.width = n;
// create texture object
cudaTextureObject_t tex;
CUDA_SAFE_CALL(cudaCreateTextureObject(&tex, &resDesc, &texDesc, &resViewDesc));
// call interpolation kernel
cuda_texture_interpolate<<<2,2>>>(tex,map_length,n, d_map);
CHECK_LAUNCH_ERROR();
// clean up
CUDA_SAFE_CALL(cudaDestroyTextureObject(tex));
CUDA_SAFE_CALL(cudaFreeArray(cuArray));
CUDA_SAFE_CALL(cudaFree(d_map));
printf("end of texture_object_interpolation.\n");
}
I suspect this error to be caused by a wrong combination when I try to promote the type. Thanks in advance!