I have been working on an application in which each thread follows a “ray through a 3D data structure” access pattern.
For this reason I decided to use texture/surface memory for the reads from this data structure.
The conversion of the code worked and I got an impressive speedup (the long-latency stall bottleneck was dramatically reduced).
However, after several iterations, the final output of my algorithm converges to slightly different values.
I have spent several days trying to pinpoint the source of the discrepancy between outputs, without luck.
For this reason, I implemented a simple example, based on the “rotating with textures” code from the cuda-samples, that mimics part of the reads/writes from/to this texture data structure.
Currently my mock example does the following:
- Generate input array with zeros on the top part and floating point values on the bottom part.
- Print original data
- Copy host data to texture array
- Use texture kernel to copy to device array
- Copy device array to host
- Print host data (first iteration)
- Copy device array to texture array
- Use texture kernel to copy to device array
- Copy device array to host
- Print host data (second iteration)
My application uses point filtering (no interpolation), with no transformation of the texture indices.
Because I am only reading and writing the same data at the same locations, I expected to always see the same values; however, some lines have slightly different values across iterations. The lines with incorrect values are not consistent: different iterations show differences on different lines.
I cannot explain why, and I think this is what is affecting my main application.
I am at a loss as to why this is happening. Are there known bugs when reading/writing texture memory?
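To pinpoint the mismatching lines without eyeballing the printed matrices, I could also diff the host arrays from two iterations with a small helper along these lines (sketch only, the helper name is mine and it is not part of the code below; it only needs the same <iostream> include the code already uses):
// Sketch: compare two host buffers row by row and report rows that differ.
// 'reference' and 'current' are hypothetical copies of h_data taken after two iterations.
void reportMismatchedRows(const float* reference, const float* current,
                          size_t width, size_t height) {
  for (size_t j = 0; j < height; ++j) {
    for (size_t i = 0; i < width; ++i) {
      if (reference[width*j + i] != current[width*j + i]) {
        std::cout << "Row " << j << " first differs at column " << i << std::endl;
        break; // report only the first mismatch per row
      }
    }
  }
}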
Compilation also warns me that I am using a deprecated function. Is this a problem?
main.cu:81:7: warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated [-Wdeprecated-declarations]
checkCudaErrors(cudaMemcpyToArray(cu_array, 0, 0, h_data, size,
^
/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda_runtime_api.h:6926:46: note: declared here
extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enu
Which function should I use?
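From the deprecation note, I assume the intended replacement is cudaMemcpy2DToArray, which takes an explicit source pitch plus a width in bytes and a height in rows. Would a rewrite along these lines be correct (sketch only, assuming a tightly packed source, not what I currently compile)?
// Sketch: replacing the deprecated cudaMemcpyToArray call, assuming the
// source buffer is tightly packed (pitch = width * sizeof(float)).
checkCudaErrors(cudaMemcpy2DToArray(cu_array, 0, 0, h_data,
                                    width * sizeof(float),  // source pitch in bytes
                                    width * sizeof(float),  // width of copied region in bytes
                                    height,                 // height of copied region in rows
                                    cudaMemcpyHostToDevice));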
Code:
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <stdio.h>
#include <numeric>
#include <vector>
#include <iostream>
#include <iomanip>
#include <limits>
#define PI_F 3.141592654f
#define DEBUG 1
//undef DEBUG
void printArrayAsMatrix(const float* in,
                        const size_t& width, const size_t& height) {
#ifdef DEBUG
  std::cout << "Printing " << width << "," << height << " array" << std::endl;
  for (size_t j = 0; j < height; ++j) {
    for (size_t i = 0; i < width; ++i) {
      std::cout
          << std::fixed
          << std::setw(12)        // space between numbers
          << std::setprecision(8) // numbers after decimal point
          // << std::setprecision(std::numeric_limits<float>::digits10)
          << in[width*j + i] << ',';
    }
    std::cout << std::endl;
  }
#endif
}
__global__ void rotateKernel(float* output,
                             cudaTextureObject_t texObj, int width, int height,
                             float theta) {
  // Calculate (unnormalized) texel coordinates and the output index
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
  unsigned int idx = y * width + x;
  // Read from texture and write to global memory
  output[idx] = tex2D<float>(texObj, x, y);
}
int main()
{
  // Inputs
  size_t width = 128;
  size_t height = 128;
  size_t size = width * height * sizeof(float);
  float angle = 0;                // in degrees
  float theta = angle/180*PI_F;   // in rad
  // Initialize host array
  float* h_data = (float*)malloc(size);
  for (int i = 0; i < height*width; ++i) h_data[i] = (float)i/(height*width);
  memset(h_data, 0, size/4);
  // cudaArray obj will have elements of 32 bits, representing single-precision
  // floating point numbers
  cudaChannelFormatDesc ch_desc =
      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
  cudaArray* cu_array;
  checkCudaErrors(cudaMallocArray(&cu_array, &ch_desc, width, height));
  checkCudaErrors(cudaMemcpyToArray(cu_array, 0, 0, h_data, size,
                                    cudaMemcpyHostToDevice));
  // Specify texture
  // Texture is going to be bound to the 2D cudaArray cu_array
  struct cudaResourceDesc res_desc;
  memset(&res_desc, 0, sizeof(res_desc));
  res_desc.resType = cudaResourceTypeArray;
  res_desc.res.array.array = cu_array;
  // Specify texture object parameters
  // - Clamp mode: out-of-bounds indices are clamped to [0, width-1] / [0, height-1]
  // - Point filtering (no interpolation)
  // - No conversion/normalization of the value read
  // - Coordinates are not normalized
  struct cudaTextureDesc tex_desc;
  memset(&tex_desc, 0, sizeof(tex_desc));
  tex_desc.addressMode[0] = cudaAddressModeClamp;
  tex_desc.addressMode[1] = cudaAddressModeClamp;
  tex_desc.filterMode = cudaFilterModePoint;
  tex_desc.readMode = cudaReadModeElementType;
  tex_desc.normalizedCoords = 0;
  // Copy host memory to cudaArray
  checkCudaErrors(cudaMemcpyToArray(cu_array, 0, 0, h_data, size,
                                    cudaMemcpyHostToDevice));
  // Create texture object
  cudaTextureObject_t tex_obj = 0;
  cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
  // Allocate result of transformation in device memory
  float* d_output;
  checkCudaErrors(cudaMalloc(&d_output, size));
  // Print host array
  printArrayAsMatrix(h_data, width, height);
  // Invoke kernel rotating it once
  dim3 dimBlock(16, 16);
  dim3 dimGrid(
      (width + dimBlock.x - 1) / dimBlock.x + 1,
      (height + dimBlock.y - 1) / dimBlock.y + 1);
  rotateKernel<<<dimGrid, dimBlock>>>(d_output, tex_obj, width, height, theta);
  // Print result array
  checkCudaErrors(cudaMemcpy(h_data, d_output, size, cudaMemcpyDeviceToHost));
  printArrayAsMatrix(h_data, width, height);
  // Copy old result to texture and invoke kernel rotating it again
  checkCudaErrors(cudaMemcpyToArray(cu_array, 0, 0, d_output, size,
                                    cudaMemcpyDeviceToDevice));
  rotateKernel<<<dimGrid, dimBlock>>>(d_output, tex_obj, width, height, theta);
  // Print result array
  checkCudaErrors(cudaMemcpy(h_data, d_output, size, cudaMemcpyDeviceToHost));
  printArrayAsMatrix(h_data, width, height);
  // Copy old result to texture and invoke kernel rotating it again
  checkCudaErrors(cudaMemcpyToArray(cu_array, 0, 0, d_output, size,
                                    cudaMemcpyDeviceToDevice));
  rotateKernel<<<dimGrid, dimBlock>>>(d_output, tex_obj, width, height, theta);
  // Print result array
  checkCudaErrors(cudaMemcpy(h_data, d_output, size, cudaMemcpyDeviceToHost));
  printArrayAsMatrix(h_data, width, height);
  // Destroy texture object
  checkCudaErrors(cudaDestroyTextureObject(tex_obj));
  // Free device memory
  checkCudaErrors(cudaFreeArray(cu_array));
  checkCudaErrors(cudaFree(d_output));
  // Free host memory
  free(h_data);
}
I am using Ubuntu 16.04 with driver version 450.51.05 and CUDA 11.0 (as reported by nvidia-smi).
It is possible to compile with:
# To compile for a 1080 card.
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc -std=c++14 -m64 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61 -o main.o -c main.cu
/usr/local/cuda/bin/nvcc -ccbin g++ -std=c++14 -m64 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61 -o simpleTextureObject main.o
To run:
./simpleTextureObject
To run and view the output on the terminal with proper format:
./simpleTextureObject > out
column -s, -t < out | less -#2 -N -S
This link has pictures comparing segments of this data and highlighting the mismatch.
Any help is deeply appreciated.
Thank you in advance.