Simplest texture 2D examples

Dear all,

I am studying textures.

I want to find a simple example of using tex2D to read a 2D texture. I want to avoid cudamallocpitch, cuda arrays, fancy channel descriptions, etc.
I came up with the following code. (Lots of code online is too complicated for me to understand and use lots of parameters in their functions calls).

I want to map the 4 element linear array to a 2 by 2 2D texture.
However, I always get 0 as ouptput.

Any help would be greatly appreciated!

Best,

Mat

#include<stdio.h>

texture<int, 2> texarray;

__global__ void test(){
        int s =  tex2D<int>(texarray, 0 ,0);
        printf("%d \n", s);
}

int main(){

        int *host_array = (int *) malloc(4*sizeof(int));
        host_array[0] = 3; host_array[1] = 4; host_array[2] = 5; host_array[3] = 6;

        int *dev_array;
        cudaMalloc( (void**) &dev_array, 4*sizeof(int) );

        cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();

        cudaBindTexture2D(NULL, texarray, dev_array, desc, 2, 2, 2*sizeof(int) );

        cudaMemcpy(dev_array, host_array, 4*sizeof(int), cudaMemcpyHostToDevice  );

        test<<<1,1>>>();

        cudaDeviceSynchronize();

        cudaUnbindTexture( texarray );
        cudaFree(dev_array);
        free(host_array);

        return 0;
}

(1) Generally speaking, 2D textures require the use of cudaMallocPitch(), because 2D textures have alignment requirements for each “row” that you are unlikely to satisfy otherwise.

(2) Your code seems to be devoid of any error checking; I would suggest adding that for all API calls and the kernel invocation.

(3) Is there no texture example among the CUDA sample apps that ship with CUDA that you consider simple enough?

(4) Maybe take a look at this simple code:

#include <stdlib.h>
#include <stdio.h>

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaDeviceSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

texture<int, 2, cudaReadModeElementType> tex;

__global__ void kernel (int m, int n) 
{
    int val;
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            val = tex2D (tex, col+0.5f, row+0.5f);
            printf ("%3d  ", val);
        }
        printf ("\n");
    }
}

int main (void)
{
    int m = 4; // height = #rows
    int n = 3; // width  = #columns
    size_t pitch, tex_ofs;
    int arr[4][3]= {{10, 11, 12},
                    {20, 21, 22},
                    {30, 31, 32},
                    {40, 41, 42}};
    int *arr_d = 0;

    CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
    CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
                                n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
    tex.normalized = false;
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
                                       n, m, pitch));
    if (tex_ofs !=0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }
    printf ("reading texture:\n");
    kernel<<<1,1>>>(m, n);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    return EXIT_SUCCESS;
}

Output should look like this:

reading texture:
 10   11   12
 20   21   22
 30   31   32
 40   41   42
1 Like

Hello,

[1] MY example is based on the book

http://www.mat.unimi.it/users/sansotte/cuda/CUDA_by_Example.pdf

Section 7.3.5

They do not use pitch. or cudaMemcpy2D, etc.

I understand that alignment is a good thing for performance, but I did not know it was required to make things work.

[2] Yes, agree. After reading online, I have added

#define gerror(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

to the beginning of the code and

gerror( cudaPeekAtLastError() );
        cudaDeviceSynchronize();

at the end.

I got an error, and then I put gerror around the line that does the texture binding, and it seems that that line is the culprit giving "GPUassert: invalid argument "

Any reason why?

[3] The texture examples are all kind of dense. For me at least.

[4] I think that I understand your code. It is also fairly complicated (for me at least). But when I try to test my understanding by making a very simple code, like mine, I do not understand why my code does not work. :(

With very few exceptions, I don’t debug other people’s code. I find debugging my own code annoying enough. In general, debugging one’s code is an excellent way to learn about things. It is sufficiently painful that you will not soon forget any lessons learned this way.

It has been over a decade since I learned about textures in CUDA, and I may no longer remember clearly whether pitched allocations are necessary for 2D textures (due to alignment requirements) or only a recommended best practice (for performance reasons). I automatically use cudaMallocPitch() whenever I use 2D textures. Maybe I was mistaken on that particular issue. Check the official CUDA documentation.

I quickly browsed the manual…

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses

In the “Size and Alignment Requirement” for “Texture and Surface Memory” it does not say that texture needs to be aligned.

I also found this post…

https://devtalk.nvidia.com/default/topic/419616/problem-with-arbitrary-sized-textures-error-invalid-argument-with-cudabindtexture2d-/

where the person is saying that alignment is necessary (or that not all N x M array sizes are possible without using pitch, etc.) …is the person correct? I mean, is there a way to knowing this for sure ?

I guess lots of people just do not dig deep enough and just play safe?!

Quick update…

If I write

cudaBindTexture2D(NULL, texarray, dev_array, desc, 8, 8, 8*sizeof(int) )

the code seems to work, some how… so it seems like the issue is with trying to define a 2 by 2 texture. Maybe it is too small? It would be strange if CUDA did not support small textures, no? Although small textures are not interesting, from a language point of view, it would be kind of strange.

There are two potential issues that are best not conflated:

(1) Is the start of the texture suitably aligned? Since you desire simple code, the code I posted simply gives up if that is not the case (texofs !=0). One can work with textures whose start is not aligned, by using the texture offset when addressing the texture later inside the kernel. I would say that doing so is is an advanced topic, especially for 2D textures. If I skimmed it correctly, this is the subject of the post you linked in #5.

Note that what the code in “CUDA By Example” does, namely passing NULL as the first argument of cudaBindTexture2D(), is bad practice in my book.

(2) Should the start of each texture row be suitably aligned? As you pointed out, that is possibly a performance issue rather than a potential correctness issue as I initially stated. Either way, I would consider the use of cudaMallocPitch() as a best practice when working with 2D textures.

Note that the alignment of the texture start doesn’t immediately tell us anything about texture row alignment, which also depends on the element type and dimensions of the texture. Therefore these two issues should be considered separately.

I see no issues with a 2x2 texture after modifying my posted example from #2 as follows:

#if 0
    int m = 4; // height = #rows
    int n = 3; // width  = #columns
    size_t pitch, tex_ofs;
    int arr[4][3]= {{10, 11, 12},
                    {20, 21, 22},
                    {30, 31, 32},
                    {40, 41, 42}};
#else
    int m = 2; // height = #rows
    int n = 2; // width  = #columns
    size_t pitch, tex_ofs;
    int arr[2][2]= {{10, 11},
                    {20, 21}};
#endif

Thanks.

Very interesting.

  1. So I guess that if tex_ofs was not zero, then we would some how have to correct the way we read the texture? Who determines if tex_ofs is zero, or not?

  2. Also, would your code work if arr was a 1D array instead of a 2D array?

  3. In your line

cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
                                       n, m, pitch)

you use & for tex and also for tex.channelDesc. However, I have seen code online that does not pass these by reference!? Which one is the correct one?

  1. It would be good to find some sort of official word from NVIDIA on this, no?

  2. I guess the example in the book works because they are maybe testing it on images whose size is “nice”. It is a bit misleading. They should at least give a warning “This only works for certain array sizes”.

1a) So I guess that if tex_ofs was not zero, then we would some how have to correct the way we read the texture?

YES.

1b) Who determines if tex_ofs is zero, or not?

Programmer passes a storage address to CUDA runtime when cudaBindTexture2D() is called. CUDA runtime checks the device characteristics of the current GPU for required texture hardware alignment (typically something like 256 bytes or 512 bytes, but we don’t need to know) and places the start of the texture at the largest address <= the passed-in address that fulfills the required alignment. It then places the value of the difference between the texture start address and the storage address passed by the programmer into the location pointed to by the first argument of cudaBindTexture2D(), provided that first argument is not NULL.

As for the balance of the questions, I am not able to provide a personalized texture tutorial here. Maybe other forum participants can tackle that, or you can spend some quality time with the CUDA documentation. From personal experience: Textures are one of the more complicated things for a CUDA beginner to learn, as they are basically a concept from graphics and not related to general purpose C++ computing. In the best of all worlds (according to me :-), they would have never been exposed in CUDA.

In the early days of CUDA, working with textures was frequently needed for good performance. With modern GPUs and modern CUDA, this is hardly ever necessary (with limited exceptions for applications that can take advantage of the low-quality interpolation provided by textures).

Regarding your last comment,

"With modern GPUs and modern CUDA, this is hardly ever necessary "

is it not the case that textures are the only way to have fast access when there is spacial locality in the way the data is to be accessed?

The memory hierarchy of recent GPUs is much different from early GPUs. The several different caches have been pretty much unified, and explicitly declared textures are now not the only way to make use of the “texture path”. GPU caches have also gotten bigger, although nowhere near the size of caches found on x86 CPUs.

If you use restrict and const with pointer arguments as appropriate, and sprinkle in the occasional __ldg() intrinsic if you absolutely have to, you should be good. Consult the Best Practices Guide for details.

In other words, recent GPU generations (Pascal, Volta, Turing) very much allow CUDA programmers to achieve good performance while programming in a natural C++ style. There are certainly exceptions where ultimate performance requires ninja programming skills, but that is true of any computing platform (I look with horror at the questions involving SIMD intrinsics on Stack Overflow).

The book “CUDA by Example” was written in the early days of CUDA, prior to CUDA 3.0, if memory serves. It is showing its age, considering the rapid progress in GPU technology (hardware and software) over the past decade. I would suggest checking out Nicholas Wilt’s “CUDA Handbook” as a newer publication.

Note that I am unable to provide unbiased reviews of these publications, as I worked closely with Messrs Sanders, Kandrot, and Wilt for several years while on the CUDA software team (and I think I reviewed a chapter or two of Nick Wilt’s book prior to publication).

I thought I could provide you with a simple example through xpl (a simple 3x3 mean filter):

#include <xpl.cuh>

int main(int argc, char* argv[])
{
	int rows = 4096;
	int cols = 4096;
	// Input / output matrices
	xpl::TextureBuffer<float> texture(rows,cols);
	xpl::DeviceBuffer<float> out(texture.size());

	// Initialize to some values:
	texture.for_each([=]__device__(float& val){val=1.0f;});
	out.for_each([=]__device__(float& val){val=0.0f;});
	// Write random value here:
	texture(3,3) = 42.0f;
	//
	// Filter using texture memory
	//
	auto mean_filter_texture = xpl_device_lambda()
	{
		
		int i = xpl::device::getY();
		int j = xpl::device::getX();
		
		float sum = 0.0f; 
		for(int y : {-1,0,1})
			for(int x : {-1,0,1})
			{
				float val = texture(i+y, j+x);
				sum += val;
			}

		if( i < out.rows() && j < out.cols() )
			out(i,j) = sum/9.0f;  
	};
	// Execute kernel 
	xpl::device::execute(texture.size(), mean_filter_texture);
	xpl::device::synchronize();
	// Define 10x10 region
	xpl::Region smallRegion(0,9,0,9);
	std::cout << out(smallRegion);

	return 0;
}

This can be compiled with cmake:

CMakeLists.txt

cmake_minimum_required(VERSION 3.10.2)

project(textureMeanFilter VERSION 0.1.0 LANGUAGES CXX CUDA)

find_package(xtream 1.2.4 REQUIRED)

add_executable(textureMeanFilter textureMeanFilter.cu)
target_link_libraries(textureMeanFilter xtream)

See https://devtalk.nvidia.com/default/topic/940518/how-to-determine-the-base-adress-alignment-and-pitch-alignment-used-by-cudamallocpitch-/ and https://stackoverflow.com/questions/12550927/pitch-alignment-for-2d-textures

Use texture object, or a higher-level abstraction to it (like the xtrem library mentioned in the last posting).
The simple strategy mentioned also above (by using restrict and const and __ldg) might work also good enough.