Decreased performance when using textures

Hello!

Trying to repeat the example from the book. It simulates heat transfer.
Consider two options:

  1. Without texture memory
  2. Using texture memory

The source code is given below.
The book states that the second option should work faster than the first.
And I get the opposite:

  1. 63 ms per frame
  2. 380 ms per frame

Graphics Processor: GeForce GTX 1060 with Max-Q Design

Can you please tell me why it happens?
Thank you!

First

#include <stdio.h>
#include <cuda.h>
#include <book.h>
#include <cpu_anim.h>

#define DIM 1024
#define SPEED 0.25f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define PI 3.1415926535897932f

// Bundle of all per-run state shared between the animation callbacks.
struct DataBlock {
	unsigned char	*output_bitmap;	// device buffer holding the rendered frame
	float			*dev_inSrc;		// device grid read by each simulation step
	float			*dev_outSrc;	// device grid written by each simulation step
	float			*dev_constSrc;	// fixed "heater" cells, re-applied every step
	CPUAnimBitmap	*bitmap;		// host-side animation bitmap (from cpu_anim.h)
	cudaEvent_t		start, stop;	// events used to time each displayed frame
	float			totalTime;		// accumulated GPU time over all frames, in ms
	float			frames;			// number of frames rendered so far
};

// Re-impose the constant "heater" pattern onto the simulation grid.
// Cells where the constant source is zero are left untouched.
// Expects a 2D launch covering the full DIM x DIM grid.
__global__ void copy_const_kernel( float *iptr, const float *cptr ) {
	// flat index of this thread's cell
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;
	int offset = x + y * blockDim.x * gridDim.x;

	float fixed = cptr[offset];
	if (fixed != 0)
		iptr[offset] = fixed;
}

// One explicit diffusion step of the heat equation:
//   out = in + SPEED * (top + bottom + left + right - 4 * in)
// Neighbor indices are clamped at the grid edges (zero-flux boundary).
// Expects a 2D launch covering the full DIM x DIM grid.
__global__ void blend_kernel( float *outSrc, const float *inSrc) {
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;
	int offset = x + y * blockDim.x * gridDim.x;

	// clamp the horizontal neighbors at the left/right edges
	int left = offset - 1;
	int right = offset + 1;
	if(x == 0) left++;
	if(x == DIM-1) right--;

	// clamp the vertical neighbors at the top/bottom edges
	int top = offset - DIM;
	int bottom = offset + DIM;
	if(y == 0) top += DIM;
	if(y == DIM-1) bottom -= DIM;

	// BUG FIX: the posted code read "inSrc + inSrc" (pointer + pointer,
	// which does not compile) — the [left] and [right] subscripts were
	// lost in transcription. Restored the book's four-neighbor stencil.
	outSrc[offset] = inSrc[offset] + SPEED * ( inSrc[left] + inSrc[right] + inSrc[bottom] + inSrc[top] - inSrc[offset] * 4 );
}

// Per-frame callback for CPUAnimBitmap: advances the simulation 90 time
// steps on the GPU, renders the temperatures to the bitmap, and prints the
// running average GPU time per displayed frame.
void anim_gpu( DataBlock *d, int ticks ) {
	HANDLE_ERROR (cudaEventRecord( d->start, 0) );
	// one 16x16 thread block per 16x16 tile of the DIM x DIM grid
	dim3 blocks(DIM/16, DIM/16);
	dim3 threads(16, 16);
	CPUAnimBitmap *bitmap = d->bitmap;

	// 90 simulation steps per displayed frame
	for(int i=0; i<90; i++) {
		// re-impose the heater cells, then diffuse one step
		copy_const_kernel<<<blocks,threads>>>( d->dev_inSrc, d->dev_constSrc );
		blend_kernel<<<blocks,threads>>>( d->dev_outSrc, d->dev_inSrc );
		// host-side pointer swap: this step's output becomes the next input
		swap ( d->dev_inSrc, d->dev_outSrc );
	}

	// float_to_color (from book.h) — presumably maps each temperature to
	// an RGBA pixel; TODO confirm against the header
	float_to_color<<<blocks,threads>>>(d->output_bitmap, d->dev_inSrc);

	HANDLE_ERROR( cudaMemcpy( bitmap->get_ptr(),
							  d->output_bitmap,
							  bitmap->image_size(),
							  cudaMemcpyDeviceToHost ) );

	HANDLE_ERROR (cudaEventRecord( d->stop, 0) );
	// block until all work before the stop event has finished, so the
	// measured interval covers the kernels and the copy above
	HANDLE_ERROR (cudaEventSynchronize( d->stop) );

	float elapsedTime;
	HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
	                                        d->start, d->stop ) );

	d->totalTime += elapsedTime;
	++d->frames;
    printf( "Average one frame time:  %3.1f ms\n", d->totalTime / d->frames );
}

// Release every GPU resource owned by the DataBlock.
// Registered as the exit callback of CPUAnimBitmap.
void anim_exit( DataBlock *d) {
	// free the three simulation buffers
	float *buffers[] = { d->dev_inSrc, d->dev_outSrc, d->dev_constSrc };
	for (int i = 0; i < 3; ++i)
		HANDLE_ERROR( cudaFree( buffers[i] ) );

	// tear down the timing events
	HANDLE_ERROR( cudaEventDestroy( d->start ) );
	HANDLE_ERROR( cudaEventDestroy( d->stop ) );
}

// Allocate device buffers, build the initial temperature fields on the
// host, upload them, and hand control to the animation loop.
int main( void ) {
    DataBlock   data;
    CPUAnimBitmap bitmap( DIM, DIM, &data );
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;
    HANDLE_ERROR( cudaEventCreate( &data.start ) );
    HANDLE_ERROR( cudaEventCreate( &data.stop ) );

    // bytes of one DIM x DIM RGBA frame; also reused as the size of the
    // float grids (assumes sizeof(float) == 4 bytes per pixel)
    int imageSize = bitmap.image_size();

    HANDLE_ERROR( cudaMalloc( (void**)&data.output_bitmap,
                               imageSize ) );


    HANDLE_ERROR( cudaMalloc( (void**)&data.dev_inSrc,
                              imageSize ) );
    HANDLE_ERROR( cudaMalloc( (void**)&data.dev_outSrc,
                              imageSize ) );
    HANDLE_ERROR( cudaMalloc( (void**)&data.dev_constSrc,
                              imageSize ) );


    // initialize the constant data: a large rectangular heater at
    // MAX_TEMP plus a few individual warm/cold cells and one cold square
    float *temp = (float*)malloc( imageSize );
    for (int i=0; i<DIM*DIM; i++) {
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if ((x>300) && (x<600) && (y>310) && (y<601))
            temp[i] = MAX_TEMP;
    }
    temp[DIM*100+100] = (MAX_TEMP + MIN_TEMP)/2;
    temp[DIM*700+100] = MIN_TEMP;
    temp[DIM*300+300] = MIN_TEMP;
    temp[DIM*200+700] = MIN_TEMP;
    for (int y=800; y<900; y++) {
        for (int x=400; x<500; x++) {
            temp[x+y*DIM] = MIN_TEMP;
        }
    }
    HANDLE_ERROR( cudaMemcpy( data.dev_constSrc, temp,
                              imageSize,
                              cudaMemcpyHostToDevice ) );

    // initialize the input data: reuse temp (still holding the constant
    // pattern) and add a hot strip in the bottom-left corner
    for (int y=800; y<DIM; y++) {
        for (int x=0; x<200; x++) {
            temp[x+y*DIM] = MAX_TEMP;
        }
    }
    HANDLE_ERROR( cudaMemcpy( data.dev_inSrc, temp,
                              imageSize,
                              cudaMemcpyHostToDevice ) );
    free( temp );

    // run the animation loop; anim_gpu is called once per frame and
    // anim_exit once on shutdown
    bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu,
                           (void (*)(void*))anim_exit );
}

Second

#include <stdio.h>
#include <cuda.h>
#include <book.h>
#include <cpu_anim.h>

#define DIM 1024
#define PI 3.1415926535897932f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED   0.25f

// these exist on the GPU side
// NOTE(review): texture *references* are a legacy API (deprecated in
// CUDA 11, removed in CUDA 12); modern code should use texture objects.
texture<float,2>  texConstSrc;  // constant "heater" pattern
texture<float,2>  texIn;        // bound to dev_inSrc
texture<float,2>  texOut;       // bound to dev_outSrc

// One explicit diffusion step of the heat equation, reading the current
// state through whichever texture (texIn/texOut) holds the input buffer.
// dstOut selects the ping-pong direction: true reads texIn, false texOut.
// Expects a 2D launch covering the full DIM x DIM grid.
__global__ void blend_kernel( float *dst,
                              bool dstOut ) {
    // pixel coordinates and flat output index for this thread
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // fetch the center cell and its four neighbors; out-of-range
    // coordinates are handled by the texture's addressing mode
    float top, left, center, right, bottom;
    if (dstOut) {
        top    = tex2D(texIn,  x,   y-1);
        left   = tex2D(texIn,  x-1, y);
        center = tex2D(texIn,  x,   y);
        right  = tex2D(texIn,  x+1, y);
        bottom = tex2D(texIn,  x,   y+1);
    } else {
        top    = tex2D(texOut, x,   y-1);
        left   = tex2D(texOut, x-1, y);
        center = tex2D(texOut, x,   y);
        right  = tex2D(texOut, x+1, y);
        bottom = tex2D(texOut, x,   y+1);
    }
    // explicit Euler update of the heat equation
    dst[offset] = center + SPEED * (top + bottom + right + left - 4 * center);
}

// Re-impose the constant "heater" cells (read via texConstSrc) onto the
// simulation grid. Cells where the constant pattern is zero are left
// untouched. Expects a 2D launch covering the full DIM x DIM grid.
__global__ void copy_const_kernel( float *iptr ) {
    // flat index of this thread's cell
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    float heater = tex2D(texConstSrc,x,y);
    if (heater != 0)
        iptr[offset] = heater;
}

// globals needed by the update routine
// Bundle of all per-run state shared between the animation callbacks.
struct DataBlock {
    unsigned char   *output_bitmap;  // device buffer holding the rendered frame
    float           *dev_inSrc;      // ping-pong buffer A (bound to texIn)
    float           *dev_outSrc;     // ping-pong buffer B (bound to texOut)
    float           *dev_constSrc;   // fixed "heater" cells (bound to texConstSrc)
    CPUAnimBitmap  *bitmap;          // host-side animation bitmap (from cpu_anim.h)

    cudaEvent_t     start, stop;     // events used to time each displayed frame
    float           totalTime;       // accumulated GPU time over all frames, in ms
    float           frames;          // number of frames rendered so far
};

// Per-frame callback for CPUAnimBitmap: advances the simulation 90 time
// steps on the GPU (ping-ponging between the two bound textures), renders
// the result, and prints the running average GPU time per frame.
void anim_gpu( DataBlock *d, int ticks ) {
    HANDLE_ERROR( cudaEventRecord( d->start, 0 ) );
    // one 16x16 thread block per 16x16 tile of the DIM x DIM grid
    dim3    blocks(DIM/16,DIM/16);
    dim3    threads(16,16);
    CPUAnimBitmap  *bitmap = d->bitmap;

    // since tex is global and bound, we have to use a flag to
    // select which is in/out per iteration
    volatile bool dstOut = true;
    for (int i=0; i<90; i++) {
        float   *in, *out;
        if (dstOut) {
            in  = d->dev_inSrc;
            out = d->dev_outSrc;
        } else {
            out = d->dev_inSrc;
            in  = d->dev_outSrc;
        }
        // re-impose the heater cells on the input, then diffuse one step
        copy_const_kernel<<<blocks,threads>>>( in );
        blend_kernel<<<blocks,threads>>>( out, dstOut );
        dstOut = !dstOut;
    }
    // float_to_color (from book.h) — presumably maps each temperature to
    // an RGBA pixel; TODO confirm against the header
    float_to_color<<<blocks,threads>>>( d->output_bitmap,
                                        d->dev_inSrc );

    HANDLE_ERROR( cudaMemcpy( bitmap->get_ptr(),
                              d->output_bitmap,
                              bitmap->image_size(),
                              cudaMemcpyDeviceToHost ) );

    HANDLE_ERROR( cudaEventRecord( d->stop, 0 ) );
    // block until all work before the stop event has finished, so the
    // measured interval covers the kernels and the copy above
    HANDLE_ERROR( cudaEventSynchronize( d->stop ) );
    float   elapsedTime;
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        d->start, d->stop ) );
    d->totalTime += elapsedTime;
    ++d->frames;
    printf( "Average Time per frame:  %3.1f ms\n",
            d->totalTime/d->frames  );
}

// clean up memory allocated on the GPU
// Unbind the texture references, free the device buffers, and destroy the
// timing events. Registered as the exit callback of CPUAnimBitmap.
void anim_exit( DataBlock *d ) {
    // FIX: check the unbind calls too — these were the only CUDA calls in
    // the file not wrapped in HANDLE_ERROR, and an unnoticed sticky error
    // here would make the frees below fail mysteriously.
    HANDLE_ERROR( cudaUnbindTexture( texIn ) );
    HANDLE_ERROR( cudaUnbindTexture( texOut ) );
    HANDLE_ERROR( cudaUnbindTexture( texConstSrc ) );

    HANDLE_ERROR( cudaFree( d->dev_inSrc ) );
    HANDLE_ERROR( cudaFree( d->dev_outSrc ) );
    HANDLE_ERROR( cudaFree( d->dev_constSrc ) );

    HANDLE_ERROR( cudaEventDestroy( d->start ) );
    HANDLE_ERROR( cudaEventDestroy( d->stop ) );
}


// Allocate device buffers, bind them to the 2D texture references, build
// the initial temperature fields on the host, upload them, and hand
// control to the animation loop.
int main( void ) {
    DataBlock   data;
    CPUAnimBitmap bitmap( DIM, DIM, &data );
    data.bitmap = &bitmap;
    data.totalTime = 0;
    data.frames = 0;
    HANDLE_ERROR( cudaEventCreate( &data.start ) );
    HANDLE_ERROR( cudaEventCreate( &data.stop ) );

    // bytes of one DIM x DIM RGBA frame; also reused as the size of the
    // float grids (see the size assumption below)
    int imageSize = bitmap.image_size();

    HANDLE_ERROR( cudaMalloc( (void**)&data.output_bitmap,
                               imageSize ) );

    // assume float == 4 chars in size (ie rgba)
    HANDLE_ERROR( cudaMalloc( (void**)&data.dev_inSrc,
                              imageSize ) );
    HANDLE_ERROR( cudaMalloc( (void**)&data.dev_outSrc,
                              imageSize ) );
    HANDLE_ERROR( cudaMalloc( (void**)&data.dev_constSrc,
                              imageSize ) );

    // bind the three 2D texture references to the linear device buffers;
    // the last argument is the row pitch in bytes
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    HANDLE_ERROR( cudaBindTexture2D( NULL, texConstSrc,
                                   data.dev_constSrc,
                                   desc, DIM, DIM,
                                   sizeof(float) * DIM ) );

    HANDLE_ERROR( cudaBindTexture2D( NULL, texIn,
                                   data.dev_inSrc,
                                   desc, DIM, DIM,
                                   sizeof(float) * DIM ) );

    HANDLE_ERROR( cudaBindTexture2D( NULL, texOut,
                                   data.dev_outSrc,
                                   desc, DIM, DIM,
                                   sizeof(float) * DIM ) );

    // initialize the constant data: a large rectangular heater at
    // MAX_TEMP plus a few individual warm/cold cells and one cold square
    float *temp = (float*)malloc( imageSize );
    for (int i=0; i<DIM*DIM; i++) {
        temp[i] = 0;
        int x = i % DIM;
        int y = i / DIM;
        if ((x>300) && (x<600) && (y>310) && (y<601))
            temp[i] = MAX_TEMP;
    }
    temp[DIM*100+100] = (MAX_TEMP + MIN_TEMP)/2;
    temp[DIM*700+100] = MIN_TEMP;
    temp[DIM*300+300] = MIN_TEMP;
    temp[DIM*200+700] = MIN_TEMP;
    for (int y=800; y<900; y++) {
        for (int x=400; x<500; x++) {
            temp[x+y*DIM] = MIN_TEMP;
        }
    }
    HANDLE_ERROR( cudaMemcpy( data.dev_constSrc, temp,
                              imageSize,
                              cudaMemcpyHostToDevice ) );

    // initialize the input data: reuse temp (still holding the constant
    // pattern) and add a hot strip in the bottom-left corner
    for (int y=800; y<DIM; y++) {
        for (int x=0; x<200; x++) {
            temp[x+y*DIM] = MAX_TEMP;
        }
    }
    HANDLE_ERROR( cudaMemcpy( data.dev_inSrc, temp,
                              imageSize,
                              cudaMemcpyHostToDevice ) );
    free( temp );

    // run the animation loop; anim_gpu is called once per frame and
    // anim_exit once on shutdown
    bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu,
                           (void (*)(void*))anim_exit );
}

What book?

Are you using release builds for both code variants? You would never want to time a debug build.

Did you modify the example code from the book in any way?

Assuming the code above is the original, unmodified code from the book, and you are using a release build for both variants:

Hypothesis 1: The book is wrong. Mistakes and errors in books are a thing. Depending on the quality of the publication, even fairly common. The timing methodology used by the code from the book may be questionable.

Hypothesis 2: The book is outdated. Early GPU architectures differed substantially from modern GPU architectures. The amount of processing resources has increased by a factor of ten, and the memory hierarchy is quite different. Bottlenecks in the code may have shifted.

[Later:] I compiled both codes and ran them on a Quadro P2000 under Windows 7. The first code reports “Average Time per frame: 13.2ms”, the second code reports “Average Time per frame: 13.3ms”. I think the Quadro P2000 is roughly comparable to the GTX 1060 in terms of performance.

This would seem to suggest that you are not using release builds. That the times are equal may be indicative of the fact that classical textures are often no faster than just regular use of memory, due to changes in the GPU memory hierarchy over the past decade.

This book http://www.mat.unimi.it/users/sansotte/cuda/CUDA_by_Example.pdf
Chapter 7.1

Yes, I was using debug builds. I am stupid.
Today I used release builds and got 12.9 ms for the first code and 14.7 ms for the second code.
I am sorry, I am a beginner in CUDA technology.
Thanks a lot for your great help!