low performance

Hi, I’m writing simple raytracer in cuda (just 2 spheres and one light), as base I used simpleTextrue3d project from cuda samples and divided to cpp file and cu file. I’m not using any specific cuda stuff (no thread synchronization, shared memory), just copy some (small) amout of data, calculate scene, draw texture, code it’s so simple that with minor modifications (change datatype names, reading from input data, it’s the same code) i’ve ported it to GLSL and Cg and it runs much faster (about 5 times, 270fps vs 47fps) in Cg and GLSL than in cuda, so i’m wondering why, what can be wrong? My system is GF8800GT (G92), E8200, 4GB ram, Linux Debian 64bit (testing), cuda 2.1, best performance with cuda i’m getting launching kernel with dim3 block(112, 1, 1) (and i understand why) but it still is slow.

Edit, maybe i should add some code here:

cpp

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <time.h>

#include <GL/glew.h>

#include <GL/glut.h>

#include <cuda_runtime.h>

#include <cutil_inline.h>

#include <cutil_gl_error.h>

#include <cuda_gl_interop.h>

#include <vector_types.h>

clock_t start, finish;

double frames;

extern "C" void launch_kernel(float4* out, unsigned int width, unsigned int height, float *in);

const unsigned int width = 512, height = 512;

const unsigned int N = 24;

float input[N];

float *d_input;

float angle = 0;

unsigned int fps = 0;

GLuint pbo;	 // OpenGL pixel buffer object

// render image using CUDA

void render()

{

  // map PBO to get CUDA device pointer

  float4 *output;

  angle += 0.001f;

  input[0] = cos(angle) * 4.0 + input[8]; input[2]  = sin(angle) * 4.0 + input[10];

  cutilSafeCall(cudaGLMapBufferObject((void**)&output, pbo));

  cutilSafeCall(cudaMemcpy(d_input, input, sizeof(float)*N, cudaMemcpyHostToDevice));

// call CUDA kernel, writing results to PBO

  launch_kernel(output, width, height, d_input);

  cutilCheckMsg("kernel failed");

cutilSafeCall(cudaGLUnmapBufferObject(pbo));

}

// display results using OpenGL (called by GLUT)

void display()

{

  render();

// display results

  glClear(GL_COLOR_BUFFER_BIT);

// draw image from PBO

  glDisable(GL_DEPTH_TEST);

  glRasterPos2i(0, 0);

  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);

  glDrawPixels(width, height, GL_RGBA, GL_FLOAT, 0);

  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

glutSwapBuffers();

  glutReportErrors();

	frames += 1;

	finish = clock();

	float duration = (double) (finish - start)/CLOCKS_PER_SEC;

	if (duration > 5)

	  {

	start = finish;

	printf("%f frames in %f seconds, %f FPS\n", frames, duration, frames/duration);

	duration = 0;

	frames = 0;

	  }

}

void idle()

{

  glutPostRedisplay();

}

void keyboard(unsigned char key, int x, int y)

{

  switch(key) {

  case 27:

	exit(0);

	break;

  default:

	break;

  }

  glutPostRedisplay();

}

void reshape(int x, int y)

{

  glViewport(0, 0, x, y);

glMatrixMode(GL_MODELVIEW);

  glLoadIdentity();

glMatrixMode(GL_PROJECTION);

  glLoadIdentity();

  glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); 

}

void cleanup()

{

  cutilSafeCall(cudaGLUnregisterBufferObject(pbo));	

  glDeleteBuffersARB(1, &pbo);

}

void initOpenGL()

{

  // create pixel buffer object

  glGenBuffersARB(1, &pbo);

  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);

  glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(GLfloat)*4, 0, GL_STREAM_DRAW_ARB);

  glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

cutilSafeCall(cudaGLRegisterBufferObject(pbo));

cutilSafeCall(cudaMalloc((void **) &d_input, N*sizeof(float)));

  input[0] = 0.0f;	// light position xyz

  input[1] = 0.0f;

  input[2] = 0.0f;

  input[3] = 0.0f;	// unused

  input[4] = 1.0f;	// light color rgba

  input[5] = 1.0f;

  input[6] = 1.0f;

  input[7] = 1.0f;

  input[8] = 0.0f;	// sphere position xyz

  input[9] = 0.0f;

  input[10] = -4.0f;

  input[11] = 0.5f;   // sphere radius

  input[12] = 1.0f;   // sphere color rgba

  input[13] = 0.5f;

  input[14] = 0.5f;

  input[15] = 1.0f;

  input[16] = 0.0f;	// sphere2 position xyz

  input[17] = 1.1f;

  input[18] = -4.0f;

  input[19] = 0.5f;   // sphere2 radius

  input[20] = 0.0f;   // sphere2 color rgba

  input[21] = 1.0f;

  input[22] = 0.0f;

  input[23] = 1.0f;

}

////////////////////////////////////////////////////////////////////////////////

// Program main

////////////////////////////////////////////////////////////////////////////////

int

main( int argc, char** argv) 

{

  // use command-line specified CUDA device, otherwise use device with highest Gflops/s

  if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

	{

	  cutilDeviceInit(argc, argv);

	}

  else

	{

	  cudaSetDevice( cutGetMaxGflopsDeviceId() );

	}

// initialize GLUT callback functions

  glutInit(&argc, argv);

  glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);

  glutInitWindowSize(width, height);

  glutCreateWindow("cuda RT");

  glutDisplayFunc(display);

  glutKeyboardFunc(keyboard);

  glutReshapeFunc(reshape);

  glutIdleFunc(idle);

glewInit();

  if (!glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object")) {

	fprintf(stderr, "Required OpenGL extensions missing.");

	cudaThreadExit();

	exit(-1);

  }

  initOpenGL();

atexit(cleanup);

glutMainLoop();

cudaThreadExit();

  return 0;

}

cu

#ifndef _CUDART_KERNEL_H_

#define _CUDART_KERNEL_H_

#include <cutil_math.h>

#define DEB 0

#if DEB

#include <stdio.h>

#endif

#define RT_DEPTH 2

#define AMBIENT_LIGHT 0.5

__device__ float3

raySphereIntersect(const float3 r0, const float3 rd, const float4 sphere)

{

  float3 rv;

  float a = rd.x * rd.x + rd.y * rd.y + rd.z * rd.z;

  float b = 2 * (rd.x * (r0.x - sphere.x) + rd.y * (r0.y - sphere.y) + rd.z * (r0.z - sphere.z));

  float c = (r0.x - sphere.x) * (r0.x - sphere.x) + (r0.y - sphere.y) * (r0.y - sphere.y) + 

		  (r0.z - sphere.z) * (r0.z - sphere.z) - (sphere.w * sphere.w);

  rv.x = b * b - 4 * a * c;

  if (rv.x > 0.0)

	rv.y = (-b - sqrt(rv.x)) * (1 / a) * 0.5;

  else

	rv.y = -1.0;

  return rv;

}

__device__ float4

scaleFloat4(float4 f, float n)

{

  return make_float4(f.x * n, f.y * n, f.z * n, f.w * n);

}

__device__ float4

multiplyFloat4(float4 a, float4 b)

{

  return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);

}

__device__ float

dotProd(float3 a, float3 b)

{

  return a.x * b.x + a.y * b.y + a.z * b.z;

}

__device__ float3

normalizeFloat3(float3 v)

{

  float d = 1 / sqrt(v.x * v.x + v.y * v.y + v.z * v.z);

  return make_float3(v.x * d, v.y * d, v.z * d);

}

#if DEB

#define debX 256

#define debY 303

#endif

__global__ void

kernel(float4 *output, unsigned int width, unsigned int height, float* in)

{

	uint ix = blockIdx.x * blockDim.x + threadIdx.x;

	uint iy = blockIdx.y * blockDim.y + threadIdx.y;

	

	float3 r0 = make_float3(0.0, 0.0, 0.0);

	float3 rd = make_float3(1.0f * ix / width - 0.5, 1.0f * iy / height - 0.5, -1.0);

	float4 color = make_float4(0.0, 0.0, 0.0, 0.0);

	float4 sphere1 = make_float4(in[8], in[9], in[10], in[11]);

	float4 sphere2 = make_float4(in[16], in[17], in[18], in[19]);

	float3 light = make_float3(in[0], in[1], in[2]);

	float4 sphere1Color = make_float4(in[12], in[13], in[14], in[15]);

	float4 sphere2Color = make_float4(in[20], in[21], in[22], in[23]);

	float4 lightColor = make_float4(in[4], in[5], in[6], in[7]);

	float4 ambientColor = make_float4(0.1, 0.1, 0.1, 1.0);

	float3 s1i, s2i;

	int i = 0;

	if ((ix < width) && (iy < height))

	{

	while (i < RT_DEPTH)

	{

		s1i = raySphereIntersect(r0, rd, sphere1);

		s2i = raySphereIntersect(r0, rd, sphere2);

		if ((s1i.y > 0.0) && ((s2i.y <= 0.0) || (s2i.x > 0.0 && s1i.y <= s2i.y)))

		{

		float3 nr0 = make_float3(r0.x + s1i.y * rd.x, r0.y + s1i.y * rd.y, r0.z + s1i.y * rd.z);

		float3 nrd = make_float3(light.x - nr0.x, light.y - nr0.y ,light.z - nr0.z);

		float3 nsi = raySphereIntersect(nr0, nrd, sphere1);

		if (nsi.x <= 0.0 || nsi.y < -0.0001)

			color += multiplyFloat4(sphere1Color, lightColor);

			else

			color += multiplyFloat4(sphere1Color, ambientColor);

		float3 normal = make_float3(nr0.x - sphere1.x, nr0.y - sphere1.y, nr0.z - sphere1.z);

		normal = normalizeFloat3(normal);

		float d = dotProd(normal, make_float3(-rd.x, -rd.y, -rd.z));

		float3 reflected = make_float3(2 * normal.x * d + rd.x, 

			   		   		   2 * normal.y * d + rd.y, 2 * normal.z * d + rd.z);

		r0.x = nr0.x + reflected.x * 0.001; r0.y = nr0.y + reflected.y * 0.001; r0.z = nr0.z + reflected.z * 0.001;

		rd.x = reflected.x; rd.y = reflected.y; rd.z = reflected.z; 

		}

		else

		if ((s2i.y > 0.0) && ((s1i.y <= 0.0) || (s1i.x > 0.0 && s2i.y <= s1i.y)))

		{

		float3 nr0 = make_float3(r0.x + s2i.y * rd.x, r0.y + s2i.y * rd.y, r0.z + s2i.y * rd.z);

		float3 nrd = make_float3(light.x - nr0.x, light.y - nr0.y ,light.z - nr0.z);

		float3 nsi = raySphereIntersect(nr0, nrd, sphere2);

		if (nsi.x <= 0.0 || nsi.y < -0.0001)

			color += multiplyFloat4(sphere2Color, lightColor);

			else

			color += multiplyFloat4(sphere2Color, ambientColor);

		float3 normal = make_float3(nr0.x - sphere2.x, nr0.y - sphere2.y, nr0.z - sphere2.z);

		normal = normalizeFloat3(normal);

		float d = dotProd(normal, make_float3(-rd.x, -rd.y, -rd.z));

		float3 reflected = make_float3(2 * normal.x * d + rd.x,

			   		   		   2 * normal.y * d + rd.y, 2 * normal.z * d + rd.z);

		r0.x = nr0.x + reflected.x * 0.001; r0.y = nr0.y + reflected.y * 0.001; r0.z = nr0.z + reflected.z * 0.001;

		rd.x = reflected.x; rd.y = reflected.y; rd.z = reflected.z; 

		}

		else

		break;

		i++;

	}

		output[iy * width + ix] = color;

	}

}

// Wrapper for the __global__ call that sets up the kernel call

extern "C" void launch_kernel(float4* out, unsigned int width, unsigned int height, float* in)

{

	// execute the kernel

	dim3 block(14, 8, 1);

	dim3 grid(width /block.x,height/block.y, 1);

	kernel<<< grid, block>>>(out, width, height, in);

}

#endif // #ifndef _CUDART_KERNEL_H_

for building put it in simpletexture3d folder and change a bit makefile (add cpp file), i deleted some debug stuff, bu it should working

I haven’t looked at your code in detail, but reading the sphere data from global memory is probably not the best way of doing this, especially since your reads are not coalesced.

Try putting the scene data in CUDA constant memory instead - this is what your GLSL implementation is effectively doing if you are using “uniform” variables.

Also, a block size of 14 x 8 is not ideal because it’s not a multiple of 32.

I have some problems with using constant memory.

I changed .cu by adding:
constant float
rtData;
before all functions, and all in to rtData, deleted in parameter from kernel
in *.cpp i changed
cutilSafeCall(cudaMemcpy(d_input, input, sizeof(float)*N, cudaMemcpyHostToDevice));
to
cutilSafeCall(cudaMemcpyToSymbol(“rtData”, input, sizeof(float)*N));
and I’m getting “unspecified launch failure” when I run kernel

Constant memory is limited to 64KB and it could be not enought for bigger scenes.

Sadly, constant memory must have size specified at compile time. So you have to do something like

constant float rtData[1000];

Naturally, you are free to use less of this memory at runtime.

By the way, NVIDIA, it would be Quite Cool if the compiler warned us about this error. It already warns (actually errors-out) when one overflows 64K, ie. constant float a[16001]; won’t compile, but constant float* a; will, and will crap out at runtime with a meaningless message.

Thanks Big_Mac, but it still works about 2 times slower than GLSL/Cg app (148fps vs 270fps).

You could try changing

r0.x = nr0.x + reflected.x * 0.001; r0.y = nr0.y + reflected.y * 0.001; r0.z = nr0.z + reflected.z * 0.001;
    rd.x = reflected.x; rd.y = reflected.y; rd.z = reflected.z;

to
r0.x = nr0.x + reflected.x * 0.001f; r0.y = nr0.y + reflected.y * 0.001f; r0.z = nr0.z + reflected.z * 0.001f;
rd.x = reflected.x; rd.y = reflected.y; rd.z = reflected.z;

i don’t know how it works on your card, but on gt200 it uses doubles for calculations in first case, which are very slow.
Try using fast math (for sqrt), shaders probably use it too. and integer multiplications could be changed to __mul24.

also
output[iy * width + ix] = color;

could work better when blockDim.x would be a multiple of 8, so it would be aligned to 128B, it will result in better coalescing.

i’m writing raytracer too;) and i use 32x4 block size, which i use as 4 screen blocks with 8x4 pixels and it works nice;)

Thanks for tips, I aplied your changes (i just removed all reflected.x * 0.001), all this imprvments give me 10 extra fps, it’s still slower than Cg/GLSL.