Hi, I’m writing simple raytracer in cuda (just 2 spheres and one light), as base I used simpleTextrue3d project from cuda samples and divided to cpp file and cu file. I’m not using any specific cuda stuff (no thread synchronization, shared memory), just copy some (small) amout of data, calculate scene, draw texture, code it’s so simple that with minor modifications (change datatype names, reading from input data, it’s the same code) i’ve ported it to GLSL and Cg and it runs much faster (about 5 times, 270fps vs 47fps) in Cg and GLSL than in cuda, so i’m wondering why, what can be wrong? My system is GF8800GT (G92), E8200, 4GB ram, Linux Debian 64bit (testing), cuda 2.1, best performance with cuda i’m getting launching kernel with dim3 block(112, 1, 1) (and i understand why) but it still is slow.
Edit, maybe i should add some code here:
cpp
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <GL/glew.h>
#include <GL/glut.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <cutil_gl_error.h>
#include <cuda_gl_interop.h>
#include <vector_types.h>
clock_t start, finish;
double frames;
extern "C" void launch_kernel(float4* out, unsigned int width, unsigned int height, float *in);
const unsigned int width = 512, height = 512;
const unsigned int N = 24;
float input[N];
float *d_input;
float angle = 0;
unsigned int fps = 0;
GLuint pbo; // OpenGL pixel buffer object
// render image using CUDA
void render()
{
// map PBO to get CUDA device pointer
float4 *output;
angle += 0.001f;
input[0] = cos(angle) * 4.0 + input[8]; input[2] = sin(angle) * 4.0 + input[10];
cutilSafeCall(cudaGLMapBufferObject((void**)&output, pbo));
cutilSafeCall(cudaMemcpy(d_input, input, sizeof(float)*N, cudaMemcpyHostToDevice));
// call CUDA kernel, writing results to PBO
launch_kernel(output, width, height, d_input);
cutilCheckMsg("kernel failed");
cutilSafeCall(cudaGLUnmapBufferObject(pbo));
}
// display results using OpenGL (called by GLUT)
void display()
{
render();
// display results
glClear(GL_COLOR_BUFFER_BIT);
// draw image from PBO
glDisable(GL_DEPTH_TEST);
glRasterPos2i(0, 0);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
glDrawPixels(width, height, GL_RGBA, GL_FLOAT, 0);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
glutSwapBuffers();
glutReportErrors();
frames += 1;
finish = clock();
float duration = (double) (finish - start)/CLOCKS_PER_SEC;
if (duration > 5)
{
start = finish;
printf("%f frames in %f seconds, %f FPS\n", frames, duration, frames/duration);
duration = 0;
frames = 0;
}
}
void idle()
{
glutPostRedisplay();
}
void keyboard(unsigned char key, int x, int y)
{
switch(key) {
case 27:
exit(0);
break;
default:
break;
}
glutPostRedisplay();
}
void reshape(int x, int y)
{
glViewport(0, 0, x, y);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
}
void cleanup()
{
cutilSafeCall(cudaGLUnregisterBufferObject(pbo));
glDeleteBuffersARB(1, &pbo);
}
void initOpenGL()
{
// create pixel buffer object
glGenBuffersARB(1, &pbo);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_ARB, width*height*sizeof(GLfloat)*4, 0, GL_STREAM_DRAW_ARB);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
cutilSafeCall(cudaGLRegisterBufferObject(pbo));
cutilSafeCall(cudaMalloc((void **) &d_input, N*sizeof(float)));
input[0] = 0.0f; // light position xyz
input[1] = 0.0f;
input[2] = 0.0f;
input[3] = 0.0f; // unused
input[4] = 1.0f; // light color rgba
input[5] = 1.0f;
input[6] = 1.0f;
input[7] = 1.0f;
input[8] = 0.0f; // sphere position xyz
input[9] = 0.0f;
input[10] = -4.0f;
input[11] = 0.5f; // sphere radius
input[12] = 1.0f; // sphere color rgba
input[13] = 0.5f;
input[14] = 0.5f;
input[15] = 1.0f;
input[16] = 0.0f; // sphere2 position xyz
input[17] = 1.1f;
input[18] = -4.0f;
input[19] = 0.5f; // sphere2 radius
input[20] = 0.0f; // sphere2 color rgba
input[21] = 1.0f;
input[22] = 0.0f;
input[23] = 1.0f;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
{
cutilDeviceInit(argc, argv);
}
else
{
cudaSetDevice( cutGetMaxGflopsDeviceId() );
}
// initialize GLUT callback functions
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
glutInitWindowSize(width, height);
glutCreateWindow("cuda RT");
glutDisplayFunc(display);
glutKeyboardFunc(keyboard);
glutReshapeFunc(reshape);
glutIdleFunc(idle);
glewInit();
if (!glewIsSupported("GL_VERSION_2_0 GL_ARB_pixel_buffer_object")) {
fprintf(stderr, "Required OpenGL extensions missing.");
cudaThreadExit();
exit(-1);
}
initOpenGL();
atexit(cleanup);
glutMainLoop();
cudaThreadExit();
return 0;
}
cu
#ifndef _CUDART_KERNEL_H_
#define _CUDART_KERNEL_H_
#include <cutil_math.h>
#define DEB 0
#if DEB
#include <stdio.h>
#endif
#define RT_DEPTH 2
#define AMBIENT_LIGHT 0.5
__device__ float3
raySphereIntersect(const float3 r0, const float3 rd, const float4 sphere)
{
float3 rv;
float a = rd.x * rd.x + rd.y * rd.y + rd.z * rd.z;
float b = 2 * (rd.x * (r0.x - sphere.x) + rd.y * (r0.y - sphere.y) + rd.z * (r0.z - sphere.z));
float c = (r0.x - sphere.x) * (r0.x - sphere.x) + (r0.y - sphere.y) * (r0.y - sphere.y) +
(r0.z - sphere.z) * (r0.z - sphere.z) - (sphere.w * sphere.w);
rv.x = b * b - 4 * a * c;
if (rv.x > 0.0)
rv.y = (-b - sqrt(rv.x)) * (1 / a) * 0.5;
else
rv.y = -1.0;
return rv;
}
__device__ float4
scaleFloat4(float4 f, float n)
{
return make_float4(f.x * n, f.y * n, f.z * n, f.w * n);
}
__device__ float4
multiplyFloat4(float4 a, float4 b)
{
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
}
__device__ float
dotProd(float3 a, float3 b)
{
return a.x * b.x + a.y * b.y + a.z * b.z;
}
__device__ float3
normalizeFloat3(float3 v)
{
float d = 1 / sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
return make_float3(v.x * d, v.y * d, v.z * d);
}
#if DEB
#define debX 256
#define debY 303
#endif
__global__ void
kernel(float4 *output, unsigned int width, unsigned int height, float* in)
{
uint ix = blockIdx.x * blockDim.x + threadIdx.x;
uint iy = blockIdx.y * blockDim.y + threadIdx.y;
float3 r0 = make_float3(0.0, 0.0, 0.0);
float3 rd = make_float3(1.0f * ix / width - 0.5, 1.0f * iy / height - 0.5, -1.0);
float4 color = make_float4(0.0, 0.0, 0.0, 0.0);
float4 sphere1 = make_float4(in[8], in[9], in[10], in[11]);
float4 sphere2 = make_float4(in[16], in[17], in[18], in[19]);
float3 light = make_float3(in[0], in[1], in[2]);
float4 sphere1Color = make_float4(in[12], in[13], in[14], in[15]);
float4 sphere2Color = make_float4(in[20], in[21], in[22], in[23]);
float4 lightColor = make_float4(in[4], in[5], in[6], in[7]);
float4 ambientColor = make_float4(0.1, 0.1, 0.1, 1.0);
float3 s1i, s2i;
int i = 0;
if ((ix < width) && (iy < height))
{
while (i < RT_DEPTH)
{
s1i = raySphereIntersect(r0, rd, sphere1);
s2i = raySphereIntersect(r0, rd, sphere2);
if ((s1i.y > 0.0) && ((s2i.y <= 0.0) || (s2i.x > 0.0 && s1i.y <= s2i.y)))
{
float3 nr0 = make_float3(r0.x + s1i.y * rd.x, r0.y + s1i.y * rd.y, r0.z + s1i.y * rd.z);
float3 nrd = make_float3(light.x - nr0.x, light.y - nr0.y ,light.z - nr0.z);
float3 nsi = raySphereIntersect(nr0, nrd, sphere1);
if (nsi.x <= 0.0 || nsi.y < -0.0001)
color += multiplyFloat4(sphere1Color, lightColor);
else
color += multiplyFloat4(sphere1Color, ambientColor);
float3 normal = make_float3(nr0.x - sphere1.x, nr0.y - sphere1.y, nr0.z - sphere1.z);
normal = normalizeFloat3(normal);
float d = dotProd(normal, make_float3(-rd.x, -rd.y, -rd.z));
float3 reflected = make_float3(2 * normal.x * d + rd.x,
2 * normal.y * d + rd.y, 2 * normal.z * d + rd.z);
r0.x = nr0.x + reflected.x * 0.001; r0.y = nr0.y + reflected.y * 0.001; r0.z = nr0.z + reflected.z * 0.001;
rd.x = reflected.x; rd.y = reflected.y; rd.z = reflected.z;
}
else
if ((s2i.y > 0.0) && ((s1i.y <= 0.0) || (s1i.x > 0.0 && s2i.y <= s1i.y)))
{
float3 nr0 = make_float3(r0.x + s2i.y * rd.x, r0.y + s2i.y * rd.y, r0.z + s2i.y * rd.z);
float3 nrd = make_float3(light.x - nr0.x, light.y - nr0.y ,light.z - nr0.z);
float3 nsi = raySphereIntersect(nr0, nrd, sphere2);
if (nsi.x <= 0.0 || nsi.y < -0.0001)
color += multiplyFloat4(sphere2Color, lightColor);
else
color += multiplyFloat4(sphere2Color, ambientColor);
float3 normal = make_float3(nr0.x - sphere2.x, nr0.y - sphere2.y, nr0.z - sphere2.z);
normal = normalizeFloat3(normal);
float d = dotProd(normal, make_float3(-rd.x, -rd.y, -rd.z));
float3 reflected = make_float3(2 * normal.x * d + rd.x,
2 * normal.y * d + rd.y, 2 * normal.z * d + rd.z);
r0.x = nr0.x + reflected.x * 0.001; r0.y = nr0.y + reflected.y * 0.001; r0.z = nr0.z + reflected.z * 0.001;
rd.x = reflected.x; rd.y = reflected.y; rd.z = reflected.z;
}
else
break;
i++;
}
output[iy * width + ix] = color;
}
}
// Wrapper for the __global__ call that sets up the kernel call
extern "C" void launch_kernel(float4* out, unsigned int width, unsigned int height, float* in)
{
// execute the kernel
dim3 block(14, 8, 1);
dim3 grid(width /block.x,height/block.y, 1);
kernel<<< grid, block>>>(out, width, height, in);
}
#endif // #ifndef _CUDART_KERNEL_H_
for building put it in simpletexture3d folder and change a bit makefile (add cpp file), i deleted some debug stuff, bu it should working