So I finally got time to work on this again. Thanks for the replies. I tried using other kinds of memory but didn’t get to make they work, except for shared memory, which gave some speed improvement but it is still slow.
Basically, what it does is to create a pixel buffer object (PBO), call cudaGLMapBufferObject() to map the buffer to cuda and then call the kernel (which writes on the pixel buffer), then unmap it and call glTexSubImage2D() to copy the data from the pbo to a texture which is then applied on a quad that fills the screen.
While debugging I noticed that when I step over the call to cudaGLUnmapBufferObject(pbo) it takes some time to go to next line, showing that the problem is possibly there (??). But my code is just like other samples which does something like what I’m doing, and they run fast…
This time Im gonna post more of the (updated) code. There are the main data structures I use:
//camera struct to be used inside kernel
struct GPUCamera
{
float viewMatrix[16];
float nearPlane, farPlane;
float fov;
};
//camera struct to be used in host code
class Camera
{
public:
float3 position;
float4 rotation;
float nearPlane, farPlane;
float fov;
GPUCamera GetGPUCamera()
{
GPUCamera gpuCam;
//build viewMatrix
gpuCam.nearPlane = nearPlane;
gpuCam.farPlane = farPlane;
gpuCam.fov = fov;
return gpuCam;
}
};
//structures to store data about an implicit surface
struct Term3
{
float coeff;
int x, y, z;
};
struct __align__(16) Polynomial3
{
Term3 terms[16];
int nTerms;
};
struct __align__(16) Surface
{
Polynomial3 surface;
Polynomial3 gradient[3];//one polynomial for each coordinate
float3 position;
float4 rotation;
float3 scale;
float3 aabbMin;
float3 aabbMax;
};
PBO and texture creation
glGenBuffers(1, &pbo);
glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo);
glBufferData(GL_PIXEL_PACK_BUFFER, width*height*sizeof(GLubyte)*4, NULL, GL_DYNAMIC_DRAW);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
cutilSafeCall(cudaGLRegisterBufferObject(pbo));
glGenTextures(1, &tex);
glBindTexture(GL_TEXTURE_2D, tex);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glBindTexture(GL_TEXTURE_2D, 0);
This creates a surface and then allocates mem in the device for it, and copy it to there
__device__ __constant__ Surface* d_Surface;
void CreateSurface()
{
//create a sphere
Surface h_s;
h_s.surface.nTerms = 4;
h_s.gradient[0].nTerms = 1;
h_s.gradient[1].nTerms = 1;
h_s.gradient[2].nTerms = 1;
//setup polynomial coefficients and exponents
Term3* t = h_s.surface.terms;
t[0].coeff = 1.f; t[0].x = 2; t[0].y = 0; t[0].z = 0;
t[1].coeff = 1.f; t[1].x = 0; t[1].y = 2; t[1].z = 0;
t[2].coeff = 1.f; t[2].x = 0; t[2].y = 0; t[2].z = 2;
t[3].coeff = -4.f; t[3].x = 0; t[3].y = 0; t[3].z = 0;
//setup gradient
t = h_s.gradient[0].terms;
t[0].coeff = 2.f; t[0].x = 1; t[0].y = 0; t[0].z = 0;
t = h_s.gradient[1].terms;
t[0].coeff = 2.f; t[0].x = 0; t[0].y = 1; t[0].z = 0;
t = h_s.gradient[2].terms;
t[0].coeff = 2.f; t[0].x = 0; t[0].y = 0; t[0].z = 1;
//transfer to device
cutilSafeCall(cudaMalloc((void**)&d_Surface, sizeof(Surface)));
cutilSafeCall(cudaMemcpy(d_Surface, &h_s, sizeof(Surface), cudaMemcpyHostToDevice));
}
This is the draw code
void Display()
{
unsigned int* buffer = NULL;
cutilSafeCall(cudaGLMapBufferObject((void**)&buffer, pbo));
GPUCamera gpuCam = {{1.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f,-5.f,0.f,0.f,0.f,1.f}, 1.f, 100.f, 1.5f};
ImplicitRT(dim3(16, 16), buffer, width, height, d_Surface, 1, gpuCam);
cutilSafeCall(cudaGLUnmapBufferObject(pbo));
// load texture from pbo
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
glBindTexture(GL_TEXTURE_2D, tex);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
glClear(GL_COLOR_BUFFER_BIT);
glEnable(GL_TEXTURE_2D);
glDisable(GL_DEPTH_TEST);
glBegin(GL_QUADS);
glTexCoord2f(0, 1); glVertex2f(0.f, 0.f);
glTexCoord2f(1, 1); glVertex2f(1.f, 0.f);
glTexCoord2f(1, 0); glVertex2f(1.f, 1.f);
glTexCoord2f(0, 0); glVertex2f(0.f, 1.f);
glEnd();
glBindTexture(GL_TEXTURE_2D, 0);
glutSwapBuffers();
}
And this is the kernel
#define NUM_STEPS 50
__global__ void ImplicitRTKernel(uint* pixels, int width, int height,
Surface* surface, int nSurfaces, GPUCamera gpuCam)
{
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
float fx = ((x + 0.5f)/width - 0.5f)*2.f;//fire ray from center of pixel
float fy = ((y + 0.5f)/height - 0.5f)*2.f;
float aspectX = width > height? 1.f: (float)width/height;
float aspectY = width < height? 1.f: (float)height/width;
float tanfov2 = tanf(gpuCam.fov/2.f);
float3 p0;
p0.x = fx*gpuCam.nearPlane*tanfov2*aspectX;
p0.y = fy*gpuCam.nearPlane*tanfov2*aspectY;
p0.z = gpuCam.nearPlane;
float3 p1;
p1.x = fx*gpuCam.farPlane*tanfov2*aspectX;
p1.y = fy*gpuCam.farPlane*tanfov2*aspectY;
p1.z = gpuCam.farPlane;
//transform p0 and p1 by view matrix (from view-space to world-space)
float* view = gpuCam.viewMatrix;
float3 vp0;
vp0.x = view[0]*p0.x + view[1]*p0.y + view[2]*p0.z + view[3];
vp0.y = view[4]*p0.x + view[5]*p0.y + view[6]*p0.z + view[7];
vp0.z = view[8]*p0.x + view[9]*p0.y + view[10]*p0.z + view[11];
float3 vp1;
vp1.x = view[0]*p1.x + view[1]*p1.y + view[2]*p1.z + view[3];
vp1.y = view[4]*p1.x + view[5]*p1.y + view[6]*p1.z + view[7];
vp1.z = view[8]*p1.x + view[9]*p1.y + view[10]*p1.z + view[11];
float rayParam = 1.f;//ray parameter at intersection
int hitSurfaceIndex = -1;//index of the surface that was first hit by the ray
//compute intersection between the segment (vp0, vp1) and each surface, and get the closest one
for(int i=0; i<nSurfaces; ++i)
{
__shared__ Surface s;//this gave some speed up
s = surface[i];
//transform vp0 and vp1 from world-space to object-space
float3 op0 = vp0;
float3 op1 = vp1;
float3 dir = op1 - op0;
#pragma unroll
for(int j=0; j<NUM_STEPS-1; ++j)
{
float rj = (float)j/NUM_STEPS;//ray parameter at j-th iteration
float3 sp0 = op0 + dir*rj;//points at step j
float3 sp1 = op0 + dir*(j+1)/(float)NUM_STEPS;
float psp0 = 0.f;
float psp1 = 0.f;
//compute the value of the surface polynomial at sp0 and sp1
for(int k=0; k<s.surface.nTerms; ++k)
{
//Term3 t = s.surface.terms[k];
float3 xyz0;
xyz0.x = xyz0.y = xyz0.z = 1.f;
float3 xyz1 = xyz0;
for(int l=0; l<s.surface.terms[k].x; ++l)
{
xyz0.x *= sp0.x;
xyz1.x *= sp1.x;
}
for(int l=0; l<s.surface.terms[k].y; ++l)
{
xyz0.y *= sp0.y;
xyz1.y *= sp1.y;
}
for(int l=0; l<s.surface.terms[k].z; ++l)
{
xyz0.z *= sp0.z;
xyz1.z *= sp1.z;
}
//__powf only works in the [0,1] interval..
/*xyz0.x = __powf(sp0.x, t.x);
xyz0.y = __powf(sp0.y, t.y);
xyz0.z = __powf(sp0.z, t.z);
xyz1.x = __powf(sp1.x, t.x);
xyz1.y = __powf(sp1.y, t.y);
xyz1.z = __powf(sp1.z, t.z);*/
psp0 += s.surface.terms[k].coeff * xyz0.x * xyz0.y * xyz0.z;
psp1 += s.surface.terms[k].coeff * xyz1.x * xyz1.y * xyz1.z;
}
if(psp0 * psp1 < 0.f)//root interval found
{
rayParam = rj;
hitSurfaceIndex = i;
break;
}
}
}
if(hitSurfaceIndex >= 0)
{
//perform shading using surface[hitSurfaceIndex] material
Surface* s = surface + hitSurfaceIndex;
float3 p = (1-rayParam)*vp0 + rayParam*vp1;
float n[3];
for(int i=0; i<3; ++i)
{
Polynomial3* g = s->gradient + i;
n[i] = 1.f;
for(int j=0; j<g->nTerms; ++j)
{
Term3* t = g->terms + j;
float3 f;
f.x = f.y = f.z = 1.f;
for(int l=0; l<t->x; ++l)
f.x *= p.x;
for(int l=0; l<t->y; ++l)
f.y *= p.y;
for(int l=0; l<t->z; ++l)
f.z *= p.z;
n[i] = t->coeff*f.x*f.y*f.z;
}
}
float3 v;
v.x = n[0]; v.y = n[1]; v.z = -n[2];
v = normalize(v);
pixels[y*width + x] = ((uint)((v.x*0.5f+0.5f)*255)) | (((uint)((v.y*0.5f+0.5f)*255))<<8) | (((uint)((v.z*0.5f+0.5f)*255))<<16);//0xffffffff;
}
else
{
pixels[y*width + x] = 0;
}
}
Sorry for so much stuff…thank you.