/******************************************************************** * RayTraceCUDA.cu * This program ported to CUDA by noridon(NEW/Bio_100%). *********************************************************************/ #include #include #include #include #include ///////////////////////////////////////////////////////////////////// // Please enable the below if you want to use only host. //#define _CPU_ #ifdef _CPU_ #define __CONSTANT__ #define __LOCAL_CALL__ #define __GLOBAL_CALL__ #else #define __CONSTANT__ __constant__ #define __LOCAL_CALL__ __device__ #define __GLOBAL_CALL__ __global__ #endif #define SCREEN_SX (1024) #define SCREEN_SY (768) const double ZOOM = 600; const float DATA1[] = { 20,40,20, 0,0,0, -8,9,-3, ZOOM * SCREEN_SY / 212 }; const int NUMOBJ = 6; struct tagOBJS { float d[6]; int c[2]; }; __CONSTANT__ const tagOBJS objs[] = { 2,2,2, .2,.2,.2, 1,2, -2,2,2, .2,.2,.2, 1,3, -6,4,2, .2,.2,.2, 1,4, -2,2,-2, .2,.2,.2, 1,5, -6,4,-6, .2,.2,.2, 1,6, 0,-2,0, 20,1,20, 0,-1 }; struct tagSHDS { float d[7]; int c[1]; }; __CONSTANT__ const tagSHDS shds[] = { .9,.9,.9, .5,.4,.6, .7,6, .0,.9,.0, .5,.4,.6, .7,6, .9,.0,.0, .3,.6,.0, .0,0, .9,.9,.9, .3,.6,.0, .6,8, .0,.0,.9, .3,.6,.0, .6,8, .9,.9,.9, .3,.6,.0, .6,6, .0,.0,.0, .3,.6,1, .9,8 }; typedef struct _tagscrn{ int PX; int PY; }SCRN; void trace(SCRN scrn); void display(); void reshape(int w, int h); void resize(int w, int h); void timer(int value); /************************************************************************/ /* Init CUDA */ /************************************************************************/ bool InitCUDA(void) { int count = 0; int i = 0; cudaGetDeviceCount(&count); if(count == 0) { fprintf(stderr, "There is no device.\n"); return false; } for(i = 0; i < count; i++) { cudaDeviceProp prop; if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) { if(prop.major >= 1) { break; } } } if(i == count) { fprintf(stderr, "There is no device supporting CUDA 1.x.\n"); return false; } cudaSetDevice(i); return true; } /************[OpenGLの初期設定]********************/ inline void GLUT_INIT() { glutInitDisplayMode(GLUT_RGBA| GLUT_DOUBLE | GLUT_DEPTH); glutInitWindowSize(SCREEN_SX, SCREEN_SY); } inline void GLUT_CALL_FUNC() { glutDisplayFunc(display); glutReshapeFunc(resize); // glutTimerFunc(1,timer,0); } inline void MY_INIT() { // glClearColor(1.0, 1.0, 1.0, 1.0); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); } /********[ここからコールバック]****************************************/ double m_ctime; void display() { glLoadIdentity(); // gluLookAt(0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0); SCRN scrn; scrn.PX = SCREEN_SX; scrn.PY = SCREEN_SY; // DATA1[9] = ZOOM * scrn.PY / 212; clock_t ctime = clock(); trace(scrn); ctime = (double)(clock() - ctime) / ((double)CLOCKS_PER_SEC / 1000); m_ctime = ctime; glEnable(GL_DEPTH_TEST); glutSwapBuffers(); } void reshape(int w, int h) { glViewport(0, 0, w, h); glMatrixMode(GL_PROJECTION); glLoadIdentity(); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); } void resize(int w, int h) { /* ウィンドウ全体をビューポートにする */ glViewport(-w / 2, -h / 2, w * 2, h * 2); /* 変換行列の初期化 */ glLoadIdentity(); /* スクリーン上の表示領域をビューポートの大きさに比例させる */ // glOrtho(-w / 100.0, w / 100.0, -h / 100.0, h / 100.0, -1.0, 1.0); } void timer(int value) { glutPostRedisplay(); glutTimerFunc(1,timer,0); //タイマー関数 } ///////////////////////////////////////////////////////////////////// __CONSTANT__ const float ma=1000; __CONSTANT__ const float mi=1E-03; __CONSTANT__ const float pt = 4; __CONSTANT__ float VTbl[16]; __LOCAL_CALL__ _inline float3 cross(float3 vector, const int n, float* pT, float3 cvec); __LOCAL_CALL__ _inline float3 shade(float3* pVector, float3 color, const int n, float3* pLvec, int* pF, float2* pRfrn, float* pT, float3 cvec, float* V); __LOCAL_CALL__ _inline float3 box(float3 vector, float3 rvec, float3 vabc, float* pT); __LOCAL_CALL__ _inline float3 ball(float3 vector, float3 nvec, float3 rvec, float3 vabc, float* pT); __LOCAL_CALL__ _inline float3 pixel(float3 vector, float3 color, float3* pCvec, float* V); __GLOBAL_CALL__ void Render(float3* pLineBuffer, int sy); #define N_STREAM (1) void trace(SCRN scrn) { int i; int sx, sy; float v; float T[10]; float V[16]; for (i = 0; i < 10; ++i) T[i] = DATA1[i]; V[0]=T[0]; V[1]=T[1]; V[2]=T[2]; V[9]=T[0]-T[3]; V[10]=T[1]-T[4]; V[11]=T[2]-T[5]; v=sqrt(V[9]*V[9]+V[10]*V[10]+V[11]*V[11]); V[9]=V[9]/v; V[10]=V[10]/v; V[11]=V[11]/v; V[6]=-V[9]*V[10]; V[7]=1-V[10]*V[10]; V[8]=-V[11]*V[10]; V[3]=-(V[10]*V[8]-V[11]*V[7]); V[4]=-(V[11]*V[6]-V[9]*V[8]); V[5]=-(V[9]*V[7]-V[10]*V[6]); V[15]=T[9]; V[12]=T[6]; V[13]=T[7]; V[14]=T[8]; for (i = 1; i < 5; ++i) { v = sqrt(V[i*3+0]*V[i*3+0]+V[i*3+1]*V[i*3+1]+V[i*3+2]*V[i*3+2]); V[i*3+0]=V[i*3+0]/v;V[i*3+1]=V[i*3+1]/v;V[i*3+2]=V[i*3+2]/v; } // device memory alloc for line buffer int nBufByteSize, nBufFloat3Count; float3* pLineBuffer; cudaMalloc( reinterpret_cast(&pLineBuffer), (nBufByteSize = sizeof(float3) * (nBufFloat3Count = SCREEN_SX)) * N_STREAM); float3* pHostLine = new float3[nBufFloat3Count * N_STREAM]; #ifdef _CPU_ memcpy(VTbl, V, sizeof(V)); #else // Copy to constant array instead of global memory. cudaMemcpyToSymbol(VTbl, V, sizeof(V)); #endif // create streams cudaStream_t stream[N_STREAM]; for (i = 0; i < N_STREAM; ++i){ cudaStreamCreate(&stream[i]); } // begin tracing glBegin( GL_POINTS ); int PX = scrn.PX; int PY = scrn.PY; for (sy = 0; sy < PY; sy += N_STREAM) { #ifdef _CPU_ for(i = 0; i < N_STREAM; i++){ Render(pHostLine, sy + i); } #else dim3 block(8,1,1); dim3 threads(128,1,1); for(i = 0; i < N_STREAM; i++){ Render<<>>(pLineBuffer + nBufFloat3Count * i, sy + i); } // copy a result line buffer from VRAM to HOST cudaMemcpy( pHostLine, pLineBuffer, sizeof(float3) * SCREEN_SX, cudaMemcpyDeviceToHost ); // I want to copy by using streams asynchronously instead of above, but I could not. for(i = 0; i < N_STREAM; i++){ // cudaMemcpyAsync( pHostLine + nBufFloat3Count * i , pLineBuffer + nBufFloat3Count * i, nBufByteSize, cudaMemcpyDeviceToHost, stream[i] ); } // cudaThreadSynchronize(); #endif // drawing pixels for(i = 0; i < N_STREAM; i++){ for(sx = 0; sx < PX; sx++){ float3 color = *(pHostLine + nBufFloat3Count * i + sx); glColor3d(color.x, color.y, color.z); glVertex2d((float)sx / SCREEN_SX - 0.5, 0.5 - (float)sy / SCREEN_SY); } } } glEnd(); glFlush(); delete[] pHostLine; cudaFree(pLineBuffer); } __GLOBAL_CALL__ void Render(float3* pLineBuffer, int sy) { float* V = VTbl; const int PX = SCREEN_SX; const int PY = SCREEN_SY; float3 cvec; float3 vector; float v; #ifdef _CPU_ for(int sx = 0; sx < PX; sx++){ cvec.x = V[0]; cvec.y = V[1]; cvec.z = V[2]; vector.x=V[3]*(float)(sx-PX/2)+V[6]*(float)(PY/2-sy)-V[9]*V[15]; vector.y=V[4]*(float)(sx-PX/2)+V[7]*(float)(PY/2-sy)-V[10]*V[15]; vector.z=V[5]*(float)(sx-PX/2)+V[8]*(float)(PY/2-sy)-V[11]*V[15]; v=sqrt(vector.x*vector.x+vector.y*vector.y+vector.z*vector.z); vector.x=vector.x/v; vector.y=vector.y/v; vector.z=vector.z/v; float3 color = pixel(vector, make_float3(0,0,0), &cvec, V); if (color.x > 1.0) color.x = 1.0; if (color.y > 1.0) color.y = 1.0; if (color.z > 1.0) color.z = 1.0; pLineBuffer[sx] = color; } #else const int sx = blockIdx.x * 128 + threadIdx.x; // for threads cvec.x = V[0]; cvec.y = V[1]; cvec.z = V[2]; vector.x=V[3]*(float)(sx-PX/2)+V[6]*(float)(PY/2-sy)-V[9]*V[15]; vector.y=V[4]*(float)(sx-PX/2)+V[7]*(float)(PY/2-sy)-V[10]*V[15]; vector.z=V[5]*(float)(sx-PX/2)+V[8]*(float)(PY/2-sy)-V[11]*V[15]; v=rsqrtf(vector.x*vector.x+vector.y*vector.y+vector.z*vector.z); vector.x *= v; vector.y *= v; vector.z *= v; float3 color = pixel(vector, make_float3(0,0,0), &cvec, V); if (color.x > 1.0) color.x = 1.0; if (color.y > 1.0) color.y = 1.0; if (color.z > 1.0) color.z = 1.0; pLineBuffer[sx] = color; // __syncthreads(); // syncronize #endif } __LOCAL_CALL__ _inline float3 pixel(float3 vector, float3 color, float3* pCvec, float* V) { float2 rfrn = make_float2(1, 0); float3 lvec = make_float3(0,0,0); float3 cvec = *pCvec; float t = 0; int n, f; do { double tt = ma; int tn; for (n = 0; n < NUMOBJ; ++n) { float3 nvec = cross(vector, n, &t, cvec); if (tt > t && t > mi) { tt = t; tn = n; lvec.x = nvec.x; lvec.y = nvec.y; lvec.z = nvec.z; } } if (tt == ma){ *pCvec = cvec; return color; } cvec.x=cvec.x+tt*vector.x; cvec.y=cvec.y+tt*vector.y; cvec.z=cvec.z+tt*vector.z; n=tn; color = shade(&vector, color, n, &lvec, &f, &rfrn, &t, cvec, V); } while (f == 1); *pCvec = cvec; return color; } __LOCAL_CALL__ _inline float3 cross(float3 vector, const int n, float* pT, float3 cvec) { float3 nvec = make_float3(0, 0, 0); float3 rvec = make_float3(0, 0, 0); float3 vabc = make_float3(0, 0, 0); rvec.x =cvec.x-objs[n].d[0]; rvec.y=cvec.y-objs[n].d[1]; rvec.z=cvec.z-objs[n].d[2]; vabc.x=objs[n].d[3]; vabc.y=objs[n].d[4]; vabc.z=objs[n].d[5]; switch (objs[n].c[0]) { case 0: nvec = box(vector, rvec, vabc, pT); break; case 1: nvec = ball(vector, nvec, rvec, vabc, pT); break; default: break; } return nvec; } __LOCAL_CALL__ _inline float3 box(float3 vector, float3 rvec, float3 vabc, float* pT) { float3 nvec = make_float3(0,0,0); float t1, t2, t3; if (vector.x == 0.0) t1 = ma; else if (rvec.x < 0.0) t1=-(rvec.x+vabc.x)/vector.x; else t1=-(rvec.x-vabc.x)/vector.x; if (vector.y == 0.0) t2 = ma; else if (rvec.y < 0.0) t2=-(rvec.y+vabc.y)/vector.y; else t2=-(rvec.y-vabc.y)/vector.y; if (vector.z == 0.0) t3 = ma; else if (rvec.z < 0.0) t3=-(rvec.z+vabc.z)/vector.z; else t3=-(rvec.z-vabc.z)/vector.z; if (fabs(rvec.y+t1*vector.y)>vabc.y || fabs(rvec.z+t1*vector.z)>vabc.z) t1 = ma; if (fabs(rvec.z+t2*vector.z)>vabc.z || fabs(rvec.x+t2*vector.x)>vabc.z) t2 = ma; if (fabs(rvec.x+t3*vector.x)>vabc.x || fabs(rvec.y+t3*vector.y)>vabc.y) t3 = ma; if (t1 <= t2 && t1 <= t3) { *pT=t1; nvec.x=-vector.x/fabs(vector.x); nvec.y=0; nvec.z=0; } if (t2 <= t3 && t2 <= t1) { *pT=t2; nvec.y=-vector.y/fabs(vector.y); nvec.z=0; nvec.x=0; } if (t3 <= t1 && t3 <= t2) { *pT=t3; nvec.z=-vector.z/fabs(vector.z); nvec.x=0; nvec.y=0; } return nvec; } __LOCAL_CALL__ _inline float3 ball(float3 vector, float3 nvec, float3 rvec, float3 vabc, float* pT) { float t1, t2; double aa, bb, cc, dd; aa=vector.x*vector.x*vabc.x+vector.y*vector.y*vabc.y+vector.z*vector.z*vabc.z; bb=rvec.x*vector.x*vabc.x+rvec.y*vector.y*vabc.y+rvec.z*vector.z*vabc.z; cc=rvec.x*rvec.x*vabc.x+rvec.y*rvec.y*vabc.y+rvec.z*rvec.z*vabc.z-1; dd=bb*bb-aa*cc; if (dd<0) { *pT = ma; return nvec;} t1=(-bb-(float)sqrt(dd))/aa;t2=(-bb+(float)sqrt(dd))/aa; if (t1 *pT && *pT > mi) { sn = 0; sm = 0; } } color.x = color.x+(sr*(sa+sd*sn)+sp*sm)*rfrn.x; color.y = color.y+(sg*(sa+sd*sn)+sp*sm)*rfrn.x; color.z = color.z+(sb*(sa+sd*sn)+sp*sm)*rfrn.x; if (sf == 0 && rfrn.y < 4) { f=0; *pF = f; return color; } f=1;rfrn.x=rfrn.x*sf;rfrn.y=rfrn.y+1; vector.x=wx;vector.y=wy;vector.z=wz; *pVector = vector; *pLvec = lvec; *pF = f; *pRfrn = rfrn; return color; } ///////////////////////////////////////////////////////////////////// char* version = "2.2.030407"; char* compiler = "unknown"; char* clockcount = "unknown"; char* resolution = "1024*768"; char* username = "unknown"; char* sysname = "unknown"; int main(int argc, char* argv[]) { if(!InitCUDA()) { return 0; } printf("CUDA initialized.\n"); glutInit(&argc,argv); GLUT_INIT(); glutCreateWindow("window name"); GLUT_CALL_FUNC(); MY_INIT(); display(); printf("%06f ms\n", m_ctime); getchar(); glutMainLoop(); return 0; }