Hello everyone,
someone tell me why my program in CUDA only works in emulation mode?
this is the kernel’s code:
[codebox]#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cutil_math.h>
#include <cutil_inline.h>
#include <cuda_runtime.h>
#include “DrawScreen.h”
#include “geometry.h”
#define EPSILON 0.00001
#define M_PI 3.1415926535897932384626433832795
static double *dvertex = NULL;
static int *dface = NULL;
static int *dfaceIndex = NULL;
static int dfaceNum;
static Pixel *da_d = NULL;
static Pixel *ha_d = NULL;
device float RayTriangleInc(float3 d,float3 o, float3 p0, float3 p1, float3 p2)
{
float3 e1, e2;
normalize(d);
e1=p1-p0;
e2=p2-p0;
float3 q=cross(d,e2);
float a=dot(e1,q);
if (a>-EPSILON && a<EPSILON){
return 0;
}
float f = 1/a;
float3 s=o-p0;
float u=f*(dot(s,q));
if (u<0){
return 0;
}
float3 r=cross(s,e1);
float v=f*(dot(d,r));
if (v<0 || u+v>1){
return 0;
}
float t=f*(dot(e2,r));
return 1;
}
device host float3 CalcolaVettorePunto(float3 xdir, float3 ydir, float3 LLdir, int tdx, int tdy)
{
float3 dir;
dir.x=LLdir.x + (xdir.x*tdx);
dir.y=LLdir.y + (ydir.y*tdy);
dir.z=LLdir.z;
normalize(dir);
return dir;
}
global void DrawScreen(float *da_d, int w, int h, int ch,float3 eye, float3 xdir, float3 ydir, float3 LLdir,
double *dvertex, int *dface, int *dfaceIndex, int dfaceNum)
{
int n =w*h*ch;
int tdx = (threadIdx.x+blockIdx.x*blockDim.x);
int tdy = (threadIdx.y+blockIdx.y*blockDim.y);
int idarray = (tdx+tdy*w)*ch;
float3 dir= CalcolaVettorePunto(xdir, ydir, LLdir, tdx, tdy);
float3 p0,p1,p2;
int inc=0;
for (int i=0; i<dfaceNum; i++){
p0.x = dvertex[(dface[dfaceIndex[i]+1]-1)*4];
p0.y = dvertex[((dface[dfaceIndex[i]+1]-1)*4)+1];
p0.z = dvertex[((dface[dfaceIndex[i]+1]-1)*4)+2];
p1.x = dvertex[(dface[dfaceIndex[i]+4]-1)*4];
p1.y = dvertex[((dface[dfaceIndex[i]+4]-1)*4)+1];
p1.z = dvertex[((dface[dfaceIndex[i]+4]-1)*4)+2];
p2.x = dvertex[(dface[dfaceIndex[i]+7]-1)*4];
p2.y = dvertex[((dface[dfaceIndex[i]+7]-1)*4)+1];
p2.z = dvertex[((dface[dfaceIndex[i]+7]-1)*4)+2];
inc = inc || (RayTriangleInc (dir,eye, p0 , p1, p2)==1.0);
}
if(idarray<n){
if(inc==1){
da_d[idarray]=1.0f;
da_d[idarray+1]=1.0f;
da_d[idarray+2]=1.0f;
da_d[idarray+3]=1.0f;
}else{da_d[idarray]=0.0f;
da_d[idarray+1]=0.0f;
da_d[idarray+2]=0.0f;
da_d[idarray+3]=1.0f;
}
}
}
host void callCuda(Pixel *a_d, int w, int h, int ch, float x, float y, float z, scene *hScene)
{
if(dvertex==NULL && dface==NULL && dfaceIndex==NULL && da_d == NULL){
//Alloco memoria su device
cutilSafeCall(cudaMalloc((void**) &dvertex, hScene->vertexSize * sizeof(double)));
cutilSafeCall(cudaMalloc((void**) &dface, hScene->faceSize * sizeof(int)));
cutilSafeCall(cudaMalloc((void**) &dfaceIndex, hScene->faceIndexSize * sizeof(int)));
cutilSafeCall(cudaMalloc((void**) &da_d, (w*h*ch) * sizeof(Pixel)));
//Copio Dati dalla scena agli array su device
cutilSafeCall(cudaMemcpy(dvertex, hScene->vertex, hScene->vertexSize * sizeof(double),cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(dface, hScene->face, hScene->faceSize * sizeof(int),cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(dfaceIndex, hScene->faceIndex, hScene->faceIndexSize * sizeof(int),cudaMemcpyHostToDevice));
}
//Copio Dati dalla scena alle variabili su device
dfaceNum = hScene->faceNum;
float3 Eye;
Eye.x =x;
Eye.y =y;
Eye.z =z;
float distanza=2.00;
float vFov=60.00;
float hFov=vFov*((float)w/(float)h);
vFov = vFov * M_PI / 180;
hFov = hFov * M_PI / 180;
float3 At;
At.x= Eye.x;
At.y= Eye.y;
At.z=Eye.z-distanza;
float3 vettoreCam = At-Eye;
float3 vettore_up = {0.0,1.0,0.0};
float3 vettore_v = cross(vettoreCam,vettore_up);
float3 ydir= -vettore_up;
float3 xdir= cross(ydir,vettoreCam);
xdir= normalize(xdir)*((2.0*distanza*tan(hFov/2.0))/(float)w);
ydir= normalize(ydir)*((2.0*distanza*tan(vFov/2.0))/(float)h);
float3 LLdir;
LLdir.x=vettoreCam.x-(xdir.x*w)/2;
LLdir.y=vettoreCam.y-(ydir.y*h)/2;
LLdir.z=vettoreCam.z;
dim3 dimGrid (80,60);
dim3 dimBlock (8,8);
DrawScreen <<< dimGrid, dimBlock >>> (da_d,w,h,ch,Eye,xdir,ydir,LLdir, dvertex, dface, dfaceIndex, dfaceNum);
//ha_d = (Pixel *) malloc((w*h*ch) * sizeof(Pixel));
cutilSafeCall(cudaMemcpy(a_d, da_d, (w*h*ch) * sizeof(Pixel),cudaMemcpyDeviceToHost));
for(int i=0; i<(w*h*ch); i++){
if(a_d[i]!= 0) //<b>ERROR: 0xC0000005: Access violation reading location 0x03020000. only without emulation mode </b>
printf("a_d[%d] %f\n ",i,a_d[i]);
}
// cleanup memory
//free(a_d);
//cutilSafeCall(cudaFree(da_d));
}
[/codebox]
Help me this program is part of my thesis … : '(