How to use emulation mode

Hi,

I am writing a simple program and want to read the result from the device part.

The program can run without any problem, but it cannot print anything in the device part such as

#ifdef DEVICE_EMULATION
printf(“test bits : \n”);
#endif
after I insert -deviceemu into to makefile

I attached my code here, could you help me to find the problem?

My makefile is :

+++++++++++++++++++++++++

Name Executable

EXECUTABLE := GPU1

Cuda source files (compiled with nvcc)

CUFILES := GPU1.cu

C/C++ files (compiled with gcc/g++) would go here:

CCFILES :=

Need -I and -L flags to use cutil.h

all:
nvcc -deviceemu -I/usr/local/cuda -I…/common/inc -L/usr/local/cuda -L…/lib/lcutil ${CUFILES} -o ${EXECUTABLE}

clean:
rm -f *~ ${EXECUTABLE}
#testmake: GPU1.cu

nvcc -o testmake -I/usr/local/cuda -I…/common/inc -L/usr/local/cuda -L…/lib/lcutil GPU1.cu

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++

My code is

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++

include <stdlib.h>

include <stdio.h>

include <string.h>

include <math.h>

include

include

define NX 10

define NY 10

define NUM 10

define BLOCKSIZE 16

define Cx 0.1

define Cy 0.1

define N 100

global void MatAdd(float *As, float *Bs)

{

#ifdef DEVICE_EMULATION
printf(“test bits : \n”);
#endif

int i,j;
// Block Index
int bx = blockIdx.x;
int by = blockIdx.y;

// Thread Index
int tx = threadIdx.x;
int ty = threadIdx.y;

float A[NX][NY], B[NX][NY],S[NX][NY],D1[NX][NY],D2[NX][NY],D3[NX][NY],D4[NX][
NY];

A[ty][tx] = As[NXty +tx];
B[ty][tx] = Bs[NX
ty +tx];
S[ty][tx] = A[ty][tx];// Record the U1 for the next U0

printf(“%f”,B[5][5]);
/*for(i=0;i<NX;i++)
{
for (j=0;j<NY;j++)
printf(“%f/n”,B[i][j]);
}
*/

i = bx * blockDim.x + tx;
j = by * blockDim.y + ty;

if ((i<= NUM-2 && i>=1)&&(j<= NUM-2 && j>=1))
{
D1[i][j] = A[i+1][j] + A[i-1][j];
D2[i][j] = A[i][j+1] + A[i][j-1];
D3[i][j] = B[i+1][j] + B[i-1][j];
D4[i][j] = B[i][j+1] + B[i][j-1];
A[i][j] = 0.5*(CxD1[i][j]+CyD2[i][j]+CxD3[i][j]+CyD4[i][j])/(1+Cx+Cy)+(1-Cx-Cy)*B[i][j]/(1+Cx+Cy);
}

As[NXty +tx] = A[ty][tx]; // Receive the new U1
Bs[NX
ty +tx] = S[ty][tx]; // Update the U0 with previous U1

}

int main(void)
{
float *V0,*V1,*U0,*U1;

// Distribute the memory; 
int size,i,j;

size = NX*NY*sizeof(float);
V0= (float *)malloc(size);
cudaMalloc ((void**)&U0,size);

size = NX*NY*sizeof(float);
V1 =(float *)malloc(size);
cudaMalloc ((void**)&U1,size);

// init the matrix;
for (i=0;i<NY*NX;i++)
  V0[i] =V1[i]= 0;

V0[NX*NY/2]=10;
//cout<<V0[NX*NY/2]<<endl;

cudaMemcpy(U0,V0,size,cudaMemcpyHostToDevice);
cudaMemcpy(U1,V1,size,cudaMemcpyHostToDevice);

// Kernel invocation
dim3 dimBlock(BLOCKSIZE, BLOCKSIZE);
dim3 dimGrid(NX/dimBlock.x,NY/dimBlock.y);

for (j=0;j<N;j++)
MatAdd<<<dimGrid,dimBlock>>>(U1,U0);

// Retrieve results from device and store it in host array
cudaMemcpy (V1,U1,sizeof (float)(NXNY),cudaMemcpyDeviceToHost);

ofstream fout1;
fout1.open(“Diffusion.txt”,ios::out);
fout1<<" TITLE = "Example: Simple 2D-Volume Data" “<<endl;
fout1<<” VARIABLES = "X", "Y", "Concentration" “<<endl;
fout1<<” ZONE T="Region",I=10, J=10, F=POINT"<<endl;

for(i=0;i<NX;i++)
{
for(j=0;j<NY;j++)
{
fout1<<i<<" “<<j<<” "<<V1[NX*j +i]<<endl;
}
}

fout1.close();

// Free device memory
cudaFree(U0);
cudaFree(U1);
return 0;
}
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++