Hi,
I am writing a simple program and want to read the result from the device part.
The program can run without any problem, but it cannot print anything in the device part such as
#ifdef DEVICE_EMULATION
printf(“test bits : \n”);
#endif
after I insert -deviceemu into to makefile
I attached my code here, could you help me to find the problem?
My makefile is :
+++++++++++++++++++++++++
Name Executable
EXECUTABLE := GPU1
Cuda source files (compiled with nvcc)
CUFILES := GPU1.cu
C/C++ files (compiled with gcc/g++) would go here:
CCFILES :=
Need -I and -L flags to use cutil.h
all:
nvcc -deviceemu -I/usr/local/cuda -I…/common/inc -L/usr/local/cuda -L…/lib/lcutil ${CUFILES} -o ${EXECUTABLE}
clean:
rm -f *~ ${EXECUTABLE}
#testmake: GPU1.cu
nvcc -o testmake -I/usr/local/cuda -I…/common/inc -L/usr/local/cuda -L…/lib/lcutil GPU1.cu
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++
My code is
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++
include <stdlib.h>
include <stdio.h>
include <string.h>
include <math.h>
include
include
define NX 10
define NY 10
define NUM 10
define BLOCKSIZE 16
define Cx 0.1
define Cy 0.1
define N 100
global void MatAdd(float *As, float *Bs)
{
#ifdef DEVICE_EMULATION
printf(“test bits : \n”);
#endif
int i,j;
// Block Index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread Index
int tx = threadIdx.x;
int ty = threadIdx.y;
float A[NX][NY], B[NX][NY],S[NX][NY],D1[NX][NY],D2[NX][NY],D3[NX][NY],D4[NX][
NY];
A[ty][tx] = As[NXty +tx];
B[ty][tx] = Bs[NXty +tx];
S[ty][tx] = A[ty][tx];// Record the U1 for the next U0
printf(“%f”,B[5][5]);
/*for(i=0;i<NX;i++)
{
for (j=0;j<NY;j++)
printf(“%f/n”,B[i][j]);
}
*/
i = bx * blockDim.x + tx;
j = by * blockDim.y + ty;
if ((i<= NUM-2 && i>=1)&&(j<= NUM-2 && j>=1))
{
D1[i][j] = A[i+1][j] + A[i-1][j];
D2[i][j] = A[i][j+1] + A[i][j-1];
D3[i][j] = B[i+1][j] + B[i-1][j];
D4[i][j] = B[i][j+1] + B[i][j-1];
A[i][j] = 0.5*(CxD1[i][j]+CyD2[i][j]+CxD3[i][j]+CyD4[i][j])/(1+Cx+Cy)+(1-Cx-Cy)*B[i][j]/(1+Cx+Cy);
}
As[NXty +tx] = A[ty][tx]; // Receive the new U1
Bs[NXty +tx] = S[ty][tx]; // Update the U0 with previous U1
}
int main(void)
{
float *V0,*V1,*U0,*U1;
// Distribute the memory;
int size,i,j;
size = NX*NY*sizeof(float);
V0= (float *)malloc(size);
cudaMalloc ((void**)&U0,size);
size = NX*NY*sizeof(float);
V1 =(float *)malloc(size);
cudaMalloc ((void**)&U1,size);
// init the matrix;
for (i=0;i<NY*NX;i++)
V0[i] =V1[i]= 0;
V0[NX*NY/2]=10;
//cout<<V0[NX*NY/2]<<endl;
cudaMemcpy(U0,V0,size,cudaMemcpyHostToDevice);
cudaMemcpy(U1,V1,size,cudaMemcpyHostToDevice);
// Kernel invocation
dim3 dimBlock(BLOCKSIZE, BLOCKSIZE);
dim3 dimGrid(NX/dimBlock.x,NY/dimBlock.y);
for (j=0;j<N;j++)
MatAdd<<<dimGrid,dimBlock>>>(U1,U0);
// Retrieve results from device and store it in host array
cudaMemcpy (V1,U1,sizeof (float)(NXNY),cudaMemcpyDeviceToHost);
ofstream fout1;
fout1.open(“Diffusion.txt”,ios::out);
fout1<<" TITLE = "Example: Simple 2D-Volume Data" “<<endl;
fout1<<” VARIABLES = "X", "Y", "Concentration" “<<endl;
fout1<<” ZONE T="Region",I=10, J=10, F=POINT"<<endl;
for(i=0;i<NX;i++)
{
for(j=0;j<NY;j++)
{
fout1<<i<<" “<<j<<” "<<V1[NX*j +i]<<endl;
}
}
fout1.close();
// Free device memory
cudaFree(U0);
cudaFree(U1);
return 0;
}
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++