Hi,
I have some doubts in my program.
my sample code is,
Fortran interface that calls CUDA C function.
…
…
CALL h_fld(HXS,EYS,EZS)
…
…
HXS, EYS, EZS are of real8 type and of size NXNY*NZ.
CUDA C:
#include <cuda.h>
#include <math.h>
#include <cutil.h>
#include “cutil_inline.h”
const double DTMDY=1.532517053655797E-03;
const double DTMDZ=1.532517053655797E-03;
const int NX=32;
const int NY=32;
const int NZ=32;
global void ker_hxs(double *HXS, double *EYS, double *EZS)
{
int I,J,K,AR;
for(K=0; K<NZ; K++)
for(J=0; J<NY; J++)
for(I=1; I<NX; I++)
{
AR=I+JNY+kNX*NY;
HXS[AR]= HXS[AR]-(EZS[AR+NX]-EZS[AR])DTMDY+(EYS[AR+(NXNY)]-EYS[AR])*DTMDZ;
}
}
extern “C” void h_fld_( double *HXS,double *EYS, double *EZS)
{
double d_a,d_b, d_c;
size_t SF= NXNYNZsizeof(double);
cudaMalloc( (void **) &d_a, SF );
cudaMalloc( (void **) &d_b, SF );
cudaMalloc( (void **) &d_c, SF );
cudaMemcpy( d_a, HXS, SF, cudaMemcpyHostToDevice );
cudaMemcpy( d_b, EYS, SF, cudaMemcpyHostToDevice );
cudaMemcpy( d_c, EZS, SF, cudaMemcpyHostToDevice );
dim3 block(4,4);
dim3 grid(1,1);
ker_hxs<<<grid,block>>>(d_a,d_b,d_c);
cudaMemcpy(HXS,d_a,SF,cudaMemcpyDeviceToHost);
cudaFree((void **) d_a);
cudaFree((void **) d_b);
cudaFree((void **) d_c);
}
With this program I got correct result. But performance wise very slow (compared to Serial coding).
and my question is, “How do we declare threads for 3D arrays”. (tried cudaMemcpy3D also)
And one more question: If I go more than 2 Blocks or more than 32 threads, o/p is abnormal values.
thanks in advance.