Hi All, I am a novice CUDA developer, i am trying to translate a FDTD algorithm into cuda implementations. however, i found that the kernel functions are not doing what i expect them to do. The resulted matrix is always 0. My approach is that to define some h_( host memory type) matrices and initialize them then pass them to h_(host memories) and carry out the simulation by updating the matrices on the host global memories.
Would anyone kind enough to point some errors in my programs? i could find no other place to find any help. Thanks in Advance. :rolleyes:
#include <time.h>
#include <stdio.h>
#include <cuda.h>
#define MATRIXSIZE 480*480
#define XYLENGTH 480
#define BLOCK_LENGTH 16
#define AR_INDEX(ar , a, b) (ar[ a*XYLENGTH + b ])
//declaration of the matrices used in the host memeory
float* h_ga;
float* h_dz;
float* h_ez;
float* h_hx;
float* h_hy;
float* h_ihx;
float* h_ihy;
float* h_gi2;
float* h_gj2;
float* h_gi3;
float* h_gj3;
float* h_fi1;
float* h_fj1;
float* h_fi2;
float* h_fj2;
float* h_fi3;
float* h_fj3;
//declaration of the matrices used in the kernal
__device__ float* d_ga;
__device__ float* d_dz;
__device__ float* d_ez;
__device__ float* d_hx;
__device__ float* d_hy;
__device__ float* d_ihx;
__device__ float* d_ihy;
__device__ float* d_gi2;
__device__ float* d_gj2;
__device__ float* d_gi3;
__device__ float* d_gj3;
__device__ float* d_fi1;
__device__ float* d_fj1;
__device__ float* d_fi2;
__device__ float* d_fj2;
__device__ float* d_fi3;
__device__ float* d_fj3;
// Kernel that executes on the CUDA device
__global__ void UpdateDZfield(int time)
{
// the coordinate for sinusoidal source
int ic = 480 /2 -5;
int jc = 480 /2 -5;
float ddx = 0.01; // cell size
float dt = ddx / 6e8; // time step
float pulse;
float pi = 3.14159;
//Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Tread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// coordinates location in the global matrix
int x = bx * BLOCK_LENGTH + tx;
int y = by *BLOCK_LENGTH + ty;
if( x== 0 || y ==0){
// boundary conditions do nothing
}else{
// update the dz field
AR_INDEX(d_dz, x, y) = d_gi3[x] * d_gj3[y]* AR_INDEX( d_dz, x, y)
+ d_gi2[x]* d_gj2[y] * 0.5 *( AR_INDEX( d_hy, x ,y) - AR_INDEX( d_hy, x-1 ,y)
- AR_INDEX( d_hx, x ,y) + AR_INDEX( d_hx, x ,y-1) );
}
// sinusoidal sources
pulse = sinf(2*pi*1500*1e6*dt*time);
if( x ==ic && y ==jc)
AR_INDEX(d_dz, x, y) = pulse;
} // end of the UpdateDZfield(int time)
__global__ void UpdateEZfield( )
{
//Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Tread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// coordinates location in the global matrix
int x = bx * BLOCK_LENGTH + tx;
int y = by *BLOCK_LENGTH + ty;
//calculation of EZ field
AR_INDEX(d_ez, x, y) = AR_INDEX(d_ga, x, y) * AR_INDEX(d_dz, x,y);
//set the Ez edges to 0, as part of the PML
if( x==0 || x == (480-1) || y==0 || y==(480-1))
AR_INDEX(d_ez,x,y) = 0.0;
}// end of the UpdateEZfield( )
__global__ void UpdateHfield( )
{
float curl_e;
//Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Tread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// coordinates location in the global matrix
int x = bx * BLOCK_LENGTH + tx;
int y = by * BLOCK_LENGTH + ty;
//calculation of Hx field
if(y == (480-1)) {
// the boundary condition,do nothing
} else{
curl_e = AR_INDEX(d_ez, x,y) - AR_INDEX(d_ez, x,y+1);
AR_INDEX(d_ihx, x, y) = AR_INDEX(d_ihx, x, y) + d_fi1[x]* curl_e;
AR_INDEX(d_hx, x, y) = d_fj3[y]* AR_INDEX(d_hx,x,y) + d_fj2[y]*0.5*(curl_e + AR_INDEX( d_ihx,x,y) );
}
//calculation of Hy field
if(x == (480-1)) {
// the boundary condition ,do nothing
}else{
curl_e = AR_INDEX(d_ez, x+1,y) - AR_INDEX(d_ez, x,y);
AR_INDEX(d_ihy, x, y) = AR_INDEX(d_ihy, x, y) + d_fj1[y]* curl_e;
AR_INDEX(d_hy, x, y) = d_fi3[x]* AR_INDEX(d_hy,x,y) + d_fi2[x]*0.5*(curl_e + AR_INDEX( d_ihy,x,y) );
}
} // end of the UpdateHfield( )
// main driver that executes on the host
int main(void)
{
// declare variables used in FDTD simulation
int l, n ,i , j, nsteps, npml,T;
float ddx, dt,epsz, pi, epsilon, sigma, eaf;
float xn, xxn, xnum, xd, curl_e;
float t0, spread;
//assign a file to record the result
FILE *fp,*fopen();
// allocate host memory for matrices and arrays
unsigned int mem_size_matrix = sizeof(float) * MATRIXSIZE;
h_ga = (float*) malloc(mem_size_matrix);
h_dz = (float*) malloc(mem_size_matrix);
h_ez = (float*) malloc(mem_size_matrix);
h_hx = (float*) malloc(mem_size_matrix);
h_hy = (float*) malloc(mem_size_matrix);
h_ihx = (float*) malloc(mem_size_matrix);
h_ihy = (float*) malloc(mem_size_matrix);
unsigned int mem_size_array = sizeof(float) * XYLENGTH;
h_gi2 = (float*) malloc(mem_size_array);
h_gj2 = (float*) malloc(mem_size_array);
h_gi3 = (float*) malloc(mem_size_array);
h_gj3 = (float*) malloc(mem_size_array);
h_fi1 = (float*) malloc(mem_size_array);
h_fj1 = (float*) malloc(mem_size_array);
h_fi2 = (float*) malloc(mem_size_array);
h_fj2 = (float*) malloc(mem_size_array);
h_fi3 = (float*) malloc(mem_size_array);
h_fj3 = (float*) malloc(mem_size_array);
// allocate device memory for matrices and arrays
cudaMalloc( (void**) & d_ga, mem_size_matrix);
cudaMalloc( (void**) & d_dz, mem_size_matrix);
cudaMalloc( (void**) & d_ez, mem_size_matrix);
cudaMalloc( (void**) & d_hx, mem_size_matrix);
cudaMalloc( (void**) & d_hy, mem_size_matrix);
cudaMalloc( (void**) & d_ihx, mem_size_matrix);
cudaMalloc( (void**) & d_ihy, mem_size_matrix);
cudaMalloc( (void**) & d_gi2, mem_size_array );
cudaMalloc( (void**) & d_gj2, mem_size_array );
cudaMalloc( (void**) & d_gi3, mem_size_array );
cudaMalloc( (void**) & d_gj3, mem_size_array );
cudaMalloc( (void**) & d_fi1, mem_size_array );
cudaMalloc( (void**) & d_fj1, mem_size_array );
cudaMalloc( (void**) & d_fi2, mem_size_array );
cudaMalloc( (void**) & d_fj2, mem_size_array );
cudaMalloc( (void**) & d_fi3, mem_size_array );
cudaMalloc( (void**) & d_fj3, mem_size_array );
//assign the value to the variables declared
epsz = 8.8e-12;
// initialize the matrix
for( i =0; i< XYLENGTH; i++) {
for( j =0; j< XYLENGTH;j++){
AR_INDEX(h_dz , i , j)= 0.0;
AR_INDEX(h_ez , i , j)= 0.0;
AR_INDEX(h_hx , i , j)= 0.0;
AR_INDEX(h_hy , i , j)= 0.0;
AR_INDEX(h_ihx , i , j)= 0.0;
AR_INDEX(h_ihy , i , j)= 0.0;
AR_INDEX(h_ga , i , j) = 1.0;
} // end inner for loop
} // end outer for loop
// calculate the PML parameters
for( i=0;i<XYLENGTH; i++){
h_gi2[i] = 1.0;
h_gi3[i] = 1.0;
h_fi1[i] = 0.0;
h_fi2[i] = 1.0;
h_fi3[i] = 1.0;
h_gj2[i] = 1.0;
h_gj3[i] = 1.0;
h_fj1[i] = 0.0;
h_fj2[i] = 1.0;
h_fj3[i] = 1.0;
}
//Ask for the Number of PML cells for calcualtion
printf("Number of PML cells ---->");
scanf("%d", &npml);
for( i =0; i <= npml; i++){
xnum = npml -i;
xd = npml;
xxn = xnum/xd;
xn = 0.33 *pow(xxn,3);
h_gi2[i] = 1.0 / (1.0 + xn);
h_gi2[XYLENGTH-1-i] = 1.0 /(1.0 + xn);
h_gi3[i] = (1.0 -xn)/ (1.0 +xn);
h_gi3[XYLENGTH-1-i] = (1.0 -xn)/ (1.0 +xn);
xxn = (xnum -.5) /xd;
xn = 0.25 * pow(xxn,3);
h_fi1[i] = xn;
h_fi1[XYLENGTH-2-i] =xn;
h_fi2[i] = 1.0 / (1.0 + xn);
h_fi2[XYLENGTH-2-i] = 1.0 /(1.0 + xn);
h_fi3[i] = (1.0 -xn)/ (1.0 +xn);
h_fi3[XYLENGTH-2-i] = (1.0 -xn)/ (1.0 +xn);
}
for( j =0; j<=npml; j++){
xnum = npml -j;
xd = npml;
xxn = xnum /xd;
xn = 0.33 * pow(xxn, 3);
h_gj2[XYLENGTH-1-j] = 1.0 /(1.0 + xn);
h_gj3[j] = (1.0 -xn)/ (1.0 +xn);
h_gj3[XYLENGTH-1-j] = (1.0 -xn)/ (1.0 +xn);
xxn = (xnum -.5) /xd;
xn = 0.25 * pow(xxn,3);
h_fj1[j] = xn;
h_fj1[XYLENGTH-2-j] =xn;
h_fj2[j] = 1.0 / (1.0 + xn);
h_fj2[XYLENGTH-2-j] = 1.0 /(1.0 + xn);
h_fj3[j] = (1.0 -xn)/ (1.0 +xn);
h_fj3[XYLENGTH-2-j] = (1.0 -xn)/ (1.0 +xn);
}
// simulation set up
t0 = 40.0;
spread = 15.0;
T = 0;
nsteps = 1;
//copy the matrices and arrays from host memory to device memory
cudaMemcpy(d_dz,h_dz, mem_size_matrix ,cudaMemcpyHostToDevice);
cudaMemcpy(d_ez,h_ez, mem_size_matrix ,cudaMemcpyHostToDevice);
cudaMemcpy(d_hx,h_hx, mem_size_matrix ,cudaMemcpyHostToDevice);
cudaMemcpy(d_hy,h_hy, mem_size_matrix ,cudaMemcpyHostToDevice);
cudaMemcpy(d_ga,h_ga, mem_size_matrix ,cudaMemcpyHostToDevice);
cudaMemcpy(d_ihx,h_ihx, mem_size_matrix ,cudaMemcpyHostToDevice);
cudaMemcpy(d_ihy,h_ihy, mem_size_matrix ,cudaMemcpyHostToDevice);
cudaMemcpy(d_gi2,h_gi2, mem_size_array,cudaMemcpyHostToDevice);
cudaMemcpy(d_gj2,h_gj2, mem_size_array,cudaMemcpyHostToDevice);
cudaMemcpy(d_gi3,h_gi3, mem_size_array,cudaMemcpyHostToDevice);
cudaMemcpy(d_gj2,h_gj2, mem_size_array,cudaMemcpyHostToDevice);
cudaMemcpy(d_fi1,h_fi1, mem_size_array,cudaMemcpyHostToDevice);
cudaMemcpy(d_fj1,h_fj1, mem_size_array,cudaMemcpyHostToDevice);
cudaMemcpy(d_fi2,h_fi2, mem_size_array,cudaMemcpyHostToDevice);
cudaMemcpy(d_fj2,h_fj2, mem_size_array,cudaMemcpyHostToDevice);
cudaMemcpy(d_fi3,h_fi3, mem_size_array,cudaMemcpyHostToDevice);
cudaMemcpy(d_fj3,h_fj3, mem_size_array,cudaMemcpyHostToDevice);
//kernal dimensions setup
dim3 dimBlock (BLOCK_LENGTH, BLOCK_LENGTH);
dim3 dimGrid (XYLENGTH / BLOCK_LENGTH , XYLENGTH / BLOCK_LENGTH);
//start simulation
while(nsteps >0){
printf("nsteps -->");
scanf("%d", &nsteps);
printf("%d\n", nsteps);
for ( n =1; n <= nsteps; n++){
T = T+1;
UpdateDZfield <<< dimGrid , dimBlock >>> ( T );
cudaThreadSynchronize();
UpdateEZfield <<< dimGrid , dimBlock >>> ( );
cudaThreadSynchronize();
UpdateHfield <<< dimGrid , dimBlock >>> ( );
cudaThreadSynchronize();
} // end for loop
} // end while loop
//copy d_ez back to h_ez and record into file
cudaMemcpy(h_ez, d_ez, mem_size_matrix ,cudaMemcpyDeviceToHost);
//cudaMemcpy(h_hx, d_hx, mem_size_matrix ,cudaMemcpyDeviceToHost);
// write the E field out to a file "Ez"
fopen_s(&fp,"Ez.txt", "w" );
for(j=0;j< XYLENGTH;j++){
for(i=0;i< XYLENGTH;i++) {
fprintf(fp, "%6.3f ", AR_INDEX(h_ez,i,j));
} // end inner for loop
fprintf( fp," \n");
} // end outer for loop
/*
// write the d_hx field out to a file "Ez"
fopen_s(&fp,"Hx", "w" );
for(j=0;j< XYLENGTH;j++){
for(i=0;i< XYLENGTH;i++) {
fprintf(fp, "%6.3f ", AR_INDEX(h_hx,i,j));
} // end inner for loop
fprintf( fp," \n");
} // end outer for loop
*/
// close the file
fclose(fp);
// clear host and device memories
free(h_ez);
free(h_dz);
free(h_hx);
free(h_hy);
free(h_ihx);
free(h_ihy);
free(h_ga);
free(h_gi2);
free(h_gj2);
free(h_gi3);
free(h_gj3);
free(h_fi1);
free(h_fj1);
free(h_fi2);
free(h_fj2);
free(h_fi3);
free(h_fj3);
cudaFree(d_ez);
cudaFree(d_dz);
cudaFree(d_hx);
cudaFree(d_hy);
cudaFree(d_ihx);
cudaFree(d_ihy);
cudaFree(d_ga);
cudaFree(d_gi2);
cudaFree(d_gj3);
cudaFree(d_fi1);
cudaFree(d_fj1);
cudaFree(d_fi2);
cudaFree(d_fj2);
cudaFree(d_fi3);
cudaFree(d_fj3);
system("pause");
}
CUDAProg.rtf (14.1 KB)