I want to make use of the two GPUs on my motherboard and am using MPICH2 to do this.
The process is started with "mpiexec -n 2 fdtd.exe"; the error detection then tells me "no CUDA-capable device is detected" and the result is garbage.
The simple code below runs correctly when started from within the VC++ 9.0 environment.
Setup: Windows 7, VC 9.0, SDK 3.2, Toolkit 3.2, driver 270.61, MPICH2.
How can I make MPI detect the GPUs? Test code below:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <sys/types.h>
#include <time.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "mpi.h"
// includes, project
//#include <cutil_inline.h>
// Kernel: each thread stores 4.0f into ex at its flat 1D global index
// (blockIdx.x * blockDim.x + threadIdx.x).
//
// Launch layout: 1D grid of 1D blocks.
// Precondition: there is no bounds guard, so the total number of launched
// threads (gridDim.x * blockDim.x) must not exceed the number of floats
// that ex points to.
// ex must be a pointer the device can write to.
__global__ static void exeyloop(float* ex)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    ex[idx] = 4.0f;  // float literal: avoids a double constant in device code
}
// Kernel: each thread stores 5.0f into ex at its flat 1D global index
// (blockIdx.x * blockDim.x + threadIdx.x).
//
// Launch layout: 1D grid of 1D blocks.
// Precondition: there is no bounds guard, so the total number of launched
// threads (gridDim.x * blockDim.x) must not exceed the number of floats
// that ex points to.
// ex must be a pointer the device can write to.
__global__ static void ehehloop(float* ex)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    ex[idx] = 5.0f;  // float literal: avoids a double constant in device code
}
///////////////////////////////////////////////////////////////////////////////
// Prints "<label> <numeric status>" and, when the status is not cudaSuccess,
// the matching CUDA error string. Returns true only on cudaSuccess.
static bool reportCudaStatus(const char* label, cudaError_t status)
{
    printf("%s %d\n", label, (int)status);
    if (status != cudaSuccess) {
        printf("\n %s \n", cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Host-side smoke test for the two kernels.
//
// Fills a 100-element host buffer with 3.0, copies it to device 0, runs
// exeyloop and then ehehloop over it, and after each kernel copies the
// buffer back and prints its first ten values.
//
// Every CUDA call is checked. On the first failure the error is printed,
// the buffers are released and the function returns without running the
// remaining steps, so stale host data is never printed as if it were a
// kernel result.
void assignEvaluateFdtd1()
{
    const int numElems = 100;        // floats in the host and device buffers
    const int threadsPerBlock = 5;   // block size (was "blockt")
    // The kernels write ex[idx] without a bounds guard, so the launch must
    // cover the buffer exactly: one thread per element, no remainder.
    assert(numElems % threadsPerBlock == 0);
    const int numBlocks = numElems / threadsPerBlock;  // grid size (was "gridt")
    const size_t numBytes = numElems * sizeof(float);

    float* ex_a = (float*)malloc(numBytes);  // host buffer
    float* ex_d = NULL;                      // device buffer
    if (ex_a == NULL) {
        printf("\n host allocation of %d floats failed \n", numElems);
        return;
    }
    for (int i = 0; i < numElems; i++) {
        ex_a[i] = 3.0f;
    }

    // Single-pass block: "break" jumps to the shared cleanup below.
    do {
        cudaError_t cudareturn = cudaSetDevice(0);
        if (!reportCudaStatus("Cuda set device", cudareturn)) {
            if (cudareturn == cudaErrorInvalidDevice) {
                printf("\n cudaSetDevice returned cudaErrorInvalidDevice");
            }
            break;
        }

        cudareturn = cudaMalloc((void**)&ex_d, numBytes);
        if (!reportCudaStatus("Cuda Malloc to device", cudareturn)) {
            ex_d = NULL;  // do not hand an unspecified pointer to cudaFree
            break;
        }

        cudareturn = cudaMemcpy(ex_d, ex_a, numBytes, cudaMemcpyHostToDevice);
        if (!reportCudaStatus("Cuda cudaMemcpy device", cudareturn)) {
            break;
        }

        dim3 dimBlock(threadsPerBlock, 1);
        dim3 dimGrid(numBlocks, 1);

        exeyloop <<< dimGrid, dimBlock >>> (ex_d);
        // A launch returns no status; launch failures surface here.
        cudareturn = cudaGetLastError();
        if (cudareturn != cudaSuccess) {
            printf("\n exeyloop launch failed: %s \n", cudaGetErrorString(cudareturn));
            break;
        }
        // The blocking copy also reports errors raised while the kernel ran.
        cudareturn = cudaMemcpy(ex_a, ex_d, numBytes, cudaMemcpyDeviceToHost);
        if (!reportCudaStatus("Cuda cudamemcpy back device", cudareturn)) {
            break;
        }
        printf("\n exey \n");
        for (int j = 0; j < 10; j++) {
            printf("\n %f ", ex_a[j]);
        }

        ehehloop <<< dimGrid, dimBlock >>> (ex_d);
        cudareturn = cudaGetLastError();
        if (cudareturn != cudaSuccess) {
            printf("\n ehehloop launch failed: %s \n", cudaGetErrorString(cudareturn));
            break;
        }
        cudareturn = cudaMemcpy(ex_a, ex_d, numBytes, cudaMemcpyDeviceToHost);
        if (cudareturn != cudaSuccess) {
            printf("\n copy back after ehehloop failed: %s \n", cudaGetErrorString(cudareturn));
            break;
        }
        printf("\n eheh \n");
        for (int j = 0; j < 10; j++) {
            printf("\n %f ", ex_a[j]);
        }
    } while (0);

    // Shared cleanup for both the success and the failure paths.
    if (ex_d != NULL) {
        cudaFree(ex_d);
    }
    free(ex_a);
    // end of assignEvaluateFDTD
}
// Program entry point: initialises MPI, queries the communicator size, this
// process's rank and the processor name, runs the GPU smoke test on rank 0
// only, then shuts MPI down. All other ranks only initialise and finalise.
//
// NOTE(review): the GPU test is run by rank 0 alone and selects device 0, so
// a second GPU is never used even when two ranks are started - confirm
// whether each rank is meant to drive its own device.
int main(int argc,char **argv)
{
    int myid, numprocs;
    // MPI_Get_processor_name requires a buffer of at least
    // MPI_MAX_PROCESSOR_NAME characters; a fixed 100 may be too small.
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int namelen;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&myid);
    MPI_Get_processor_name(processor_name,&namelen);

    if (myid == 0) {
        assignEvaluateFdtd1();
    }

    MPI_Finalize();
    return 0;
}