We have a problem using more than one GPU in parallel on Windows 10. Our program decomposes the initial numerical problem into a set of independent problems and then runs N numerical engines (EXEs), where “N” is the number of GPUs in the system. One GPU is assigned to each instance of the numerical engine, which calls a DLL that initializes the GPU and performs the calculations on it.
As both the interface and the engine codes are very complex, we created a very simple program that shows the same behavior.
The first listing is the code for the EXE which gets the number of GPUs and launches “GPUstart.exe” once per GPU. The code is given below.
#include <windows.h>
#include <stdio.h>
#include
#include <stdlib.h>
#include
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
typedef long long tipI;
using namespace std;
int main(){
int nGPUs;
cudaError_t ce = cudaGetDeviceCount(&nGPUs);
if (cudaGetDeviceCount(&nGPUs) != cudaSuccess){
return 3;
}
cout << endl << "Number of GPUs: " << nGPUs << endl << endl;
// 31.10.2017.
STARTUPINFO* si = new STARTUPINFO[nGPUs];
PROCESS_INFORMATION* pi = new PROCESS_INFORMATION[nGPUs];
for (int GPUindex = 0; GPUindex < nGPUs; GPUindex++){
// set the size of the structures
ZeroMemory(&si[GPUindex], sizeof(si[GPUindex]));
si[GPUindex].cb = sizeof(si[GPUindex]);
ZeroMemory(&pi[GPUindex], sizeof(pi[GPUindex]));
// start the program up
string appName = "GPUstart.exe";
char* argv;
char indexCH[256], nCH[256];
itoa(GPUindex, indexCH, 10);
itoa(nGPUs, nCH, 10);
string strArg = "\"" + appName + "\" " + indexCH + " " + nCH;
argv = (char*)(strArg.c_str());
BOOL FLAG = CreateProcess(
appName.c_str(),
argv,
NULL,
NULL,
FALSE,
0,
NULL,
NULL,
&si[GPUindex],
&pi[GPUindex]
);
}
// Close process and thread handles.
for (int GPUindex = 0; GPUindex < nGPUs; GPUindex++){
WaitForSingleObject(pi[GPUindex].hProcess, INFINITE);
WaitForSingleObject(pi[GPUindex].hThread, INFINITE);
CloseHandle(pi[GPUindex].hProcess);
CloseHandle(pi[GPUindex].hThread);
}
delete[] si;
delete[] pi;
system("pause");
return 0;
}
The code for “GPUstart.exe” is given below. As can be seen, it just calls a function from “IGPU.dll”.
#include <windows.h>
#include <stdio.h>
#include
#include <stdlib.h>
#include
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
#include
typedef long long tipI;
using namespace std;
typedef void (* picsgpu)(int* pGPUindex, int* pNprocs);
/**
 * GPUstart.exe entry point: loads "IGPU.dll", resolves its exported "icsgpu"
 * function and calls it with the GPU index and GPU count taken from the
 * command line.
 *
 * Usage: GPUstart.exe <GPUindex> <nGPUs>
 *
 * Exit codes:
 *   0 - icsgpu was called
 *   1 - "IGPU.dll" could not be loaded
 *   2 - "icsgpu" was not found in the DLL
 *   3 - fewer than two command-line arguments were supplied
 */
int main(int argc, char* argv[]){
    // argv[1] and argv[2] are read below, so both must be present.
    if (argc < 3){
        cout << endl << "Usage: GPUstart.exe <GPUindex> <nGPUs>" << endl;
        system("pause");
        return 3;
    }
    HINSTANCE hDLL = LoadLibrary("IGPU.dll");
    if (hDLL == NULL){
        cout << endl << "Can not load \"IGPU.dll\"" << endl;
        system("pause");
        return 1;
    }
    picsgpu func = (picsgpu)GetProcAddress(hDLL, "icsgpu");
    if (func == NULL){
        cout << endl << "Can not load function \"icsgpu\" from \"IGPU.dll\"" << endl;
        // Release the library on this error path as well.
        FreeLibrary(hDLL);
        system("pause");
        return 2;
    }
    int index = atoi(argv[1]);
    int N = atoi(argv[2]);
    func(&index, &N);
    FreeLibrary(hDLL);
    return 0;
}
Finally, in IGPU.dll we just call two CUDA functions, cudaSetDevice and cudaDeviceReset, and write a trace file. The code is given below.
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
#include
typedef long long tipI;
using namespace std;
extern “C”
__declspec(dllexport) void __cdecl icsgpu(int* pGPUindex, int* pNprocs){ // 28.03.2017.
char nmbr[256];
itoa(*pGPUindex, nmbr, 10);
string fileName = "_trace_";
fileName.append(nmbr);
fileName.append(".txt");
FILE* trace_FPR = fopen(fileName.c_str(), "w");
int GPUindex = *pGPUindex;
int nGPUs = *pNprocs;
trace_FPR = fopen(fileName.c_str(), "a");
fprintf(trace_FPR, "\n icsgpu - begin: GPUindex = %d, nGPUs = %d \n", GPUindex, nGPUs);
fclose(trace_FPR);
if (cudaSetDevice(GPUindex)){
trace_FPR = fopen(fileName.c_str(), "a");
fprintf(trace_FPR, "\n cudaSetDevice - error \n");
fclose(trace_FPR);
}
trace_FPR = fopen(fileName.c_str(), "a");
fprintf(trace_FPR, "\n cudaSetDevice - success \n");
fclose(trace_FPR);
if (cudaDeviceReset()){
trace_FPR = fopen(fileName.c_str(), "a");
fprintf(trace_FPR, "\n cudaDeviceReset - error \n");
fclose(trace_FPR);
}
trace_FPR = fopen(fileName.c_str(), "a");
fprintf(trace_FPR, "\n icsgpu - end: GPUindex = %d \n", GPUindex);
fclose(trace_FPR);
return;
}
After we run the code on a machine with 8 GTX-680 GPUs and Windows 10, the machine becomes unresponsive and restarts after a few minutes. All trace files (created by the different DLL instances, i.e. for different GPUs) are similar. Each contains only one line:
for 0th GPU: “icsgpu - begin: GPUindex = 0, nGPUs = 8”
for 1st GPU: “icsgpu - begin: GPUindex = 1, nGPUs = 8”
…
So the program gets stuck in the function “cudaSetDevice”. Please advise.