can not execute external cuda process

seehakoo · August 21, 2009, 3:06pm

Hi, everyone,

I write a program to implement executing a external cuda process.

But the process exited before running to cuda function call.

My cuda process code: process1.cu ( just no effect and loop for cuda kernel )

[codebox]

#include

#include <cuda_runtime.h>

#include <cuda.h>

#include <sys/time.h>

#define LOOP_SIZE 39999999

#define LIST_NUM 100

#define GPU_LOOP 1

using namespace std;

#define TIMESTAMP(s); {\

timeval tv;\

char time_str[100];\

gettimeofday(&tv,NULL);\

strftime(time_str, 100, "%F %H:%M:%S", localtime(&tv.tv_sec));\

cout << time_str << ":" << tv.tv_usec << "    KProcess-1: " << s << endl;\

}

global void Kernel_dataCount(long* d_list, int num, clock_t* time)

{

clock_t start = clock();

int index = blockIdx.x * blockDim.x + threadIdx.x;

for(int j = 0; j < GPU_LOOP; j ++){

	for(long i = 0; i < LOOP_SIZE; i ++){

		d_list[index] = num;

	}

}

*time = clock() - start;

}

void dataCount(int num)

{

// GPU clock

clock_t* d_time;

clock_t endTime;

long *d_list;

int size = LIST_NUM * sizeof(long);

long *list = new long[LIST_NUM];

for(int i = 0; i < LIST_NUM; i++){

	list[i] = -1;

}

TIMESTAMP("Start!");

// allocate resource

cudaMalloc((void**) &d_list, size);

cudaMalloc((void**) &d_time, sizeof(clock_t));

cudaMemcpy(d_list, list, size, cudaMemcpyHostToDevice);

// set Grid/Block/Thread

if(LIST_NUM > 256){

	cout << "Error num" << endl;

	exit(1);

}

dim3 dimBlock(LIST_NUM);

dim3 dimGrid(1);

// run Kernel function

TIMESTAMP("Call Kernel Function!");

Kernel_dataCount<<<dimGrid, dimBlock>>>(d_list, num, d_time);

TIMESTAMP("Call Kernel Function done!");

// block until the device has completed

cudaThreadSynchronize();

TIMESTAMP("Synchronize done!");

// Read result from device memory

cudaMemcpy(list, d_list, size, cudaMemcpyDeviceToHost);

cudaMemcpy(&endTime, d_time, sizeof(clock_t), cudaMemcpyDeviceToHost);

TIMESTAMP("cudaMemcpy done!");

for(int i = 0; i < LIST_NUM; i++){

	if (list[i] != num){

		cout << "!!Error!! list[" << i << "]= " << list[i] << endl;

	}

}

// Free device memory

cudaFree(d_list);

TIMESTAMP("cudaFree done!");

}

int main(int argc, char *argv)

{

int num=atoi(argv[1]);

TIMESTAMP("Program Start!");

dataCount(num);

TIMESTAMP("Program Exit!");

return 0;

}

[/codebox]

My main process code: main.cpp

[codebox]

if ((pid = fork()) < 0) {

	perror("fork");

	exit(1);

} else if (pid == 0){					// child-1

	TIMESTAMP("Child process-1 fork!");

	char* argv[] = {(char *)"process1", (char *)"10", NULL};

	char* envp[] = {(char *)"LD_LIBRARY_PATH=/usr/local/cuda/lib", NULL};

	int rt = execve("./process1", argv, envp);

	if (rt == -1)

		cout << "!!ERROR!!" << endl;

	exit(0);

}

[/codebox]

code.tar.gz (209 KB)

The process1 just print out â€œKProcess-1: Program Start!â€ and â€œKProcess-1: Start!â€

I don’t know why it doesn’t work.

And I want to test the correctness about fork many cuda process at the same time.

But now I even one cuda process is not work.

I have no idea to solve the problem.

My SDK version 2.3, g++ version 4.3

Thanks for your helps.

mcleary · August 24, 2009, 5:15pm

Hi, everyone,

I write a program to implement executing a external cuda process.

But the process exited before running to cuda function call.

My cuda process code: process1.cu ( just no effect and loop for cuda kernel )

[codebox]

include

include <cuda_runtime.h>

include <cuda.h>

include <sys/time.h>

define LOOP_SIZE 39999999

define LIST_NUM 100

define GPU_LOOP 1

using namespace std;

define TIMESTAMP(s); {\
timeval tv;\

char time_str[100];\

gettimeofday(&tv,NULL);\

strftime(time_str, 100, "%F %H:%M:%S", localtime(&tv.tv_sec));\

cout << time_str << ":" << tv.tv_usec << "    KProcess-1: " << s << endl;\

}
global void Kernel_dataCount(long* d_list, int num, clock_t* time)

{
clock_t start = clock();

int index = blockIdx.x * blockDim.x + threadIdx.x;

for(int j = 0; j < GPU_LOOP; j ++){

	for(long i = 0; i < LOOP_SIZE; i ++){

		d_list[index] = num;

	}

}

*time = clock() - start;
}

void dataCount(int num)

{
// GPU clock

clock_t* d_time;

clock_t endTime;

long *d_list;

int size = LIST_NUM * sizeof(long);

long *list = new long[LIST_NUM];

for(int i = 0; i < LIST_NUM; i++){

	list[i] = -1;

}

TIMESTAMP("Start!");

// allocate resource

cudaMalloc((void**) &d_list, size);

cudaMalloc((void**) &d_time, sizeof(clock_t));

cudaMemcpy(d_list, list, size, cudaMemcpyHostToDevice);

// set Grid/Block/Thread

if(LIST_NUM > 256){

	cout << "Error num" << endl;

	exit(1);

}

dim3 dimBlock(LIST_NUM);

dim3 dimGrid(1);

// run Kernel function

TIMESTAMP("Call Kernel Function!");

Kernel_dataCount<<<dimGrid, dimBlock>>>(d_list, num, d_time);

TIMESTAMP("Call Kernel Function done!");

// block until the device has completed

cudaThreadSynchronize();

TIMESTAMP("Synchronize done!");

// Read result from device memory

cudaMemcpy(list, d_list, size, cudaMemcpyDeviceToHost);

cudaMemcpy(&endTime, d_time, sizeof(clock_t), cudaMemcpyDeviceToHost);

TIMESTAMP("cudaMemcpy done!");

for(int i = 0; i < LIST_NUM; i++){

	if (list[i] != num){

		cout << "!!Error!! list[" << i << "]= " << list[i] << endl;

	}

}

// Free device memory

cudaFree(d_list);

TIMESTAMP("cudaFree done!");
}

int main(int argc, char *argv)

{
int num=atoi(argv[1]);

TIMESTAMP("Program Start!");

dataCount(num);

TIMESTAMP("Program Exit!");

return 0;
}

[/codebox]

My main process code: main.cpp

[codebox]
if ((pid = fork()) < 0) {

	perror("fork");

	exit(1);

} else if (pid == 0){					// child-1

	TIMESTAMP("Child process-1 fork!");

	char* argv[] = {(char *)"process1", (char *)"10", NULL};

	char* envp[] = {(char *)"LD_LIBRARY_PATH=/usr/local/cuda/lib", NULL};

	int rt = execve("./process1", argv, envp);

	if (rt == -1)

		cout << "!!ERROR!!" << endl;

	exit(0);

}
[/codebox]

[attachment=13310:code.tar.gz]

The process1 just print out â€œKProcess-1: Program Start!â€ and â€œKProcess-1: Start!â€

I don’t know why it doesn’t work.

And I want to test the correctness about fork many cuda process at the same time.

But now I even one cuda process is not work.

I have no idea to solve the problem.

My SDK version 2.3, g++ version 4.3

Thanks for your helps.

I executed your code here, and occurred as you said. I don’t know a lot about fork, but I have two suggestions: First, check if the process1 binary is working well, second, try to use cutilSafeCall to be sure that cuda functions aren’t returning any error.

awaters · August 26, 2009, 12:09am

I’ve downloaded it and tried running it, the child does not execute properly when forked, but it executes fine when ran separately. I’m also experiencing this issue when trying to run a CUDA program through BOINC, since it does the same thing, it forks the child process to execute a CUDA application.

Here is a stack trace from GDB

#0 0x00007f5b6369ae90 in ?? () from /usr/lib/libcuda.so.1
#1 0x00007f5b636a08d4 in ?? () from /usr/lib/libcuda.so.1
#2 0x00007f5b63670e57 in ?? () from /usr/lib/libcuda.so.1
#3 0x00007f5b6361cc97 in ?? () from /usr/lib/libcuda.so.1
#4 0x00007f5b6362e4ab in ?? () from /usr/lib/libcuda.so.1
#5 0x00007f5b6361399f in ?? () from /usr/lib/libcuda.so.1
#6 0x00007f5b6360caea in ?? () from /usr/lib/libcuda.so.1
#7 0x00007f5b63666ab7 in ?? () from /usr/lib/libcuda.so.1
#8 0x00007f5b63ab7aa2 in ?? () from /opt/cuda/lib64/libcudart.so.2
#9 0x00007f5b63ab828c in ?? () from /opt/cuda/lib64/libcudart.so.2
#10 0x00007f5b63a9c5e4 in cudaMalloc () from /opt/cuda/lib64/libcudart.so.2
#11 0x000000000040f9c5 in dataCount (num=1) at process1.cu:51
#12 0x000000000040fe8a in main (argc=2, argv=0x7fff0c81b518) at process1.cu:93

it fails when calling when doing the cudaMalloc, because first it tries to create the CUDA context with cuCtxCreate, this is with

kernel 2.6.30
190.25 nvidia drivers
g++ 4.3.2
CUDA Toolkit 2.3
64 bit
gtx 285

pjwhite · September 1, 2009, 2:59pm

I am having the same symptoms of a segfault when running CUDA inside a worker child process using fork().

If i do:

p = fork()

if( p == 0 )

   setsid()

   // close stdin stdout stderr and reopen to /dev/null

   umask (0)

   exec(myChild....);

else

   mPid = p;

then i get the segfault inside cuda on the first CUDA call. Instead if i do:

system(  "nohup myChild &" );

then everything works as planned, but I do not like using system this way as I have no control or diagnostics over my spawned process. So I guess I am doing my fork wrong as I am pretty sure if i just run myChild in a bash terminal, bash essentially just does a fork() call?

I am using CUDA 2.2, and driver 185.14.08

seehakoo · September 4, 2009, 6:30am

I try to rewrite the process code using driver API.

[codebox]

#include

#include <cuda_runtime.h>

#include <cutil_inline.h>

#include <cuda.h>

#include <sys/time.h>

#define COUNT_SIZE 19999999

#define LIST_NUM 100

#define GPU_LOOP 1

using namespace std;

#define TIMESTAMP(s); {\

timeval tv;\

char time_str[100];\

gettimeofday(&tv,NULL);\

strftime(time_str, 100, "%F %H:%M:%S", localtime(&tv.tv_sec));\

cout << time_str << ":" << tv.tv_usec << "  KProcess-1: " << s << endl;\

}

global void dataCountKernel1(int* d_list, int num)

{

int index = blockIdx.x * blockDim.x + threadIdx.x;

for(int j = 0; j < GPU_LOOP; j ++){

	for(long i = 0; i < COUNT_SIZE; i ++){

		d_list[index] = num;

	}

}

}

void CallnvDevice(int num) {

int threadPerBlock = LIST_NUM;

int BlockPerGrid = 1;

CUdevice 	hcuDevice = 0;

CUcontext  	hcuContext  = 0;

CUmodule   	hcuModule   = 0;

CUfunction 	hcuFunction = 0;

CUdeviceptr dptr       = 0;

int list[100];

for(int i = 0 ; i < 100; i++){

	list[i] = 0;

}

// Initialize

TIMESTAMP("Initialize");

if (cuInit(0) != CUDA_SUCCESS)

	exit(0);

// Get handle for device 0

TIMESTAMP("Get device handle");

CU_SAFE_CALL( cuDeviceGet(&hcuDevice, 0));

// cuCtxCreate: Function works on floating contexts and current context

TIMESTAMP("CtxCreate");

CU_SAFE_CALL( cuCtxCreate( &hcuContext, CU_CTX_SCHED_YIELD, hcuDevice));

// Load module fileName

TIMESTAMP("ModuleLoad");

CU_SAFE_CALL( cuModuleLoad(&hcuModule, "process1.ptx"));

// Get Function Name from module

TIMESTAMP("ModuleGetFunction");

CU_SAFE_CALL( cuModuleGetFunction( &hcuFunction, hcuModule, "_Z16dataCountKernel1Pii"));

// Alloc cuda memory

TIMESTAMP("Call cuMemAlloc");

CU_SAFE_CALL( cuMemAlloc( &dptr, LIST_NUM*sizeof(int) ));

// Copy data from host memory to device memory

CU_SAFE_CALL( cuMemcpyHtoD(dptr, list, LIST_NUM*sizeof(int)));

// Set cuda resource

CU_SAFE_CALL( cuFuncSetBlockShape(hcuFunction, threadPerBlock, 1, 1));

// Invoke kernel

#define ALIGN_OFFSET(offset, alignment) \

(offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

int offset = 0;

void* ptr;

ptr = (void*)(size_t)(dptr);

ALIGN_OFFSET(offset, __alignof(ptr));

cuParamSetv(hcuFunction, offset, &ptr, sizeof(ptr));

offset += sizeof(ptr);

ALIGN_OFFSET(offset, __alignof(num));

cuParamSeti(hcuFunction, offset, num);

offset += sizeof(num);

cuParamSetSize(hcuFunction, offset);

// run Kernel function

TIMESTAMP("Run Kernel function!");

CU_SAFE_CALL( cuLaunchGrid(hcuFunction, BlockPerGrid, 1));

// Copy result from device memory to host memory

cuMemcpyDtoH(list, dptr, LIST_NUM*sizeof(int));

cout << "List: " << list[10] << endl;

// Free device memory

cuMemFree(dptr);

}

int main(int argc, char *argv)

{

int num=atoi(argv[1]);

TIMESTAMP("Program Start!");

CallnvDevice(num);

TIMESTAMP("Program Exit!");

return 0;

}

[/codebox]

But it still be segfault when run to “cuModuleLoad(&hcuModule, “process1.ptx”)”.

and i try to replace system() function call bellow.

[codebox]

#include <signal.h>

          int system(const char *cmd)

          {

              int stat;

              pid_t pid;

              struct sigaction sa, savintr, savequit;

              sigset_t saveblock;

              if (cmd == NULL)

                  return(1);

              sa.sa_handler = SIG_IGN;

              sigemptyset(&sa.sa_mask);

              sa.sa_flags = 0;

              sigemptyset(&savintr.sa_mask);

              sigemptyset(&savequit.sa_mask);

              sigaction(SIGINT, &sa, &savintr);

              sigaction(SIGQUIT, &sa, &savequit);

              sigaddset(&sa.sa_mask, SIGCHLD);

              sigprocmask(SIG_BLOCK, &sa.sa_mask, &saveblock);

              if ((pid = fork()) == 0) {

                  sigaction(SIGINT, &savintr, (struct sigaction *)0);

                  sigaction(SIGQUIT, &savequit, (struct sigaction *)0);

                  sigprocmask(SIG_SETMASK, &saveblock, (sigset_t *)0);

//execl(“/bin/sh”, “sh”, “-c”, cmd, (char *)0);

                  //there is my replace

                  char* argv[] = {(char *)"process1", (char *)"10", NULL};

                  char* envp[] = {(char *)"LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib", NULL};

                  int rt = execve("./process1", argv, envp);

_exit(127);

              }

              if (pid == -1) {

                  stat = -1; /* errno comes from fork() */

              } else {

                  while (waitpid(pid, &stat, 0) == -1) {

                      if (errno != EINTR){

                          stat = -1;

                          break;

                      }

                  }

              }

              sigaction(SIGINT, &savintr, (struct sigaction *)0);

              sigaction(SIGQUIT, &savequit, (struct sigaction *)0);

              sigprocmask(SIG_SETMASK, &saveblock, (sigset_t *)0);

              return(stat);

          }

[/codebox]

it is still not work.

Maybe cuda process is not be supported by “exe*” system call, i don’t know.

So, it is no solution for fork-exe* method?

flo_n · October 13, 2009, 2:43pm

Did someone find a solution for this CUDA fork-exec problem?
Any help would be very much appreciated.
Flo

mahnaz · February 17, 2010, 12:31am

Hello all,

I have the same problem and have searched the forums for an answer. If anyone has found an answer could you please point me to it?

Thanks.

Ellie · February 22, 2010, 9:02pm

Inside cuCtxCreate(), /proc//cmdline is opened and the name found there is used in a series of calls to stat() until it is found in either the current directory or in a directory listed in your PATH. If stat() fails to find it then cuCtxCreate() will hang. I don’t know why it hangs, just that it does.

Reasons why the stat() call can fail:

your program changes its current working directory away from the directory in which it resides, eg, by calling daemon() or chdir()
your program fork/exec’s but the name given as “arg0” to exec( char *path, char *arg0, …) doesn’t exist in PATH

Maybe this should be reported as a bug? cuCtxCreate shouldn’t ‘hang’ if stat() fails.

Topic		Replies	Views
cuda.h error message CUDA Programming and Performance	9	6026	October 22, 2009
Issue when calling cublasDdot from within kernel GPU-Accelerated Libraries	7	925	March 21, 2018
Cuda kernel is not working and tried to detect errors using gpuAsset() but, no error message CUDA Programming and Performance	14	2862	December 31, 2017
newbie in Cuda needs help with 2D arrays CUDA Programming and Performance	9	921	March 9, 2018
what's wrong with my code? CUDA Programming and Performance	6	3955	July 7, 2009
unable to get the cpu and gpu to run in parallel CUDA Programming and Performance	34	23203	October 7, 2010
cudaSynchronizeDevice() returns error code 6 CUDA Programming and Performance	7	8601	June 16, 2011
CUDA compile trouble CUDA Programming and Performance	47	5116	November 8, 2010
Calling CUDA functions from a C file CUDA Programming and Performance	19	29130	March 4, 2015
Annoying problems with memory and/or syntax CUDA Programming and Performance	19	4769	April 8, 2008

can not execute external cuda process

Related topics