memcopy fails in multiple pthreads with cudaSetDevice() i m unable to use pthread with multiple GPUs

I have S2070 with 4 GPUs and I want to run my program such that one main program spawns 4 pthreads to run kernels on 4 GPUs.

I am getting an error on cudaMemcpy() calls. I am using Red Hat linux.

Here is a program that I developed to tell my understanding of the method:

The result is shown at last…what is wrong with the way I am using pthreads to access multiGPUs ? I am using cuda 4.0.

#include <cstdlib>

#include <stdio.h>

#include "/usr/local/cuda/include/driver_functions.h"

#include </usr/local/cuda/include/cuda.h>

#include </usr/local/cuda/include/cutil.h>

#include <pthread.h>

#include <iostream>

#define NUM_THREADS 2

#define SIZE 100000

#define BLOCKSIZE SIZE/NUM_THREADS

#define divUp(A,B) ((A+B-1)/B)

using namespace std;

void* thread_work(void *param);

// A simple GPU kernel :

__global__ void add_vectors(int *a,int *b,int *x)

{

   int Idx = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;

x[Idx] = a[Idx] + b[Idx];

}

typedef struct config

{

   int threadid;

   int start_index;

   int *vector_a;

   int *vector_b;

   int *vector_x;

}CONFIG;

int main(int argc, char** argv)

{

   int A;

   for(int i=0;i<SIZE;i++)

     A[i] = i;

int B;

   for(int i=0;i<SIZE;i++)

     B[i] = i;

int X;

   for(int i=0;i<SIZE;i++)

     X[i] = 0;

CONFIG conf[NUM_THREADS];

// create threads to allocate work :

   pthread_t thread[NUM_THREADS];  // create a thread pool (static)

   pthread_attr_t attr;

   pthread_attr_init(&attr);

   pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);

int rc[NUM_THREADS];

for(int i=0;i<NUM_THREADS;i++)

   {

       conf[i].threadid = i;

       conf[i].start_index =i*BLOCKSIZE;

       conf[i].vector_a = A;

       conf[i].vector_b = B;

       conf[i].vector_x = X;

      if(rc[i] = pthread_create (&thread[i],&attr,thread_work,&conf[i]))

      {

       printf("\n Couldnot create thread | ERROR CODE : %d",rc[i]);

       exit(EXIT_FAILURE);

      }

   }

for(int i=0;i<NUM_THREADS;i++)

        pthread_join(thread[i],NULL);

int Y;

   for(int j=0;j<SIZE;j++)

   {

       Y[j] = A[j] + B[j];

   }

for(int k=0;k<SIZE;k++)

   {

       if(X[k]!=Y[k])

       {

           cout<<"Failed at "<<k<<" : "<<X[k]<<" : "<<Y[k]<<endl;

       }

   }

return 0;

}

void* thread_work(void *param)

{

   CONFIG * conf = (CONFIG*) param;

int threadid = conf->threadid;

   int start_index = conf->start_index;

   int *A = conf->vector_a;

   int *B = conf->vector_b;

   int *X = conf->vector_x;

int* g_A;

   int *g_B;

   int *g_X;

//    CUDA_SAFE_CALL(cudaSetDevice(threadid));

cudaMalloc((void**)&g_A,BLOCKSIZE*sizeof(int));

   cudaMalloc((void**)&g_B,BLOCKSIZE*sizeof(int));

   cudaMalloc((void**)&g_X,BLOCKSIZE*sizeof(int));

cudaError_t error;

   error = cudaGetLastError();

   if(error!=cudaSuccess)

   {

          cout<<"Thread "<<threadid<<" :  [FATAL ERROR] Problems in memory initialization - "<<cudaGetErrorString(error)<<endl;

          exit(0);

   }

cudaMemcpy(&g_A,&(A[start_index]),sizeof(int)*BLOCKSIZE,cudaMemcpyHostToDevice);

   cudaMemcpy(&g_B,&(B[start_index]),sizeof(int)*BLOCKSIZE,cudaMemcpyHostToDevice);

error = cudaGetLastError();

   if(error!=cudaSuccess)

   {

          cout<<"Thread "<<threadid<<" : [FATAL ERROR] Problems in memory copy - "<<cudaGetErrorString(error)<<endl;

          exit(0);

   }

dim3 nthreads(32,1);

   dim3 grid(divUp(BLOCKSIZE,nthreads.x),1);

add_vectors<<<grid,nthreads>>>(g_A,g_B,g_X);

error = cudaGetLastError();

   if(error!=cudaSuccess)

   {

          cout<<"Thread "<<threadid<<" :  [FATAL ERROR] Problems in kernel - "<<cudaGetErrorString(error)<<endl;

          exit(0);

   }

cudaMemcpy(&X[start_index],&g_X,sizeof(int)*BLOCKSIZE,cudaMemcpyDeviceToHost);

error = cudaGetLastError();

   if(error!=cudaSuccess)

   {

          cout<<"Thread "<<threadid<<" : [FATAL ERROR] Problems in result memory copy - "<<cudaGetErrorString(error)<<endl;

          exit(0);

   }

cout<<" thread "<<threadid<<" completed .."<<endl;

}

Result:

./mtgpu_vectadd

Thread 1 : [FATAL ERROR] Problems in memory copy - invalid argument

Thread 0 : [FATAL ERROR] Problems in memory copy - invalid argument

cudaMemcpy deals with pointers, not pointers to pointers. You should be passing g_A, not &g_A. (Same for g_B)

That was so silly of me…

but my actual code doesnt have an ‘&’ before the device pointers. It gives a

“segmentation fault error”… can u please tell whether my code is semantically correct?

amean can we use pthreads with cuda in this way ??

I don’t see anything obviously wrong (although I’m not sure why you commented out the cudaSetDevice line). This is how multi-GPU CUDA is usually done.

You’ll want to compile your code in host debug mode (nvcc -g) and run it in GDB so you can see which line is segfaulting.

Thanx seibert … That commenting was a typo mistake…

so i will compile in debug mode and come back here… :)

Ya…so there is no segfault in this program now … it works…until I keep everything in one file…

but the problem in real environment is this:

I have a class defined (c++) which uses all cuda functions declared as extern “C” in a .cu file. This class is invoked from the main function inside pthreads.

The problem is that in the .cu file, I have globally declared texture references for images so that it can be used inside kernel and the other functions.

Now when I invoke two threads with different datasets on two devices, the texture references are common and hence a segmentation fault happens when one thread frees up the global texture while the other is in process.

I have a design issue here… How to I seperate C++ code from a cuda code which uses texture references ???

Example:

Main calls two threads on two images with abc object:

abc.cpp uses cudaabc.cu

cudaabc.cu has a texture refenrence declared as global. …

kindly help. please ask if further clarification is required.