I have S2070 with 4 GPUs and I want to run my program such that one main program spawns 4 pthreads to run kernels on 4 GPUs.
I am getting an error on cudaMemcpy() calls. I am using Red Hat linux.
Here is a program that I developed to tell my understanding of the method:
The result is shown at last…what is wrong with the way I am using pthreads to access multiGPUs ? I am using cuda 4.0.
#include <cstdlib>
#include <stdio.h>
#include "/usr/local/cuda/include/driver_functions.h"
#include </usr/local/cuda/include/cuda.h>
#include </usr/local/cuda/include/cutil.h>
#include <pthread.h>
#include <iostream>
#define NUM_THREADS 2
#define SIZE 100000
#define BLOCKSIZE SIZE/NUM_THREADS
#define divUp(A,B) ((A+B-1)/B)
using namespace std;
void* thread_work(void *param);
// A simple GPU kernel :
__global__ void add_vectors(int *a,int *b,int *x)
{
int Idx = __umul24(blockIdx.x,blockDim.x) + threadIdx.x;
x[Idx] = a[Idx] + b[Idx];
}
typedef struct config
{
int threadid;
int start_index;
int *vector_a;
int *vector_b;
int *vector_x;
}CONFIG;
int main(int argc, char** argv)
{
int A;
for(int i=0;i<SIZE;i++)
A[i] = i;
int B;
for(int i=0;i<SIZE;i++)
B[i] = i;
int X;
for(int i=0;i<SIZE;i++)
X[i] = 0;
CONFIG conf[NUM_THREADS];
// create threads to allocate work :
pthread_t thread[NUM_THREADS]; // create a thread pool (static)
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr,PTHREAD_CREATE_JOINABLE);
int rc[NUM_THREADS];
for(int i=0;i<NUM_THREADS;i++)
{
conf[i].threadid = i;
conf[i].start_index =i*BLOCKSIZE;
conf[i].vector_a = A;
conf[i].vector_b = B;
conf[i].vector_x = X;
if(rc[i] = pthread_create (&thread[i],&attr,thread_work,&conf[i]))
{
printf("\n Couldnot create thread | ERROR CODE : %d",rc[i]);
exit(EXIT_FAILURE);
}
}
for(int i=0;i<NUM_THREADS;i++)
pthread_join(thread[i],NULL);
int Y;
for(int j=0;j<SIZE;j++)
{
Y[j] = A[j] + B[j];
}
for(int k=0;k<SIZE;k++)
{
if(X[k]!=Y[k])
{
cout<<"Failed at "<<k<<" : "<<X[k]<<" : "<<Y[k]<<endl;
}
}
return 0;
}
void* thread_work(void *param)
{
CONFIG * conf = (CONFIG*) param;
int threadid = conf->threadid;
int start_index = conf->start_index;
int *A = conf->vector_a;
int *B = conf->vector_b;
int *X = conf->vector_x;
int* g_A;
int *g_B;
int *g_X;
// CUDA_SAFE_CALL(cudaSetDevice(threadid));
cudaMalloc((void**)&g_A,BLOCKSIZE*sizeof(int));
cudaMalloc((void**)&g_B,BLOCKSIZE*sizeof(int));
cudaMalloc((void**)&g_X,BLOCKSIZE*sizeof(int));
cudaError_t error;
error = cudaGetLastError();
if(error!=cudaSuccess)
{
cout<<"Thread "<<threadid<<" : [FATAL ERROR] Problems in memory initialization - "<<cudaGetErrorString(error)<<endl;
exit(0);
}
cudaMemcpy(&g_A,&(A[start_index]),sizeof(int)*BLOCKSIZE,cudaMemcpyHostToDevice);
cudaMemcpy(&g_B,&(B[start_index]),sizeof(int)*BLOCKSIZE,cudaMemcpyHostToDevice);
error = cudaGetLastError();
if(error!=cudaSuccess)
{
cout<<"Thread "<<threadid<<" : [FATAL ERROR] Problems in memory copy - "<<cudaGetErrorString(error)<<endl;
exit(0);
}
dim3 nthreads(32,1);
dim3 grid(divUp(BLOCKSIZE,nthreads.x),1);
add_vectors<<<grid,nthreads>>>(g_A,g_B,g_X);
error = cudaGetLastError();
if(error!=cudaSuccess)
{
cout<<"Thread "<<threadid<<" : [FATAL ERROR] Problems in kernel - "<<cudaGetErrorString(error)<<endl;
exit(0);
}
cudaMemcpy(&X[start_index],&g_X,sizeof(int)*BLOCKSIZE,cudaMemcpyDeviceToHost);
error = cudaGetLastError();
if(error!=cudaSuccess)
{
cout<<"Thread "<<threadid<<" : [FATAL ERROR] Problems in result memory copy - "<<cudaGetErrorString(error)<<endl;
exit(0);
}
cout<<" thread "<<threadid<<" completed .."<<endl;
}
Result:
./mtgpu_vectadd
Thread 1 : [FATAL ERROR] Problems in memory copy - invalid argument
Thread 0 : [FATAL ERROR] Problems in memory copy - invalid argument