CUDA with Pthread

Hi all,
Please see my code. I allocate device memory in the main program and then pass the pointer (a pointer to memory on the device) to a thread. From within this thread I want to access the device memory, or transfer data from host memory to device memory.
For the following code I am getting an “invalid device pointer” error at runtime. Can anybody tell me where I am going wrong and how to solve it?

//////////////////////////////////////////////code////////////////////////////////////////////////
//------------------------global section------------------------
// Host-side matrix buffer; assigned inside doWorkCudaMultiplication.
float *hMatForMult = 0;
// Device-side matrix buffer; allocated with cudaMalloc in main.
float *dMatForMult = 0;
// Number of rows (and of columns) of the square matrix; set in main.
int matDimension = 0;
//--------------------------------------------------------------------

/*
 * Thread entry point: allocates the host matrix and copies it into the
 * device buffer whose address is passed in.
 *
 * ptr     - device pointer (as returned by cudaMalloc) that receives
 *           matDimension * matDimension floats.
 * returns - always NULL; failures of the copy are handled by SAFE_CALL.
 *
 * The signature is void *(*)(void *) because that is the only start-routine
 * type pthread_create accepts.
 *
 * NOTE(review): the poster reports "invalid device pointer" at the cudaMemcpy
 * below; the replies attribute it to the buffer having been allocated in a
 * different host thread than the one doing the copy - confirm against the
 * CUDA toolkit version in use.
 */
void *doWorkCudaMultiplication(void *ptr)
{
    // Do the arithmetic in size_t so dimension * dimension cannot overflow int.
    const size_t elementCount = (size_t)matDimension * (size_t)matDimension;

    // The trailing () value-initializes the array to 0.0f, so the copy below
    // never transfers indeterminate host memory to the device.
    hMatForMult = new float[elementCount]();

    SAFE_CALL( cudaMemcpy( ptr, (void*)hMatForMult, elementCount * sizeof(float), cudaMemcpyHostToDevice ));

    // Returning from the start routine ends the thread, like pthread_exit(NULL).
    return NULL;
}// end of doWorkCuda

int main(int argc, char* argv)
{
pthread_t threadPtrMult;
matDimension = 100;

//---------allocating memory in device.
cudaMalloc((void**)&dMatForMult , matDimension * matDimension * sizeof(float)) ;

int errorCudaThread = pthread_create(&threadPtrMult, NULL, doWorkCudaMultiplication,(void*) dMatForMult);

if( errorCudaThread )
{
cout<<"\n Error : return code from pthread_create() is "<<errorCudaThread;
exit( -1);
}
pthread_join(threadPtrMult,NULL);

pthread_exit(NULL);

}// end of main

//////////////////////////////////////output/////////////////////////////////
in line 80 : invalid device pointer. :argh:
[ line 80 is where we are calling cudaMemcpy() ]

I guess the problem is that you’re allocating device memory in one thread and using it from another thread (at least, that seems to be why “invalid device pointer” is reported on Linux for your code)… You’d probably want to structure your code in the following way (I tried to correct your code as much as possible, but it’s hard to do when one does not know what the code is meant to accomplish; also, as a side note and general advice: remember to always check the return value of any function that returns some kind of error indication):

#include <assert.h>

#include <stdlib.h>

#include <pthread.h>

#include <cuda_runtime.h>

#define SIZE 100

/* Square matrix handed to the worker thread as its single argument. */
typedef struct Matrix {
    int    size_;    /* number of rows (and of columns) */
    float *values_;  /* host buffer holding size_ * size_ floats */
} Matrix;

void           *

run(void *arguments)

{

    cudaError       error;

   Matrix         *matrix = (Matrix *) arguments;

    int             size = matrix->size_;

    float          *values = matrix->values_;

   // probably set CUDA device here, as there is no reason to mess with

    // pthreads except to be able to run kernel on multiple devices...

   float          *values_d;

    error = cudaMalloc((void **) &values_d, size * size * sizeof(float));

    assert(error == cudaSuccess);

   error =

	cudaMemcpy(values_d, values, size * size * sizeof(float),

     cudaMemcpyHostToDevice);

    assert(error == 0);

   // probably launch kernel using "values_d" here...

   // probably copy results to host memory here...

   error = cudaFree(values_d);

    assert(error == 0);

   pthread_exit(NULL);

}

int

main(void)

{

    int             error;

   pthread_attr_t  attributes;

    error = pthread_attr_init(&attributes);

    assert(error == 0);

    error =

	pthread_attr_setdetachstate(&attributes, PTHREAD_CREATE_JOINABLE);

    assert(error == 0);

   Matrix         *matrix = (Matrix *) malloc(sizeof(Matrix));

    assert(matrix != NULL);

   matrix->size_ = SIZE;

    matrix->values_ = (float *) malloc(SIZE * SIZE * sizeof(float));

    assert(matrix->values_ != NULL);

   // probably initialize matrix->values_ here...

   pthread_t       thread;

    error = pthread_create(&thread, &attributes, run, matrix);

    assert(error == 0);

   pthread_attr_destroy(&attributes);

   error = pthread_join(thread, NULL);

    assert(error == 0);

   // probably collect results from all threads here...

   free(matrix->values_);

    free(matrix);

   pthread_exit(NULL);

}

You’re passing device contexts around between threads—to do that you need to use the thread migration API introduced in 2.0b.

Thanks for your reply. I will modify my code the way you suggested.

Dear tmurray,
Thanks for your reply. It gave me some indication of the error in my code. I would like to know which specific APIs I can use to solve this problem. :wave:

I am not able to find the Thread Migration API in the 2.0 reference manual. :sad:
sam

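There is an example in the SDK (the threadMigration sample, which moves a context between host threads with the driver API calls cuCtxPopCurrent and cuCtxPushCurrent).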