cudaMalloced pointer in one thread not allow cudaMemcpy in another

a cudaMalloced pointer, passed to a windows thread,
cant execute cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost);

  1. suppose we do this in the main program thread

host int oktomap(void)
{
cudaDeviceProp prop;
int whichDevice=0;
HANDLE_ERROR (cudaGetDevice( &whichDevice));
HANDLE_ERROR (cudaGetDeviceProperties( &prop,whichDevice));
if ( prop.canMapHostMemory != 1)
{
fprintf(stderr,“device cant map host memory\n”);
return(0);
}
return (1);
};

  1. then load some host memory, here called S

enum {sz=7};
enum {ssz=5898240};
void loadF(double S[ssz])
{
int i;
for ( i=0;i< ssz;i++)
S[i] = DBL_MAX - (double) i;
}
unsigned long cksumCpu(double S[ssz]);
unsigned long cksumGpu(void *devS, char *file,int line);

  1. then cuda malloc some memory on the gpu. and cudaMemcpy the memory from the host to the devS on the gpu.
    {
    size_t csz = ssz * sizeof(double);
    HANDLE_ERROR( cudaMalloc( (void**)&devS , csz));
    HANDLE_ERROR( cudaMemcpy( devS, *S, csz,cudaMemcpyHostToDevice ) );

    }

  2. Then take a checksum of the memory on the host, and define special check sum
    routine which takes the checksum of the device memory
    Of course it must be copied back to the host first.

     printf("devS extends from [%p to %p) (%lu bytes)\n", (char *) devS, ((char *)devS)+csz,csz);
     {
     	unsigned long ckCpu = cksumCpu(*S);
     	unsigned long ckGpu = cksumGpu(devS,__FILE__,__LINE__);
     	printf("%s:%ld ckCpu=%lu\n", __FILE__,__LINE__,ckCpu);	
     	printf("%s:%ld ckGpu=%lu\n", __FILE__,__LINE__,ckGpu);	
     }
    

unsigned long cksumGpu(void *devS,char *file,int line)
{
int me = tidMtl();
size_t csz = ssz * sizeof(double);
void *T = calloc( csz,sizeof(char));
unsigned long csumDev = 0;
{
cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost);
cudaError_t e= cudaGetLastError();
if( e != cudaSuccess)
{
const char *es = cudaGetErrorString(e);
printf("%s:%ld cksumGpu(tid=%ld) cant copy device mem to host e=%ld %s\n", file,line,me,e,es);
exit(1);
}
else
{
unsigned long i,x,a,c;
char *p = (char *) T;
for ( i=0,a=0,c=0;i<csz;i++,p++)
{
x = *p;
a += x;
c+=a;
}
csumDev = c;
}
}
free (T);
return (csumDev);
}

The checksum is correct for both the host memory using cksumCpu and the devS using the cksumGpu.

unsigned long cksumCpu(double S[ssz])
{
size_t csz = ssz * sizeof(double);
unsigned long i,c,x,a;
char *p = (char *) S;
for ( i=0,c=0,x=0,a=0; i<csz;i++,p++)
{
x = *p;
a +=x;
c+=a;
}
return c;
}

All looks fine on the host so far.

  1. then start a separate cpu thread.
    Here is the thread argument

struct toPolicyInserter
{
double (*S)[ssz];
void *devS;
};

Here is the thread itself
the checksum for the cpu address passed in (S) and devS are both taken

void tPolicyInserter(void *pIarg)
{
struct toPolicyInserter *Iarg = (struct toPolicyInserter *)pIarg;
int me = tidMtl();
printf(“tPolicyInserter %ld Thread up. \n”,me);
{
double (*S)[ssz] = Iarg->S;
void *devS = Iarg->devS;
printf(“S=%p\n”, *S );
printf(“devS=%p\n”, devS);
{
unsigned long ckCpu = cksumCpu(*S);
unsigned long ckGpu = cksumGpu(devS,FILE,LINE);
printf("%s:%ld ckCpu=%lu\n", FILE,LINE,ckCpu);
printf("%s:%ld ckGpu=%lu\n", FILE,LINE,ckGpu);
}
}
printf(“tPolicyInserter %ld Thread will sleep for 10 \n”,me);

sleepMtl(10);
printf("tPolicyInserter %ld Thread  going down\n",me);
exitMtlTh();

}

The cpu checksum works fine inside the thread, but the gpu checksum dies trying to do
cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost);
cudaError_t e= cudaGetLastError();
if( e != cudaSuccess)
{
const char *es = cudaGetErrorString(e);
printf("%s:%ld cksumGpu(tid=%ld) cant copy device mem to host e=%ld %s\n", file,line,me,e,es);
exit(1);
}

here is what the run looks like
1 solo4
2 solo4.cu:73 ok to map host memory
3 devS extends from [000000FC00000000 to 000000FC02D00000) (47185920 bytes)
4 solo4.cu:85 ckCpu=2195193856
5 solo4.cu:86 ckGpu=2195193856
6 tPolicyInserter 1968 Thread up.
7 S=0000000000630040
8 devS=000000FC00000000
9 solo4.cu:52 cksumGpu(tid=1968) cant copy device mem to host e=11 invalid argument

In summary, what do I have to do to allow cudaMalloced pointers, passed to a windows thread,
to execute a cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost);

Pre CUDA 4.0 - Each thread has a separate CUDA context. You cant mix pointers.
CUDA 4.0 - One context per device per Application - Multiple threads can co-exist in a context…i.e CUDART is thread-safe.

I am running cuda toolkit 3.2

  1. Can /should I upograde to cuda 4.0 then to do this?
  2. Is it a beta? What are the downsides?
  3. I have a C2050 on windows 7 64 bit. Will I have to upgrade the driver too?
    have driver ver 8.17.12.6081 now (I think).
    Have installed CudaToolkit_3.2.16-win64.msi. When I get multiple gpu’s, I will need this functionality. The producer consumer queue that feeds this has allocations in one thread. input buffer preparation in another, consumption in another , the thread that “owns” the particular gpu, and output processing in another cpu thread. This works really well now in a pure cpu context. What do I need to know now if I plan to order (MONEY SPENT) a multiple gpu system and expect to pass pointer around. I know that when I move to multiple gpu’s I will have to allocate gpu buffers on the corresponding device- thats clear I think, using streams which has that device parameter, but will
    I get surprises in the case of the buffers that are memory mapped on the host. Having obtained a gpu specific address from for host memory (with the Portable option on suppose too) can I get multiple gpu specific device addrress for the same memory mapped host memory chunk? Are there any thread specific problems in this environment- with some blocks cudaMalloced, some types of memory blocks memory mapped, with gpu specific addresses?
    What you said about one cuda context for all threads per device, What about the multiple gpu device case with multiple cpu thread?

I really really need to know if there are any show stoppers in this before I tell management to order multiple gpu hardware.
By the way, when I post here, should I leave the tabs in the code fragments or convert to blank fill? What if the code fragments are preceeded with line numbers, then the tab field?