cudaMalloced pointer in one thread not allow cudaMemcpy in another

LHickey · April 13, 2011, 2:40am

A pointer obtained from cudaMalloc in one thread and then passed to a Windows thread
can't be used there to execute cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost);

suppose we do this in the main program thread

/*
 * oktomap - report whether the current CUDA device can map host memory.
 *
 * Looks up the device currently selected for the calling host thread and
 * inspects the canMapHostMemory field of its cudaDeviceProp.
 *
 * Returns 1 when canMapHostMemory is 1; otherwise prints a diagnostic to
 * stderr and returns 0. Failures of the CUDA runtime calls are routed
 * through HANDLE_ERROR, which is defined outside this excerpt.
 */
__host__ int oktomap(void)
{
    cudaDeviceProp prop;
    int whichDevice = 0;

    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (prop.canMapHostMemory != 1)
    {
        fprintf(stderr, "device cant map host memory\n");
        return 0;
    }
    return 1;
}

then load some host memory, here called S

/* sz is not referenced by any of the code shown in this excerpt. */
enum {sz=7};
/* Element count of the host array S; ssz * sizeof(double) is the byte size
   used for the device buffer and for both checksum routines. */
enum {ssz=5898240};
/*
 * loadF - fill the host array S with its test pattern.
 *
 * Every element k in [0, ssz) is assigned DBL_MAX - (double) k.
 *
 * NOTE(review): k is far smaller than the spacing between adjacent doubles
 * near DBL_MAX, so under default rounding each element presumably ends up
 * equal to DBL_MAX - confirm whether a varying pattern was intended.
 */
void loadF(double S[ssz])
{
    int k = 0;

    while (k < ssz)
    {
        S[k] = DBL_MAX - (double) k;
        ++k;
    }
}
/* Forward declaration: checksum over the bytes of the host array S. */
unsigned long cksumCpu(double S[ssz]);
/* Forward declaration: checksum over the device buffer devS after it has been
   copied back to the host; file and line identify the call site in the
   error message. */
unsigned long cksumGpu(void *devS, char *file,int line);

Then cudaMalloc some memory on the GPU, and cudaMemcpy the host memory into devS on the GPU.
{
/* Byte size of the ssz-element array of doubles. */
size_t csz = ssz * sizeof(double);
/* Allocate the device buffer devS; HANDLE_ERROR (defined outside this excerpt)
   receives the runtime status.
   NOTE(review): per the reply later in this thread, with CUDA runtimes before
   4.0 this allocation presumably belongs to the calling thread's context -
   confirm against the toolkit version in use. */
HANDLE_ERROR( cudaMalloc( (void**)&devS , csz));
/* Copy the host array *S into devS. */
HANDLE_ERROR( cudaMemcpy( devS, *S, csz,cudaMemcpyHostToDevice ) );

}

Then take a checksum of the memory on the host, and define a special checksum
routine which takes the checksum of the device memory.
Of course, the device memory must be copied back to the host first.

 /* Report the device buffer range, then the host and device checksums.
    Arguments are cast so each one matches its printf conversion:
    %p takes void *, %lu takes unsigned long, and __LINE__ is an int. */
 printf("devS extends from [%p to %p) (%lu bytes)\n", (void *) devS, (void *) (((char *)devS)+csz), (unsigned long) csz);
 {
 	unsigned long ckCpu = cksumCpu(*S);
 	unsigned long ckGpu = cksumGpu(devS,__FILE__,__LINE__);
 	printf("%s:%d ckCpu=%lu\n", __FILE__,__LINE__,ckCpu);
 	printf("%s:%d ckGpu=%lu\n", __FILE__,__LINE__,ckGpu);
 }

/*
 * cksumGpu - checksum a device buffer by copying it back to the host.
 *
 * devS  device pointer; ssz * sizeof(double) bytes are copied from it.
 * file  source file name of the call site, used only in error messages.
 * line  source line of the call site, used only in error messages.
 *
 * Returns the checksum over the copied bytes, accumulated the same way as
 * in cksumCpu (a running sum of running byte sums) so the two results can
 * be compared directly. If the host staging buffer cannot be allocated or
 * the copy fails, a diagnostic is printed and the process exits with
 * status 1.
 *
 * NOTE(review): per the reply later in this thread, with CUDA runtimes
 * before 4.0 each host thread has its own context, so devS presumably must
 * have been allocated by the calling thread - confirm against the toolkit
 * version in use.
 */
unsigned long cksumGpu(void *devS,char *file,int line)
{
    int me = tidMtl();
    size_t csz = ssz * sizeof(double);
    unsigned long csumDev = 0;
    unsigned long i, x, a, c;
    char *p;
    cudaError_t e;
    void *T = calloc(csz, sizeof(char));

    if (T == NULL)
    {
        printf("%s:%d cksumGpu(tid=%d) cant allocate %lu byte host buffer\n",
               file, line, me, (unsigned long) csz);
        exit(1);
    }

    /* Use the status returned by this very call; cudaGetLastError() could
       instead report an error left behind by an earlier runtime call. */
    e = cudaMemcpy(T, devS, csz, cudaMemcpyDeviceToHost);
    if (e != cudaSuccess)
    {
        const char *es = cudaGetErrorString(e);
        printf("%s:%d cksumGpu(tid=%d) cant copy device mem to host e=%d %s\n",
               file, line, me, (int) e, es);
        free(T);
        exit(1);
    }

    /* The plain-char to unsigned long conversion is kept exactly as in
       cksumCpu so host and device checksums stay comparable. */
    p = (char *) T;
    for (i = 0, a = 0, c = 0; i < csz; i++, p++)
    {
        x = *p;
        a += x;
        c += a;
    }
    csumDev = c;

    free(T);
    return csumDev;
}

The checksum is correct for both the host memory using cksumCpu and the devS using the cksumGpu.

/*
 * cksumCpu - checksum the bytes of the host array S.
 *
 * Walks all ssz * sizeof(double) bytes of S as plain char values. Each byte
 * is converted to unsigned long and added to a running sum, and that running
 * sum is in turn added to the result after every byte.
 *
 * Returns the accumulated result.
 */
unsigned long cksumCpu(double S[ssz])
{
    const size_t nbytes = ssz * sizeof(double);
    const char *bytes = (const char *) S;
    unsigned long running = 0;
    unsigned long total = 0;
    unsigned long idx;

    for (idx = 0; idx < nbytes; ++idx)
    {
        unsigned long value = bytes[idx];

        running += value;
        total += running;
    }
    return total;
}

All looks fine on the host so far.

then start a separate cpu thread.
Here is the thread argument

/* Argument bundle handed to the tPolicyInserter thread through its void * parameter. */
struct toPolicyInserter
{
/* Pointer to the host array of ssz doubles. */
double (*S)[ssz];
/* Device pointer that the thread passes to cksumGpu. */
void *devS;
};

Here is the thread itself
the checksum for the cpu address passed in (S) and devS are both taken

/*
 * tPolicyInserter - thread body that re-checks the host and device checksums.
 *
 * pIarg  pointer to a struct toPolicyInserter carrying the host array
 *        pointer S and the device pointer devS.
 *
 * Prints both pointers, computes cksumCpu(*S) and cksumGpu(devS, ...) and
 * prints the results, then calls sleepMtl(10) and exitMtlTh(). tidMtl,
 * sleepMtl and exitMtlTh are defined outside this excerpt.
 *
 * NOTE(review): per the reply later in this thread, with CUDA runtimes
 * before 4.0 each host thread has its own context, which presumably is why
 * the cudaMemcpy inside cksumGpu is rejected here for a devS allocated by
 * another thread - confirm against the toolkit version in use.
 */
void tPolicyInserter(void *pIarg)
{
    struct toPolicyInserter *Iarg = (struct toPolicyInserter *)pIarg;
    int me = tidMtl();

    printf("tPolicyInserter %d Thread up. \n", me);
    {
        double (*S)[ssz] = Iarg->S;
        void *devS = Iarg->devS;

        printf("S=%p\n", (void *) *S);
        printf("devS=%p\n", devS);
        {
            unsigned long ckCpu = cksumCpu(*S);
            unsigned long ckGpu = cksumGpu(devS, __FILE__, __LINE__);

            /* __LINE__ is an int, so it is printed with %d. */
            printf("%s:%d ckCpu=%lu\n", __FILE__, __LINE__, ckCpu);
            printf("%s:%d ckGpu=%lu\n", __FILE__, __LINE__, ckGpu);
        }
    }
    printf("tPolicyInserter %d Thread will sleep for 10 \n", me);

    sleepMtl(10);
    printf("tPolicyInserter %d Thread  going down\n", me);
    exitMtlTh();
}

The cpu checksum works fine inside the thread, but the gpu checksum dies trying to do
/* Device-to-host copy of csz bytes from devS into T; the status returned by
   this call is checked directly. */
cudaError_t e = cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost);
if( e != cudaSuccess)
{
const char *es = cudaGetErrorString(e);
printf("%s:%d cksumGpu(tid=%d) cant copy device mem to host e=%d %s\n", file,line,me,(int) e,es);
exit(1);
}

here is what the run looks like
1 solo4
2 solo4.cu:73 ok to map host memory
3 devS extends from [000000FC00000000 to 000000FC02D00000) (47185920 bytes)
4 solo4.cu:85 ckCpu=2195193856
5 solo4.cu:86 ckGpu=2195193856
6 tPolicyInserter 1968 Thread up.
7 S=0000000000630040
8 devS=000000FC00000000
9 solo4.cu:52 cksumGpu(tid=1968) cant copy device mem to host e=11 invalid argument

In summary, what do I have to do to allow a pointer obtained from cudaMalloc and passed to a Windows thread
to be used in a cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost); executed by that thread?

Sarnath · April 13, 2011, 6:00am

Pre-CUDA 4.0: each thread has a separate CUDA context, so you can't mix pointers between threads.
CUDA 4.0: one context per device per application. Multiple threads can co-exist in a context, i.e. CUDART is thread-safe.

LHickey · April 13, 2011, 2:52pm

I am running cuda toolkit 3.2

Can/should I upgrade to CUDA 4.0 then to do this?
Is it a beta? What are the downsides?
I have a C2050 on windows 7 64 bit. Will I have to upgrade the driver too?
have driver ver 8.17.12.6081 now (I think).
I have installed CudaToolkit_3.2.16-win64.msi. When I get multiple GPUs, I will need this functionality. The producer-consumer queue that feeds this has allocations in one thread, input buffer preparation in another, consumption in another (the thread that "owns" the particular GPU), and output processing in another CPU thread. This works really well now in a pure CPU context. What do I need to know now if I plan to order (MONEY SPENT) a multiple-GPU system and expect to pass pointers around? I know that when I move to multiple GPUs I will have to allocate GPU buffers on the corresponding device - that's clear, I think, using streams, which have that device parameter - but will
I get surprises in the case of the buffers that are memory mapped on the host? Having obtained a GPU-specific address for host memory (with the Portable option on, I suppose, too), can I get multiple GPU-specific device addresses for the same memory-mapped host memory chunk? Are there any thread-specific problems in this environment, with some blocks cudaMalloced and some types of memory blocks memory mapped, with GPU-specific addresses?
You said there is one CUDA context for all threads per device. What about the multiple-GPU case with multiple CPU threads?

I really really need to know if there are any show stoppers in this before I tell management to order multiple gpu hardware.
By the way, when I post here, should I leave the tabs in the code fragments or convert them to blank fill? What if the code fragments are preceded by line numbers, then the tab field?

Topic		Replies	Views
CUDA + CPU threads CUDA Programming and Performance	5	11695	August 20, 2008
cudaMalloc and threads "invalid device pointer" error CUDA Programming and Performance	4	5453	June 26, 2007
cudaMalloc & cudaMemcpy from different host threads Anything new concerning this issue CUDA Programming and Performance	0	3747	November 6, 2009
Multithreading problem How to use cudaMemcpy() in a new thread? CUDA Programming and Performance	0	1510	April 27, 2009
Reporting a problem with CUDA memory access in multiple OS threads CUDA Programming and Performance	4	4916	April 30, 2007
CUDA with Pthread CUDA Programming and Performance	5	8470	July 24, 2008
Contexts and cudaMallocHost Same rules? CUDA Programming and Performance	17	11273	November 15, 2008
cudaMemcpy returning cudaSuccess even when the copy has not been performed CUDA Programming and Performance	2	11410	June 15, 2009
cudaMemcpy thread safe CUDA Programming and Performance	1	79	October 17, 2024
cudaMemcpy error invalid device pointer CUDA Programming and Performance	2	4910	January 23, 2009

cudaMalloced pointer in one thread not allow cudaMemcpy in another

Related topics