a cudaMalloced pointer, passed to a windows thread,
cant execute cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost);
- suppose we do this in the main program thread
host int oktomap(void)
{
cudaDeviceProp prop;
int whichDevice=0;
HANDLE_ERROR (cudaGetDevice( &whichDevice));
HANDLE_ERROR (cudaGetDeviceProperties( &prop,whichDevice));
if ( prop.canMapHostMemory != 1)
{
fprintf(stderr,“device cant map host memory\n”);
return(0);
}
return (1);
};
- then load some host memory, here called S
enum {sz=7};
enum {ssz=5898240};
void loadF(double S[ssz])
{
int i;
for ( i=0;i< ssz;i++)
S[i] = DBL_MAX - (double) i;
}
unsigned long cksumCpu(double S[ssz]);
unsigned long cksumGpu(void *devS, char *file,int line);
-
then cuda malloc some memory on the gpu. and cudaMemcpy the memory from the host to the devS on the gpu.
{
size_t csz = ssz * sizeof(double);
HANDLE_ERROR( cudaMalloc( (void**)&devS , csz));
HANDLE_ERROR( cudaMemcpy( devS, *S, csz,cudaMemcpyHostToDevice ) );}
-
Then take a checksum of the memory on the host, and define special check sum
routine which takes the checksum of the device memory
Of course it must be copied back to the host first.printf("devS extends from [%p to %p) (%lu bytes)\n", (char *) devS, ((char *)devS)+csz,csz); { unsigned long ckCpu = cksumCpu(*S); unsigned long ckGpu = cksumGpu(devS,__FILE__,__LINE__); printf("%s:%ld ckCpu=%lu\n", __FILE__,__LINE__,ckCpu); printf("%s:%ld ckGpu=%lu\n", __FILE__,__LINE__,ckGpu); }
unsigned long cksumGpu(void *devS,char *file,int line)
{
int me = tidMtl();
size_t csz = ssz * sizeof(double);
void *T = calloc( csz,sizeof(char));
unsigned long csumDev = 0;
{
cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost);
cudaError_t e= cudaGetLastError();
if( e != cudaSuccess)
{
const char *es = cudaGetErrorString(e);
printf(“%s:%ld cksumGpu(tid=%ld) cant copy device mem to host e=%ld %s\n”, file,line,me,e,es);
exit(1);
}
else
{
unsigned long i,x,a,c;
char *p = (char *) T;
for ( i=0,a=0,c=0;i<csz;i++,p++)
{
x = *p;
a += x;
c+=a;
}
csumDev = c;
}
}
free (T);
return (csumDev);
}
The checksum is correct for both the host memory using cksumCpu and the devS using the cksumGpu.
unsigned long cksumCpu(double S[ssz])
{
size_t csz = ssz * sizeof(double);
unsigned long i,c,x,a;
char *p = (char *) S;
for ( i=0,c=0,x=0,a=0; i<csz;i++,p++)
{
x = *p;
a +=x;
c+=a;
}
return c;
}
All looks fine on the host so far.
- then start a separate cpu thread.
Here is the thread argument
struct toPolicyInserter
{
double (*S)[ssz];
void *devS;
};
Here is the thread itself
the checksum for the cpu address passed in (S) and devS are both taken
void tPolicyInserter(void *pIarg)
{
struct toPolicyInserter *Iarg = (struct toPolicyInserter *)pIarg;
int me = tidMtl();
printf(“tPolicyInserter %ld Thread up. \n”,me);
{
double (*S)[ssz] = Iarg->S;
void *devS = Iarg->devS;
printf(“S=%p\n”, *S );
printf(“devS=%p\n”, devS);
{
unsigned long ckCpu = cksumCpu(*S);
unsigned long ckGpu = cksumGpu(devS,FILE,LINE);
printf(“%s:%ld ckCpu=%lu\n”, FILE,LINE,ckCpu);
printf(“%s:%ld ckGpu=%lu\n”, FILE,LINE,ckGpu);
}
}
printf(“tPolicyInserter %ld Thread will sleep for 10 \n”,me);
sleepMtl(10);
printf("tPolicyInserter %ld Thread going down\n",me);
exitMtlTh();
}
The cpu checksum works fine inside the thread, but the gpu checksum dies trying to do
cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost);
cudaError_t e= cudaGetLastError();
if( e != cudaSuccess)
{
const char *es = cudaGetErrorString(e);
printf(“%s:%ld cksumGpu(tid=%ld) cant copy device mem to host e=%ld %s\n”, file,line,me,e,es);
exit(1);
}
here is what the run looks like
1 solo4
2 solo4.cu:73 ok to map host memory
3 devS extends from [000000FC00000000 to 000000FC02D00000) (47185920 bytes)
4 solo4.cu:85 ckCpu=2195193856
5 solo4.cu:86 ckGpu=2195193856
6 tPolicyInserter 1968 Thread up.
7 S=0000000000630040
8 devS=000000FC00000000
9 solo4.cu:52 cksumGpu(tid=1968) cant copy device mem to host e=11 invalid argument
In summary, what do I have to do to allow cudaMalloced pointers, passed to a windows thread,
to execute a cudaMemcpy( T, devS, csz,cudaMemcpyDeviceToHost);