Hello,
- I expected the transfer to take less time when using pinned memory, but I'm getting almost the same time for both pageable memory and pinned memory.
Why is it not making any difference?
// Host-side staging buffer; main() allocates it as pinned (page-locked) memory.
int *host_p = nullptr;
// Device-side destination buffer; main() allocates it with cudaMalloc.
int *dev_p = nullptr;
/*
 * Copies a small integer buffer from pinned host memory to device memory
 * with cudaMemcpyAsync on an explicitly created stream, waits for the copy
 * to complete, and then releases every resource it acquired.
 *
 * Returns 0 when every CUDA runtime call succeeded and 1 otherwise.
 *
 * To compare against pageable memory, replace cudaMallocHost/cudaFreeHost
 * with malloc/free (requires <stdlib.h>).
 *
 * NOTE(review): only 4 ints (16 bytes) are transferred. At this size the
 * measured time is most likely dominated by fixed per-call overhead rather
 * than by transfer bandwidth, which would explain seeing no difference
 * between pinned and pageable memory -- try a buffer of many megabytes and
 * time only the copy (e.g. with CUDA events) to confirm.
 */
int main(void) {
    const int element_count = 4;
    const int data_size = element_count * sizeof(int);

    cudaStream_t stream1 = 0;
    cudaError_t status = cudaStreamCreate(&stream1);
    const bool stream_created = (status == cudaSuccess);

    // Each step runs only if everything before it succeeded; the first
    // failure is kept in `status` and decides the exit code.
    if (status == cudaSuccess) {
        status = cudaMallocHost(&host_p, data_size);  // pinned memory
    }
    if (status == cudaSuccess) {
        // Give the source buffer defined contents before it is copied.
        for (int i = 0; i < element_count; ++i) {
            host_p[i] = i;
        }
        status = cudaMalloc(&dev_p, data_size);
    }

    /* Transfer data host_p --> dev_p */
    if (status == cudaSuccess) {
        // Issue the copy on the stream that was created for it instead of
        // on the default stream (0).
        status = cudaMemcpyAsync(dev_p, host_p, data_size,
                                 cudaMemcpyHostToDevice, stream1);
    }
    if (status == cudaSuccess) {
        // The copy may still be in flight when cudaMemcpyAsync returns;
        // wait for it before the source buffer is released below.
        status = cudaStreamSynchronize(stream1);
    }

    // Cleanup runs on every path. An error reported here is only recorded
    // if nothing failed earlier.
    if (host_p != 0) {
        const cudaError_t free_host_status = cudaFreeHost(host_p);
        host_p = 0;
        if (status == cudaSuccess) {
            status = free_host_status;
        }
    }
    if (dev_p != 0) {
        const cudaError_t free_dev_status = cudaFree(dev_p);
        dev_p = 0;
        if (status == cudaSuccess) {
            status = free_dev_status;
        }
    }
    if (stream_created) {
        const cudaError_t destroy_status = cudaStreamDestroy(stream1);
        if (status == cudaSuccess) {
            status = destroy_status;
        }
    }

    return (status == cudaSuccess) ? 0 : 1;
}
- If I issue a cudaMemcpyAsync operation followed by a kernel that uses the same data, both in the default stream,
will these two operations be serialized, or will the kernel execute with the previous data?
If I use different streams for the above two operations, what kind of behaviour can I expect?