Hi, I’m trying to get asynchronous memory copies to work correctly, but have been unable to do so. I am using a GeForce 8800 GTX. It has compute capability 1.0, but asynchronous memory copies should still work, right?
Here is a sample program that does not behave as I expect:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime_api.h>
/*
 * SAFE_CALL(call): evaluates the CUDA runtime call `call` exactly once.
 * If the result is not cudaSuccess, prints the source file, line and the
 * CUDA error string to stderr and terminates the process with exit(1).
 *
 * The argument is parenthesized and the temporary uses a name unlikely to
 * collide with caller variables, so an expression that mentions a caller's
 * own `err` is not captured by the macro's local.
 * The do { ... } while (0) wrapper makes the macro behave as one statement.
 */
#define SAFE_CALL(call) do { \
    cudaError_t safe_call_err_ = (call); \
    if (safe_call_err_ != cudaSuccess) { \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString(safe_call_err_)); \
        exit(1); \
    } } while (0)
/* Size in bytes of the host buffer, the device buffer, and the copy between them. */
#define NUM_BYTES 200000000
/* Prints `message` to stderr together with elapsed-time figures; defined after main(). */
void timestamp(char* message);
/*
 * Times an asynchronous host-to-device copy of NUM_BYTES bytes.
 *
 * Allocates a pinned host buffer and a device buffer, issues the copy with
 * cudaMemcpyAsync on a dedicated stream, then waits for that stream. A
 * timestamp is printed after allocation, after the copy has been issued,
 * and after the copy has completed, so the time spent issuing the copy can
 * be compared with the time spent waiting for it.
 *
 * Returns 0 on success; any CUDA error terminates the process via SAFE_CALL.
 */
int main() {
    char* host_ptr;
    char* device_ptr;
    cudaStream_t copy_stream;

    /* Pinned (page-locked) host memory is required for a truly asynchronous copy. */
    SAFE_CALL(cudaMallocHost((void**)(&host_ptr), NUM_BYTES));
    SAFE_CALL(cudaMalloc((void**)(&device_ptr), NUM_BYTES));
    /* Use an explicit stream instead of the legacy default stream 0. */
    SAFE_CALL(cudaStreamCreate(&copy_stream));
    timestamp("done mallocing");

    SAFE_CALL(cudaMemcpyAsync(device_ptr, host_ptr, NUM_BYTES,
                              cudaMemcpyHostToDevice, copy_stream));
    timestamp("done issuing memory copy");

    /* Wait only for the stream carrying the copy; cudaThreadSynchronize is deprecated. */
    SAFE_CALL(cudaStreamSynchronize(copy_stream));
    timestamp("completed memory copy");

    /* Release every resource acquired above. */
    SAFE_CALL(cudaStreamDestroy(copy_stream));
    SAFE_CALL(cudaFree(device_ptr));
    SAFE_CALL(cudaFreeHost(host_ptr));
    return 0;
}
/* clock() reading, in milliseconds, stored by the previous timestamp() call (0 before the first call). */
clock_t last_time = 0;

/*
 * Prints `message` to stderr followed by the number of milliseconds since
 * the previous timestamp() call and the current clock() reading in
 * milliseconds, then remembers the current reading for the next call.
 *
 * NOTE: the C standard defines clock() as processor time used by the
 * program, so on platforms that follow that definition time spent blocked
 * (not consuming CPU) is not counted; on others it tracks wall-clock time.
 *
 * message: NUL-terminated label printed at the start of the line.
 */
void timestamp(char* message) {
    /* Scale in floating point: clock()*1000 can overflow a 32-bit clock_t. */
    clock_t current_time =
        (clock_t)(((double)clock() * 1000.0) / (double)CLOCKS_PER_SEC);
    /* clock_t is not guaranteed to be int, so cast and print with %ld. */
    fprintf(stderr, "%s +%ldms (overall time=%ldms)\n", message,
            (long)(current_time - last_time), (long)current_time);
    last_time = current_time;
}
If I compile this program with `nvcc test.cu` and then run it, I get the following output:
done mallocing +187ms (overall time=187ms)
done issuing memory copy +78ms (overall time=265ms)
completed memory copy +0ms (overall time=265ms)
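I expected the cudaMemcpyAsync call to return almost immediately and the transfer time to show up at the synchronize call; instead, issuing the copy takes 78 ms and the synchronize takes 0 ms, as if the copy were synchronous. Am I doing something wrong? Can someone else try this code and let me know what you get? Thanks!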