for(int i=0; i<numOfStreams; i++){
// here I get the error
cutilSafeCall( cudaMemcpyAsync(d_idata + i*bytesS, h_idata + i*bytesS, bytesS, cudaMemcpyHostToDevice, stream[i]) );
.... // launch kernel, etc.
}
Since the structure seems to me quite correct, and as well it follows the instructions given in the programming guide concerning stream handling and async copies, I don’t really understand what is wrong here.
Hi,
Just out of curiosity, did you check which value i holds when you encounter the error?
Could it be due to a previously launched kernel inside your loop?
I was referring to the loop counter “i”. What is its value when you get the error?
Basically, what I want to know is if it’s at the first attempt to call cudaMemcpyAsync() that the error is detected, or if it’s during a subsequent attempt.
In which case, the error detection might not be due to the cudaMemcpyAsync() itself, but rather to a previous and not yet detected error. For example, it could be the case that one of your kernels (as launched inside the loop right after the call to cudaMemcpyAsync()) could have trigger the error.
Does that make sense?
anyway, this is a slightly modified version, which aims to allow each stream to process its portion of input data ( recall [font=“Courier New”]bytes = size * sizeof(T);[/font] )
for(int i=0; i<numOfStreams; i++){
cudaStreamSynchronize(stream[i]);
printf("i_i: %d\n;", i);
// here I get the error
cutilSafeCall( cudaMemcpyAsync(d_idata + i*bytes, h_idata + i*bytes, bytes, cudaMemcpyHostToDevice, stream[i]) );
// ARE dimensions right? recall I allocated 'numOfStreams*bytes' space. so copying 'bytes' amount of data for
// each stream should do the work. isn't it?
.... // launch kernel, etc.
}
adding some prints, it turns out that the first attempt works fine (prints “i_i: 0”); the error is detected at the second attempt, because it prints “i_i: 1;” and then reports the error.
So now that you know that i==1 when the error is detected, you might check if the error is already set prior to call cudaMemcpyAsync().
Just call cudaGetLastError() straight and check its result. This way you’ll know if the error actually comes from your call to cudaMemcpyAsync().
Does that make sense to you?
uhm, that’s the reason why I use the ‘cudaSafeCall’ wrapper: it tells me exactly that at row X (the row above) there’s an error.
Precisely, it says:
“file.cpp(321) : cudaSafeCall() Runtime API error 11: invalid argument.”
so I know that the error comes from there.
and actually, error 11 is a “cudaErrorInvalidValue”, which means that “…one or more of the parameters passed to the API call is not within an acceptable range of values.”
I do understand what that error means, but still cannot figure out where is the mistake.
Not necessarily, as Gilles_C has been trying to point out in the past few posts. If you look at the documentation for cudaMemcpyAsync(), it says “Note that this function may also return error codes from previous, asynchronous launches.”
ok, now that’s even more annoying because I solved it by trials and errors but I’m not really aware of what I’ve done. Or better, I know what I’ve done but I’m not sure I understand what the problem was.
so, to recall something:
// host side memory size
unsigned int bytes = size * sizeof(T);
unsigned int bytesS = numOfStreams * bytes;
// device memory size
unsigned int o_bytes = numBlocks*sizeof(T);
unsigned int o_bytesS = numOfStreams*(o_bytes);
// allocate page-locked host memory
T* h_idata = NULL;
T* h_odata = NULL;
cutilSafeCall( cudaMallocHost((void**) &h_idata, bytesS) );
cutilSafeCall( cudaMallocHost((void**) &h_odata, o_bytesS) );
// allocate device memory and data
T* d_idata = NULL;
T* d_odata = NULL;
cutilSafeCall( cudaMalloc((void**) &d_idata, bytesS) );
cutilSafeCall( cudaMalloc((void**) &d_odata, o_bytesS) );
// array of streams handles
cudaStream_t *stream = (cudaStream_t*) malloc(numOfStreams*sizeof(cudaStream_t));
for(int i=0; i<numOfStreams; i++){
cutilSafeCall( cudaStreamCreate(&(stream[i])) );
}
// run kernels on streams - NOW WORKING!
for(int i=0; i<numOfStreams; i++){
cudaStreamSynchronize(stream[i]);
cutilSafeCall( cudaMemcpyAsync(d_idata + i * size, h_idata +i * size, size, cudaMemcpyHostToDevice, stream[i]) );
cutilSafeCall( cudaMemcpyAsync(d_odata + i * numBlocks, h_odata + i * numBlocks, numBlocks, cudaMemcpyHostToDevice, stream[i]) );
reduceS<T>(size, numThreads, numBlocks, d_idata, d_odata, stream[i]);
.... // rest of the code
}
basically what I changed is the quantity of bytes copied and the offsets, which are now ruled by ‘size’, that is the number of elements upon which I work. I don’t understand why the previous quantities were wrong and caused the problem!