Hello,
i’m beggining to “play” with streams and asynchronous execution operations.
Well, first i wrote a code where i have to copy from device memory to host memory a few bytes.
for( i = iniciot;i < fimt;i++){
cutilSafeCall(cudaMemcpy(parada + (i*raios+inicior),d_parada+(i*raios+inicior),(fimr-inicior)*sizeof(int),cudaMemcpyDeviceToHost,));
for(j = inicior;j < fimr;j++){
posicao = i*(int)posicao1 + j*(int)posicao2;
size = (parada[i*raios+j]-posicao)*sizeof(Slots);
out[i][j] = (Slots*)malloc(size);
cutilSafeCall(cudaMemcpyAsync( out[i][j], d_out + posicao , size ,cudaMemcpyDeviceToHost));
}
}
Tested and it works.
So i tried to add some stream support to do all this copy faster. I got the idea from programming guide (Section 3.2.6.5).This is the code:
cudaStream_t stream1[fimt-iniciot],stream2[fimr-inicior];
for(i=0;i < (fimt-iniciot);i++)
cudaStreamCreate(&stream1[i]);
for(i=0;i < (fimr-inicior);i++)
cudaStreamCreate(&stream2[i]);
for( i = iniciot;i < fimt;i++){
cutilSafeCall(cudaMemcpyAsync(parada + (i*raios+inicior),d_parada+(i*raios+inicior),(fimr-inicior)*sizeof(int),cudaMemcpyDeviceToHost,stream1[i]));
for(j = inicior;j < fimr;j++){
//A variavel posicao marca o endereço aonde o raio verificado começou a ser gravado na memoria global da GPU
posicao = i*(int)posicao1 + j*(int)posicao2;
size = (parada[i*raios+j]-posicao)*sizeof(Slots);
out[i][j] = (Slots*)malloc(size);
cutilSafeCall(cudaMemcpyAsync( out[i][j], d_out + posicao , size ,cudaMemcpyDeviceToHost,stream2[j]));
}
}
cudaThreadSynchronize();
Well, when i execute i got this: cudaSafeCall() Runtime API error : invalid argument.
What i’m doing wrong?