I wrote a small routine to test whether streams would be beneficial for my purposes, and the result was surprising:
the code that uses streams takes more time to execute than the code without them.
I have to process a large quantity of video data, and I was thinking I could use streams to overlap memory copies with kernel execution in order to accelerate the processing.
My question: when is it worthwhile to use streams?
Thanks for any advice.
// moveArrays.cu
//
// demonstrates CUDA interface to data allocation on device (GPU)
// and data movement between host (CPU) and device.
:rolleyes:
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#include <cutil_inline.h>
#include <conio.h>
#define MUL(a,b) __mul24(a,b)
#define GRIDOFFSETD(xpos,ypos,ydim) ((ypos) + MUL((xpos),(ydim)))
//#include "matrix_2_vector_converter.cpp"
// Squares each element in place: myarray[i] = myarray[i]^2 + 1.
//
// Expects a 1-D launch whose total thread count exactly covers the array
// (the callers in this file launch total_threads == element count).
// NOTE(review): there is no bounds guard, so a launch with more threads
// than elements writes out of bounds. The Sx parameter is retained for
// interface compatibility but is unused — callers pass 0, so it cannot
// serve as a size limit.
__global__ void mykernel(float *myarray, const int Sx)
{
    int gidx = MUL(blockIdx.x, blockDim.x) + threadIdx.x;
    myarray[gidx] = myarray[gidx] * myarray[gidx] + 1.0f;
}
//extern "C"
//float* Matrix_2_vector_converter(size_t row,size_t col);
// Times three variants of moving 10240 floats through mykernel:
//   1) synchronous H2D copy + kernel + D2H copy        -> time_kernel
//   2) a single asynchronous D2H copy                   -> time_memcpy
//   3) the work of (1) split across NSTREAMS streams,
//      repeated NREPS times, to overlap copy & compute  -> elapsed_time
//
// Key fix vs. the original: ALL host buffers touched by cudaMemcpyAsync
// are now pinned (cudaMallocHost). With pageable malloc() memory the
// async copies silently fall back to synchronous transfers, so the
// streamed version cannot overlap anything and runs *slower* than the
// plain version — which is exactly the symptom being asked about.
int main(void)
{
    const int NELEM    = 10240; // total number of floats processed
    const int NSTREAMS = 4;     // streams used in the overlapped variant
    const int NREPS    = 5;     // repetitions of the streamed loop
    const int CHUNK    = NELEM / NSTREAMS; // elements per stream

    float *a_h = 0, *c_h = 0;   // pinned host buffers
    float *a_d = 0;             // device buffer
    float elapsed_time, time_memcpy, time_kernel;

    // Pinned (page-locked) host allocations — required for true async copies.
    cutilSafeCall( cudaMallocHost((void **)&a_h, sizeof(float) * NELEM) );
    cutilSafeCall( cudaMallocHost((void **)&c_h, sizeof(float) * NELEM) );

    // Device allocation (checked, consistent with the cutilSafeCall style).
    cutilSafeCall( cudaMalloc((void **)&a_d, sizeof(float) * NELEM) );

    // Initialize host data.
    for (int i = 0; i < NELEM; i++) {
        a_h[i] = 10.f + i;
        c_h[i] = 4.6f + i;
    }

    // Allocate and initialize an array of stream handles.
    cudaStream_t *streams = (cudaStream_t *)malloc(NSTREAMS * sizeof(cudaStream_t));
    for (int i = 0; i < NSTREAMS; i++)
        cutilSafeCall( cudaStreamCreate(&streams[i]) );

    // CUDA event handles used for every timing below.
    cudaEvent_t start_event, stop_event;
    cutilSafeCall( cudaEventCreate(&start_event) );
    cutilSafeCall( cudaEventCreate(&stop_event) );

    /* -----------------------------------------------------------------
     * 1) Baseline: synchronous copy-in, kernel, copy-out ("time_kernel")
     * ----------------------------------------------------------------- */
    printf("::::::::::Kernel execution without streaming:::::::\n\n\n\n");
    cudaEventRecord(start_event, 0); // record in stream-0, to ensure that all previous CUDA calls have completed
    cutilSafeCall( cudaMemcpy(a_d, a_h, sizeof(float) * NELEM, cudaMemcpyHostToDevice) );
    mykernel<<<NELEM / 512, 512>>>(a_d, 0); // 20 blocks x 512 threads == NELEM
    cutilSafeCall( cudaMemcpy(c_h, a_d, sizeof(float) * NELEM, cudaMemcpyDeviceToHost) );
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event); // block until the event is actually recorded
    cutilSafeCall( cudaEventElapsedTime(&time_kernel, start_event, stop_event) );
    printf("time_kernel:\t%.3f\n\n\n\n\n", time_kernel);

    /* -----------------------------------------------------------------
     * 2) One asynchronous device-to-host copy on its own ("time_memcpy")
     * ----------------------------------------------------------------- */
    printf(":::::::::::::::::::asynchronous copy from Host::::::::::::::::::::::::::\n\n\n\n");
    cudaEventRecord(start_event, 0); // record in stream-0, to ensure that all previous CUDA calls have completed
    cudaMemcpyAsync(c_h, a_d, sizeof(float) * NELEM, cudaMemcpyDeviceToHost, streams[0]);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event); // block until the event is actually recorded
    cutilSafeCall( cudaEventElapsedTime(&time_memcpy, start_event, stop_event) );
    printf("time_memcopy:\t%.3f\n\n\n\n\n", time_memcpy);

    /* -----------------------------------------------------------------
     * 3) Streamed variant: split the array into NSTREAMS chunks and let
     *    each stream pipeline copy-in / kernel / copy-out so transfers
     *    overlap with computation ("elapsed_time").
     * ----------------------------------------------------------------- */
    cudaEventRecord(start_event, 0);
    for (int k = 0; k < NREPS; k++)
    {
        // Asynchronously launch one H2D copy per stream. A memcopy in
        // stream i only starts once all prior work queued in stream i is done.
        for (int i = 0; i < NSTREAMS; i++)
            cudaMemcpyAsync(a_d + i * CHUNK, a_h + i * CHUNK,
                            sizeof(float) * CHUNK, cudaMemcpyHostToDevice, streams[i]);
        // One kernel per stream, each operating on its own chunk.
        for (int i = 0; i < NSTREAMS; i++)
            mykernel<<<CHUNK / 512, 512, 0, streams[i]>>>(a_d + i * CHUNK, 0);
        // One D2H copy per stream, again serialized only within its stream.
        for (int i = 0; i < NSTREAMS; i++)
            cudaMemcpyAsync(c_h + i * CHUNK, a_d + i * CHUNK,
                            sizeof(float) * CHUNK, cudaMemcpyDeviceToHost, streams[i]);
    }
    // stop_event is recorded in stream 0 (legacy default stream), which
    // synchronizes with all other streams, so it captures the full pipeline.
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cutilSafeCall( cudaEventElapsedTime(&elapsed_time, start_event, stop_event) );
    // NOTE(review): the "expected" formula is inherited from the SDK
    // simpleStreams sample; as written it binds as time_kernel + (time_memcpy/4).
    printf("%d streams:\t%.2f (%.2f expected with compute capability 1.1 or later)\n", NSTREAMS, elapsed_time / NREPS, time_kernel + time_memcpy / NSTREAMS);

    // Release every resource we created (the original leaked the stream
    // handle array and the malloc'd host buffers).
    for (int i = 0; i < NSTREAMS; i++)
        cudaStreamDestroy(streams[i]);
    free(streams);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFreeHost(c_h);
    cudaFreeHost(a_h);
    cudaFree(a_d);

    getch(); // keep the console window open (Windows, conio.h)
    return 0;
}