How to use stream in my program

In my program I do 16 simulation, one of them in each block. I do a big loop of 500000 or 1000000, at the end of every loop every block write to a struct array the results. This array is loop * 16 every block has his own space. My problem is when loop end the program return to host and must do cudaMemCpy back to host and write the array to 16 different files before kernel start again. The half time spend for the cudamemcpy and write to files and that was the reason for the stream use. I did something like that but the time is the same or bigger.

typedef struct __align__(8){

    float t;	//time

    int pos;	//posistion (reaction)

}data_t;

mode = 0

frm_kernel <<<NumOfSim, num_reactions, shared_size,  streams[0]>>> (d_data0, d_data1,mode);

	

	

while(stop == 0)

{	

    	

  if(cudaStreamQuery(streams[0]) == cudaSuccess)

  { 

    mode = 1

    frm_kernel <<<NumOfSim, num_reactions, shared_size,  streams[1]>>> (d_data0, d_data1,mode);

cudaMemcpy(data, d_data0, NumOfSim*BUFSIZE*sizeof(data_t), cudaMemcpyDeviceToHost);

for(b=0;b<NumOfSim;b++)

    {

      for(i=0;i<BUFSIZE;i++)

      {

        if(data[i + b*BUFSIZE].t == -1.0f)

        {

          stop = 1;

          break;

        }

		

        fprintf(pfile2[b],"%f,%d\n",data[i + b*BUFSIZE].t,data[i + b*BUFSIZE].pos);

      }

    }

		

    if(cudaStreamQuery(streams[1]) == cudaSuccess )

    {	

      mode = 0

      frm_kernel <<<NumOfSim, num_reactions, shared_size,  streams[0]>>> (d_data0, d_data1,mode);

cudaMemcpy(data, d_data1, NumOfSim*BUFSIZE*sizeof(data_t), cudaMemcpyDeviceToHost);

for(b=0;b<NumOfSim;b++)

      {

        for(i=0;i<BUFSIZE;i++)

        {

          if(data[i + b*BUFSIZE].t == -1.0f)

          {

            stop = 1;

            break;

          }

			

          fprintf(pfile2[b],"%f,%d\n",data[i + b*BUFSIZE].t,data[i + b*BUFSIZE].pos);

       }

  }

			

}

		

	

}

}

the kernel have more arguments also into kernel if mode is 0 program write to d_data0 else to d_data1

Any help. Is the above the right way to use streams?