Hi,

Here is the code:

```
#include <stdio.h>
#include <stdlib.h>
#include <cuda_profiler_api.h>
/*
 * Adds d_x[index] + d_y[index] into d_z[index] 1,000,000 times, one element
 * per thread. The fixed iteration count is presumably there to keep the
 * kernel running long enough to be visible on a profiler timeline.
 *
 * Launch layout: the element index is derived only from blockIdx.x,
 * blockDim.x and threadIdx.x, so a 1D grid of 1D blocks is expected.
 *
 * Parameters:
 *   N    number of valid elements in each array; threads whose index is
 *        >= N return without touching memory.
 *   d_x  first input array (device pointer, at least N floats).
 *   d_y  second input array (device pointer, at least N floats).
 *   d_z  accumulator array (device pointer, at least N floats). The kernel
 *        uses +=, so the caller must initialise it (e.g. to zero) before
 *        the launch.
 */
__global__
void vector_addition_loop(int N, float *d_x, float *d_y, float *d_z){
    int index = (blockIdx.x * blockDim.x) + threadIdx.x;

    // Guard the grid tail: the launch may contain more threads than elements.
    if (index >= N) {
        return;
    }

    for (int i = 0; i < 1000000; ++i) {
        d_z[index] += d_x[index] + d_y[index];
    }
}
/* Prints a diagnostic and exits if a CUDA runtime call did not succeed. */
static void check_cuda(cudaError_t status, const char *what){
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Launches vector_addition_loop once on each of num_streams streams, each
 * with its own input/output buffers, then waits for all of them, prints the
 * first result element per stream and releases every resource.
 *
 * All stream creation and allocation happens before the first launch, and
 * every transfer is an async copy from pinned host memory on the kernel's
 * own stream. A blocking cudaMemcpy on the legacy default stream between
 * launches would synchronize with the other streams and serialize them.
 */
int main(int argc, char *argv[]){
    (void)argc;
    (void)argv;

    const int N = 32;               // threads per block
    const int nB = 1;               // blocks per launch
    const int count = nB * N;       // elements per array
    const size_t bytes = count * sizeof(float);
    const int num_streams = 4;

    cudaStream_t streams[num_streams];
    float *x[num_streams], *y[num_streams], *z[num_streams];
    float *d_x[num_streams], *d_y[num_streams], *d_z[num_streams];

    // Phase 1: create streams and allocate everything up front.
    for (int s = 0; s < num_streams; ++s) {
        check_cuda(cudaStreamCreate(&streams[s]), "cudaStreamCreate");

        // Pinned host memory is required for truly asynchronous copies.
        check_cuda(cudaMallocHost(&x[s], bytes), "cudaMallocHost(x)");
        check_cuda(cudaMallocHost(&y[s], bytes), "cudaMallocHost(y)");
        check_cuda(cudaMallocHost(&z[s], bytes), "cudaMallocHost(z)");

        check_cuda(cudaMalloc(&d_x[s], bytes), "cudaMalloc(d_x)");
        check_cuda(cudaMalloc(&d_y[s], bytes), "cudaMalloc(d_y)");
        check_cuda(cudaMalloc(&d_z[s], bytes), "cudaMalloc(d_z)");

        for (int i = 0; i < count; ++i) {
            x[s][i] = 0.01f;
            y[s][i] = 0.02f;
        }
    }

    // Phase 2: enqueue copy-in, zero-fill, kernel and copy-out per stream.
    for (int s = 0; s < num_streams; ++s) {
        check_cuda(cudaMemcpyAsync(d_x[s], x[s], bytes, cudaMemcpyHostToDevice, streams[s]),
                   "cudaMemcpyAsync(d_x)");
        check_cuda(cudaMemcpyAsync(d_y[s], y[s], bytes, cudaMemcpyHostToDevice, streams[s]),
                   "cudaMemcpyAsync(d_y)");

        // The kernel accumulates with +=, so d_z must start from zero.
        check_cuda(cudaMemsetAsync(d_z[s], 0, bytes, streams[s]), "cudaMemsetAsync(d_z)");

        vector_addition_loop<<<nB, N, 0, streams[s]>>>(count, d_x[s], d_y[s], d_z[s]);
        check_cuda(cudaGetLastError(), "vector_addition_loop launch");

        check_cuda(cudaMemcpyAsync(z[s], d_z[s], bytes, cudaMemcpyDeviceToHost, streams[s]),
                   "cudaMemcpyAsync(z)");
    }

    // Surfaces any asynchronous execution error from the kernels or copies.
    check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    cudaProfilerStop();

    for (int s = 0; s < num_streams; ++s) {
        printf("stream %d: z[0] = %f\n", s, z[s][0]);
    }

    for (int s = 0; s < num_streams; ++s) {
        check_cuda(cudaFree(d_x[s]), "cudaFree(d_x)");
        check_cuda(cudaFree(d_y[s]), "cudaFree(d_y)");
        check_cuda(cudaFree(d_z[s]), "cudaFree(d_z)");
        check_cuda(cudaFreeHost(x[s]), "cudaFreeHost(x)");
        check_cuda(cudaFreeHost(y[s]), "cudaFreeHost(y)");
        check_cuda(cudaFreeHost(z[s]), "cudaFreeHost(z)");
        check_cuda(cudaStreamDestroy(streams[s]), "cudaStreamDestroy");
    }

    check_cuda(cudaDeviceReset(), "cudaDeviceReset");
    return 0;
}
```

Thanks