Is it possible to launch kernels into multiple streams concurrently from within a device function?

Environment: Windows 10, Visual Studio 2013, CUDA 7.5, GeForce 920M.
Neither case produces any errors or warnings, and the program output is the SAME in both cases. The only difference is in the Nsight timeline:
CASE 1 (timeline - Nsight):
Streams
Stream 1
Stream 2 ||||||||||||||||||||||kernel1|||||||||||||||||||||
Stream 3 ||||||||||||||||||||||kernel1|||||||||||||||||||||

CASE 2 (timeline - Nsight)
Streams
Stream 1

In the second case, Stream 2 and Stream 3 do not appear in the timeline at all.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "../../common/common.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>

/**
 * Test kernel: spins in a fixed-length delay loop, then prints "kernel1".
 *
 * Takes no arguments and touches no memory; every thread that runs it
 * executes the loop and prints one line. In this file it is always launched
 * as <<<1, 1>>>.
 */
__global__ void kernel1(){
    // The counter is volatile so the compiler must actually perform every
    // iteration; a plain empty loop has no observable effect and may be
    // removed entirely in optimized builds, making the kernel near-instant.
    for (volatile int i = 0; i < 10000000; i++);
    printf("kernel1\n");
}

/**
 * Parent kernel for the "second case": launches kernel1 twice from device
 * code, each launch in its own non-blocking stream created on the device.
 *
 * Every thread that runs this kernel creates its own pair of streams and
 * performs both launches; in this file it is launched as <<<1, 1>>>.
 * Failures are reported with device-side printf because a kernel cannot
 * return a status to the host.
 *
 * NOTE(review): launching kernels from device code relies on CUDA dynamic
 * parallelism (relocatable device code, cudadevrt) -- confirm build flags.
 */
__global__ void kernel2(){
    cudaStream_t s1, s2;
    cudaError_t status;

    status = cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
    if (status != cudaSuccess) {
        printf("kernel2: creating stream s1 failed, code: %d\n", (int)status);
        return;
    }

    status = cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
    if (status != cudaSuccess) {
        printf("kernel2: creating stream s2 failed, code: %d\n", (int)status);
        cudaStreamDestroy(s1);  // do not leak the stream that was created
        return;
    }

    kernel1 << < 1, 1, 0, s1 >> >();
    status = cudaGetLastError();
    if (status != cudaSuccess) {
        printf("kernel2: launch in s1 failed, code: %d\n", (int)status);
    }

    kernel1 << < 1, 1, 0, s2 >> >();
    status = cudaGetLastError();
    if (status != cudaSuccess) {
        printf("kernel2: launch in s2 failed, code: %d\n", (int)status);
    }

    // Release the stream handles. Destroying a stream with pending work does
    // not cancel that work; the resources are freed once it has completed.
    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);
}

/**
 * Host entry point.
 *
 * Selects device 0 and runs the "first case": kernel1 is launched once in
 * each of two non-blocking host-created streams. The "second case"
 * (kernel2 performing the same launches from device code) is left commented
 * out, as in the original experiment. After the work finishes the streams
 * are destroyed and the device is reset.
 *
 * Every CUDA runtime call goes through CHECK, which prints the failure and
 * terminates the program. Waits for a key press before returning 0.
 */
int main(int argc, char **argv){
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // FIRST CASE
    cudaStream_t s1, s2;
    CHECK(cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking));
    CHECK(cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking));

    // A kernel launch returns no status itself; launch-configuration errors
    // are reported by cudaGetLastError() immediately after the launch.
    kernel1 << <1, 1, 0, s1 >> >();
    CHECK(cudaGetLastError());
    kernel1 << <1, 1, 0, s2 >> >();
    CHECK(cudaGetLastError());

    //SECOND CASE
    //kernel2 << < 1, 1>> >();
    //CHECK(cudaGetLastError());

    // Errors raised while the kernels execute surface at this sync point.
    CHECK(cudaDeviceSynchronize());

    // Release the streams explicitly before tearing down the device.
    CHECK(cudaStreamDestroy(s1));
    CHECK(cudaStreamDestroy(s2));
    CHECK(cudaDeviceReset());   // reset device

    printf("\nEnd\n");
    getchar();
    return (0);
}

I have also included common.h:

#include <time.h>
#include <stdio.h>

#ifndef _COMMON_H
#define _COMMON_H

/*
 * CHECK(call): evaluate a CUDA runtime call exactly once. If the result is
 * not cudaSuccess, print the file, line, numeric code and
 * cudaGetErrorString() text to stderr, wait for a key press, and exit(1).
 *
 * Wrapped in do { ... } while (0) so the macro expands to a single statement
 * and is safe inside an unbraced if/else; always follow it with a semicolon.
 * The argument is parenthesized and the local has an unlikely name so that
 * the caller's expression cannot be mis-parsed or shadowed.
 * Uses fprintf/getchar (<stdio.h>) and exit (<stdlib.h>).
 */
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t check_status_ = (call);                                  \
    if (check_status_ != cudaSuccess)                                          \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", (int)check_status_,          \
                cudaGetErrorString(check_status_));                            \
        getchar();                                                             \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/*
 * CHECK_CUBLAS(call): evaluate a cuBLAS call exactly once. If the result is
 * not CUBLAS_STATUS_SUCCESS, print the numeric status with file and line to
 * stderr, wait for a key press, and exit(1).
 *
 * Wrapped in do { ... } while (0) so it behaves as a single statement (safe
 * in an unbraced if/else); always follow it with a semicolon.
 * Uses fprintf/getchar (<stdio.h>) and exit (<stdlib.h>).
 */
#define CHECK_CUBLAS(call)                                                     \
do {                                                                           \
    const cublasStatus_t cublas_status_ = (call);                              \
    if (cublas_status_ != CUBLAS_STATUS_SUCCESS)                               \
    {                                                                          \
        fprintf(stderr, "Got CUBLAS error %d at %s:%d\n",                      \
                (int)cublas_status_, __FILE__, __LINE__);                      \
        getchar();                                                             \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/*
 * CHECK_CURAND(call): evaluate a cuRAND call exactly once. If the result is
 * not CURAND_STATUS_SUCCESS, print the numeric status with file and line to
 * stderr, wait for a key press, and exit(1).
 *
 * Wrapped in do { ... } while (0) so it behaves as a single statement (safe
 * in an unbraced if/else); always follow it with a semicolon.
 * Uses fprintf/getchar (<stdio.h>) and exit (<stdlib.h>).
 */
#define CHECK_CURAND(call)                                                     \
do {                                                                           \
    const curandStatus_t curand_status_ = (call);                              \
    if (curand_status_ != CURAND_STATUS_SUCCESS)                               \
    {                                                                          \
        fprintf(stderr, "Got CURAND error %d at %s:%d\n",                      \
                (int)curand_status_, __FILE__, __LINE__);                      \
        getchar();                                                             \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/*
 * CHECK_CUFFT(call): evaluate a cuFFT call exactly once. If the result is
 * not CUFFT_SUCCESS, print the numeric result with file and line to stderr,
 * wait for a key press, and exit(1).
 *
 * Wrapped in do { ... } while (0) so it behaves as a single statement (safe
 * in an unbraced if/else); always follow it with a semicolon.
 * Uses fprintf/getchar (<stdio.h>) and exit (<stdlib.h>).
 */
#define CHECK_CUFFT(call)                                                      \
do {                                                                           \
    const cufftResult cufft_status_ = (call);                                  \
    if (cufft_status_ != CUFFT_SUCCESS)                                        \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n",                       \
                (int)cufft_status_, __FILE__, __LINE__);                       \
        getchar();                                                             \
        exit(1);                                                               \
    }                                                                          \
} while (0)

/*
 * CHECK_CUSPARSE(call): evaluate a cuSPARSE call exactly once. If the result
 * is not CUSPARSE_STATUS_SUCCESS, print the numeric status with file and
 * line to stderr, then query cudaGetLastError() and, if a CUDA runtime error
 * is also pending, print its description. Finally wait for a key press and
 * exit(1).
 *
 * Wrapped in do { ... } while (0) so it behaves as a single statement (safe
 * in an unbraced if/else); always follow it with a semicolon.
 * Uses fprintf/getchar (<stdio.h>) and exit (<stdlib.h>).
 */
#define CHECK_CUSPARSE(call)                                                   \
do {                                                                           \
    const cusparseStatus_t cusparse_status_ = (call);                          \
    if (cusparse_status_ != CUSPARSE_STATUS_SUCCESS)                           \
    {                                                                          \
        fprintf(stderr, "Got error %d at %s:%d\n", (int)cusparse_status_,      \
                __FILE__, __LINE__);                                           \
        const cudaError_t cusparse_cuda_status_ = cudaGetLastError();          \
        if (cusparse_cuda_status_ != cudaSuccess)                              \
        {                                                                      \
            fprintf(stderr, "  CUDA error \"%s\" also detected\n",             \
                    cudaGetErrorString(cusparse_cuda_status_));                \
        }                                                                      \
        getchar();                                                             \
        exit(1);                                                               \
    }                                                                          \
} while (0)
/*
 * Returns the current value of the C library clock().
 *
 * Despite the name, the result is in clock ticks, not seconds; divide the
 * difference of two readings by CLOCKS_PER_SEC to get elapsed seconds.
 *
 * Declared static inline because this definition lives in a header: a plain
 * external definition would be emitted in every translation unit that
 * includes the header and fail at link time with a duplicate symbol.
 */
static inline clock_t seconds(void)
{
    return clock();
}
#endif // _COMMON_H

See also on Stack Overflow.