A problem with streams (concurrent copy and execute)

The following code segment is the streaming execution part of my project. With only one stream it works fine; however, it crashes as soon as more than one stream is used.

            // Multi-stream pipeline: for each of the p_max partitions, stage its
            // inputs on the device, run score_kernel on them, and copy its results
            // back to the host, with all three steps queued on that partition's own
            // stream. Work is issued breadth-first (all uploads, then all launches,
            // then all downloads), exactly as in the original ordering.
            //
            // NOTE(review): stream[] has 4 slots, is filled for i < PARALLEL and is
            // indexed for i < p_max below. That is only valid when
            // p_max <= PARALLEL <= 4 -- confirm against the definitions; an
            // uninitialized stream handle is an error.
            // NOTE(review): cudaMemcpyAsync needs page-locked host buffers
            // (cudaMallocHost / cudaHostAlloc) to actually run asynchronously --
            // confirm how thread_node_idx, pl_inst, h_objFuncVal and h_grad_val
            // are allocated.
            cudaStream_t stream[4];
            for (int i = 0; i < PARALLEL; i++)
            {
                // Check creation: a failed create would leave a garbage handle
                // that is then used by every call below.
                cutilSafeCall(cudaStreamCreate(&stream[i]));
            }

            // Copy the two input chunks of every partition to the device.
            for (int i = 0; i < p_max; i++)
            {
                cutilSafeCall(cudaMemcpyAsync(d_thread_node_idx + i * P_NUM,
                                              thread_node_idx + i * P_NUM,
                                              P_NUM * sizeof(uint32_t),
                                              cudaMemcpyHostToDevice, stream[i]));
                cutilSafeCall(cudaMemcpyAsync(d_inst + p_index[i],
                                              (pl_inst + last_start + p_index[i]),
                                              p_size[i] * sizeof(uint32_t),
                                              cudaMemcpyHostToDevice, stream[i]));
            }

            // Start one kernel per partition on that partition's stream.
            // NOTE(review): streams are only ordered internally, so the kernel of
            // partition i may run before the uploads of partition j != i have
            // finished. If score_kernel reads anything outside its own
            // [p_index[i], p_index[i] + p_size[i]) / [i*P_NUM, (i+1)*P_NUM) slices
            // it will see stale data, which would explain a failure that only
            // appears with more than one stream -- verify inside the kernel.
            for (int i = 0; i < p_max; i++)
            {
                // Define the launch configuration.
                // NOTE(review): integer division truncates; if thread_exe[i] is
                // not a multiple of BLOCK_WIDTH the tail is not processed, and if
                // it is smaller than BLOCK_WIDTH the grid size is 0 (invalid
                // launch). Not changed here because the kernel's bounds handling
                // is not visible.
                dim3 dimblock(BLOCK_WIDTH);
                dim3 dimgrid(thread_exe[i] / BLOCK_WIDTH);

                // Launch the kernel.
                score_kernel<<<dimgrid, dimblock, 0, stream[i]>>>(
                    d_thread_node_idx + i * P_NUM, d_omiga, d_inst + p_index[i],
                    config_obj->nonclkWeight, d_grad_val + i * P_NUM,
                    d_objFuncVal + i * P_NUM);

                // Launches do not return a status; pick up bad-configuration
                // errors right away instead of at the next unrelated API call.
                cutilSafeCall(cudaGetLastError());
            }

            // Copy the two result chunks of every partition back to the host.
            // NOTE(review): each copy moves thread_exe[i] * INST_THREAD doubles
            // into a slice that starts at i * P_NUM; this presumably requires
            // thread_exe[i] * INST_THREAD <= P_NUM so slices do not overlap or
            // overrun the buffers -- confirm.
            for (int i = 0; i < p_max; i++)
            {
                cutilSafeCall(cudaMemcpyAsync(h_objFuncVal + i * P_NUM,
                                              d_objFuncVal + i * P_NUM,
                                              thread_exe[i] * INST_THREAD * sizeof(double),
                                              cudaMemcpyDeviceToHost, stream[i]));
                cutilSafeCall(cudaMemcpyAsync(h_grad_val + i * P_NUM,
                                              d_grad_val + i * P_NUM,
                                              thread_exe[i] * INST_THREAD * sizeof(double),
                                              cudaMemcpyDeviceToHost, stream[i]));
            }

            // Wait for all streams. Errors raised while a kernel was executing
            // (e.g. "unspecified launch failure" from an out-of-bounds access)
            // surface here, so the return code must be checked at this point.
            cutilSafeCall(cudaThreadSynchronize());
            // NOTE(review): the streams are not destroyed anywhere in the visible
            // code; call cudaStreamDestroy on each one once they are no longer
            // needed, unless that already happens further down.
            cutilSafeCall(cudaFree(d_inst));

The program crashes at the line `cudaThreadSynchronize();`.
The error message is: cudaSafeCall() Runtime API error : unspecified launch failure.