The following code segment is the streaming execution part of my project. With only one stream it works fine; however, it crashes when more than one stream is used.
// Streamed pipeline: for every partition i in [0, p_max) the inputs are
// uploaded, score_kernel is run, and the results are downloaded, all queued on
// that partition's own stream so that work of different partitions may overlap.
//
// NOTE(review): p_max must not exceed PARALLEL; stream[i] for i >= PARALLEL is
// never created and would be an invalid handle. Confirm against the caller.
// NOTE(review): cudaMemcpyAsync is only truly asynchronous when the host
// buffers (thread_node_idx, pl_inst, h_objFuncVal, h_grad_val) are page-locked
// (cudaMallocHost / cudaHostAlloc) - confirm how they are allocated.

// The array is sized by PARALLEL (assumed to be a compile-time constant) so the
// creation loop below cannot run past its end.
cudaStream_t stream[PARALLEL];
for (int i = 0; i < PARALLEL; i++)
{
    cutilSafeCall(cudaStreamCreate(&stream[i]));
}

// Copy the two input chunks of every partition to the device.
for (int i = 0; i < p_max; i++)
{
    // Node indices: partition i owns the P_NUM elements starting at i*P_NUM.
    cutilSafeCall(cudaMemcpyAsync(d_thread_node_idx + i * P_NUM,
                                  thread_node_idx + i * P_NUM,
                                  P_NUM * sizeof(uint32_t),
                                  cudaMemcpyHostToDevice, stream[i]));
    // Instances: p_size[i] elements at offset p_index[i]; the host side is
    // additionally shifted by last_start.
    cutilSafeCall(cudaMemcpyAsync(d_inst + p_index[i],
                                  pl_inst + last_start + p_index[i],
                                  p_size[i] * sizeof(uint32_t),
                                  cudaMemcpyHostToDevice, stream[i]));
}

// Launch one kernel per partition on the partition's stream.
for (int i = 0; i < p_max; i++)
{
    // Kernel geometry.
    // NOTE(review): the integer division truncates. If thread_exe[i] is not a
    // multiple of BLOCK_WIDTH the tail threads are never launched, and if
    // thread_exe[i] < BLOCK_WIDTH the grid is empty and the launch is invalid.
    dim3 dimblock(BLOCK_WIDTH);
    dim3 dimgrid(thread_exe[i] / BLOCK_WIDTH);

    score_kernel<<<dimgrid, dimblock, 0, stream[i]>>>(d_thread_node_idx + i * P_NUM, d_omiga, d_inst + p_index[i],
                                                      config_obj->nonclkWeight, d_grad_val + i * P_NUM, d_objFuncVal + i * P_NUM);
    // A launch does not return a status; bad launch configurations are only
    // visible through cudaGetLastError().
    cutilSafeCall(cudaGetLastError());
}

// Copy the two result chunks of every partition back to the host.
// NOTE(review): each partition's slice starts at i*P_NUM but holds
// thread_exe[i]*INST_THREAD doubles; that count must not exceed P_NUM (and the
// allocation size), otherwise neighbouring slices overlap or the copy runs out
// of bounds.
for (int i = 0; i < p_max; i++)
{
    cutilSafeCall(cudaMemcpyAsync(h_objFuncVal + i * P_NUM,
                                  d_objFuncVal + i * P_NUM,
                                  thread_exe[i] * INST_THREAD * sizeof(double),
                                  cudaMemcpyDeviceToHost, stream[i]));
    cutilSafeCall(cudaMemcpyAsync(h_grad_val + i * P_NUM,
                                  d_grad_val + i * P_NUM,
                                  thread_exe[i] * INST_THREAD * sizeof(double),
                                  cudaMemcpyDeviceToHost, stream[i]));
}

// Wait for all queued work. Faults raised inside the kernels (for example an
// out-of-bounds access) are reported here, so the return code is checked at
// this point rather than surfacing at the next unrelated API call.
// cudaThreadSynchronize() is deprecated in favour of cudaDeviceSynchronize()
// on CUDA 4.0 and later.
cutilSafeCall(cudaThreadSynchronize());
cutilSafeCall(cudaFree(d_inst));
The program crashes at the line "cudaThreadSynchronize();".
The error message is: cudaSafeCall() Runtime API error : unspecified launch failure.