NPP with CUDA 6.5


I have encountered a problem using an NPP routine with the latest CUDA 6.5 revision.
A call to the nppsMulC_32f_I operation on a 128 x 16384 matrix takes much more time than it did with the previous CUDA 5.5 or 6.0 releases.
Does anyone know how to correct this?
Is it a CUDA bug?


cudaMemcpyAsync(gpu_grad, gpu_data_out, bsize*odim*sizeof(REAL), cudaMemcpyDeviceToDevice, Gpu::curStream);

int nb_threads = std::min(Gpu::curDevProps->maxThreadsDim[0], bsize);
int n_shared_bytes = nb_threads * sizeof(REAL);
KernelErrFctSoftmCrossEntNgramCalcGradNull<<<bsize+1, nb_threads, n_shared_bytes, Gpu::curStream>>>(bsize, odim, gpu_data_out, gpu_grad, gpu_target, gpu_res);

cudaError_t err = cudaGetLastError();
if(cudaSuccess != err){
  ErrorN("Error in Gpu::ErrFctSoftmCrossEntNgramCalcGradNull: %s", cudaGetErrorString(err));