cuda-memcheck : windows + cublas

I have an issue with cuda-memcheck in a simple example cases.
I want check if there are not memory leak in my program. When I run it, there is no issue; when I done the same operation with cuda-memcheck program crashes before the end of the execution.

In order to reproduce my issue, I have taken an simple example of the cublas documentation

//Example 2. Application Using C and CUBLAS: 0-based indexing
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define M 6
#define N 5
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int n, int p, int q, float alpha, float beta){
    cublasSscal (handle, n-p, &alpha, &m[IDX2C(p,q,ldm)], ldm);
    cublasSscal (handle, ldm-p, &beta, &m[IDX2C(p,q,ldm)], 1);

int main (void){
    cudaError_t cudaStat;    
    cublasStatus_t stat;
    cublasHandle_t handle;
    int i, j;
    float* devPtrA;
    float* a = 0;
    a = (float *)malloc (M * N * sizeof (*a));
    if (!a) {
        printf ("host memory allocation failed");
        return EXIT_FAILURE;
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            a[IDX2C(i,j,M)] = (float)(i * M + j + 1);
    cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a));
    if (cudaStat != cudaSuccess) {
        printf ("device memory allocation failed");
        return EXIT_FAILURE;
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed");
        cudaFree (devPtrA);
        return EXIT_FAILURE;
    modify (handle, devPtrA, M, N, 1, 2, 16.0f, 12.0f);
    stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data upload failed");
        cudaFree (devPtrA);
        return EXIT_FAILURE;
    cudaFree (devPtrA);
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            printf ("%7.0f", a[IDX2C(i,j,M)]);
        printf ("\n");
    return EXIT_SUCCESS;

The compilation and the execution of this program works in my case.
When I run the command : cuda-memcheck.exe example.exe, “cublasGetMatrix” function fails,
I get “data upload failed” message and the cuda-memcheck log gives :

========= Invalid global read of size 4
========= at 0x000000c8 in void scal_kernel_val<float, float, int=0>(cublasScalParamsVal<float, float>)
========= by thread (3,0,0) in block (0,0,0)
========= Address 0xb00c0007c is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time

========= LEAK SUMMARY: 0 bytes leaked in 0 allocations
========= ERROR SUMMARY: 24 errors

I don’t understand why I get these errors.
Thanks for your help.
I use CUDA 9.1, Windows 7, NVIDIA GeForce GTX 1050 Ti

Can you please try this on the CUDA 10.0 release?

Thanks for your answer.
I have done the test with CUDA 10.0 release and I have the same result.