Strange behaviour setMatrix 3.2 v 3.1

Dear all,

I wrote a driver that intercepts calls to the gemm functions; it ran fine until CUBLAS version 3.2. I have put together a small test case that reproduces the error so that you can see what is happening. I would really appreciate any help in fixing this problem.

On hand for any information you need.

Regards,

Girotto Ivan

igirotto@fermi-x2:~/Ivan/test_fatica$ module purge
igirotto@fermi-x2:~/Ivan/test_fatica$ module load cuda/3.1
igirotto@fermi-x2:~/Ivan/test_fatica$ make clean
rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ make
gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o -I/ichec/packages/cuda/3.1/cuda/include -I/ichec/packages/cuda/3.1/C/common/inc/
ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o -L/ichec/packages/cuda/3.1/cuda/lib64 -L/ichec/packages/cuda/3.1/C/lib -L/ichec/packages/cuda/3.1/C/common/lib/linux/ -L/ichec/packages/cuda/3.1/OpenCL/common/lib/Linux64/ -lcudart -lcufft -lcublas -L/ichec/packages/intel/Compiler/11.1/069/mkl/lib/em64t -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5
igirotto@fermi-x2:~/Ivan/test_fatica$ ./dgemm_speed
igirotto@fermi-x2:~/Ivan/test_fatica$ module purge
igirotto@fermi-x2:~/Ivan/test_fatica$ module load cuda/3.2-rc
igirotto@fermi-x2:~/Ivan/test_fatica$ make clean
rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ make
gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o -I/ichec/packages/cuda/3.2-rc/cuda/include -I/ichec/packages/cuda/3.2-rc/C/common/inc/
ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o -L/ichec/packages/cuda/3.2-rc/cuda/lib64 -L/ichec/packages/cuda/3.2-rc/C/lib -L/ichec/packages/cuda/3.2-rc/C/common/lib/linux/ -L/ichec/packages/cuda/3.2-rc/OpenCL/common/lib/Linux64/ -lcudart -lcufft -lcublas -L/ichec/packages/intel/Compiler/11.1/069/mkl/lib/em64t -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5
igirotto@fermi-x2:~/Ivan/test_fatica$ ./dgemm_speed
!!! device access error (write C) 11
igirotto@fermi-x2:~/Ivan/test_fatica$

igirotto@fermi-x2:~/Ivan/test_fatica$ gcc --version
gcc (Debian 4.3.5-2) 4.3.5
Copyright © 2008 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

igirotto@fermi-x2:~/Ivan/test_fatica$ ifort --version
ifort (IFORT) 11.1 20100203
Copyright © 1985-2010 Intel Corporation. All rights reserved.

igirotto@fermi-x2:~/Ivan/test_fatica$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 260.24 Thu Sep 9 17:01:12 PDT 2010
GCC version: gcc version 4.3.5 (Debian 4.3.5-2)
igirotto@fermi-x2:~/Ivan/test_fatica$ uname -a
Linux fermi-x2 2.6.32-5-amd64 #1 SMP Wed Aug 25 13:59:41 UTC 2010 x86_64 GNU/Linux

I upload here all the files as I can add them by the form:

igirotto@fermi-x2:~/Ivan/test_fatica$ cat Makefile
# Build the Fortran driver (dgemm_speed) together with the C wrapper object.
# CUDA_MAKEFILE_I, CUDA_MAKEFILE_L and MKL_LIB are expected to come from the
# environment (the transcript sets them by loading a cuda module).

# "all" and "clean" do not name files, so keep make from confusing them with
# files of the same name.
.PHONY: all clean

all: dgemm_speed

dgemm_speed: dgemm_speed.f90 dgemm_ivan.o
	ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o $(CUDA_MAKEFILE_L) -lcudart -lcufft -lcublas -L$(MKL_LIB) -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5

dgemm_ivan.o: dgemm_ivan.c
	gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o $(CUDA_MAKEFILE_I)

# -f keeps "make clean" from failing when nothing has been built yet; the
# binary is removed too so a toolkit switch always relinks from scratch.
clean:
	rm -f dgemm_ivan.o dgemm_speed
igirotto@fermi-x2:~/Ivan/test_fatica$ cat dgemm_ivan.c
#define _GNU_SOURCE
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "cuda_runtime.h"
#include "cublas.h"

/*define huge scratch array on GPU */
void *dev_scratch;

void cublas_dgemm_ivan_ (const char *transa, const char *transb, const int *m,
const int *n, const int *k, const double *alpha,
const double *A, const int *lda, const double *B,
const int *ldb, const double *beta, double *C, const int *ldc)
{
int m_gpu, n_gpu, k_gpu;
double *devPtrA, *devPtrB, *devPtrC;
static float split=0.7;
cublasStatus status;

cudaSetDevice(0);
cublasInit();

/* allocate scratch space for library on device */
cudaMalloc ((void**)&dev_scratch, 1.0*134217728 *8 );

// New split
k_gpu = *k ;

n_gpu = *n ;
m_gpu = ceil(*m /64*split) *64;

 devPtrA=(double *)dev_scratch;
 status = cublasSetMatrix (m_gpu, k_gpu, sizeof(A[0]), A, *lda, devPtrA, m_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
   printf ( "!!!! device access error (write A ) %d\n", status);
 }

 devPtrB=devPtrA+m_gpu * k_gpu;
 status = cublasSetMatrix (k_gpu, n_gpu, sizeof(B[0]), B, *ldb, devPtrB, k_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
    printf ("!!!! device access error (write B) %d\n", status);
 }

 devPtrC = devPtrB+k_gpu * n_gpu;
 status = cublasSetMatrix (m_gpu, n_gpu, sizeof(C[0]), C, *ldc, devPtrC, m_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
   printf ("!!!! device access error (write C) %d\n", status);
 }

 exit(1);

}
igirotto@fermi-x2:~/Ivan/test_fatica$ cat dgemm_speed.f90
! Minimal driver for the C routine CUBLAS_DGEMM_IVAN: allocates three
! square double-precision matrices, fills them with constants and passes
! them to the routine using the standard DGEMM argument order.
program matrix_multiply

implicit none

integer, parameter :: fp_kind = kind(0.0d0)
real (fp_kind), dimension(:,:), allocatable :: A, B, C
real (fp_kind):: alpha=1._fp_kind,beta=1._fp_kind
integer:: m1,m2,m3
integer:: ierr

m1=1280
m2=m1
m3=m1

allocate(A(m1,m2), B(m2,m3), C(m1,m3), stat=ierr)
if (ierr /= 0) stop 'matrix allocation failed'

! Initialize the matrices A,B and C
A=1._fp_kind
B=2._fp_kind
C=3._fp_kind

! Character arguments must use plain ASCII quotes to be valid Fortran.
call CUBLAS_DGEMM_IVAN('n','n',m1,m3,m2,alpha,A,m1,B,m2,beta,C,m1)

deallocate(A,B,C)

end program matrix_multiply

Dear all,

I wrote a driver that intercepts calls to the gemm functions; it ran fine until CUBLAS version 3.2. I have put together a small test case that reproduces the error so that you can see what is happening. I would really appreciate any help in fixing this problem.

On hand for any information you need.

Regards,

Girotto Ivan

igirotto@fermi-x2:~/Ivan/test_fatica$ module purge
igirotto@fermi-x2:~/Ivan/test_fatica$ module load cuda/3.1
igirotto@fermi-x2:~/Ivan/test_fatica$ make clean
rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ make
gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o -I/ichec/packages/cuda/3.1/cuda/include -I/ichec/packages/cuda/3.1/C/common/inc/
ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o -L/ichec/packages/cuda/3.1/cuda/lib64 -L/ichec/packages/cuda/3.1/C/lib -L/ichec/packages/cuda/3.1/C/common/lib/linux/ -L/ichec/packages/cuda/3.1/OpenCL/common/lib/Linux64/ -lcudart -lcufft -lcublas -L/ichec/packages/intel/Compiler/11.1/069/mkl/lib/em64t -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5
igirotto@fermi-x2:~/Ivan/test_fatica$ ./dgemm_speed
igirotto@fermi-x2:~/Ivan/test_fatica$ module purge
igirotto@fermi-x2:~/Ivan/test_fatica$ module load cuda/3.2-rc
igirotto@fermi-x2:~/Ivan/test_fatica$ make clean
rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ make
gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o -I/ichec/packages/cuda/3.2-rc/cuda/include -I/ichec/packages/cuda/3.2-rc/C/common/inc/
ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o -L/ichec/packages/cuda/3.2-rc/cuda/lib64 -L/ichec/packages/cuda/3.2-rc/C/lib -L/ichec/packages/cuda/3.2-rc/C/common/lib/linux/ -L/ichec/packages/cuda/3.2-rc/OpenCL/common/lib/Linux64/ -lcudart -lcufft -lcublas -L/ichec/packages/intel/Compiler/11.1/069/mkl/lib/em64t -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5
igirotto@fermi-x2:~/Ivan/test_fatica$ ./dgemm_speed
!!! device access error (write C) 11
igirotto@fermi-x2:~/Ivan/test_fatica$

igirotto@fermi-x2:~/Ivan/test_fatica$ gcc --version
gcc (Debian 4.3.5-2) 4.3.5
Copyright © 2008 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

igirotto@fermi-x2:~/Ivan/test_fatica$ ifort --version
ifort (IFORT) 11.1 20100203
Copyright © 1985-2010 Intel Corporation. All rights reserved.

igirotto@fermi-x2:~/Ivan/test_fatica$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 260.24 Thu Sep 9 17:01:12 PDT 2010
GCC version: gcc version 4.3.5 (Debian 4.3.5-2)
igirotto@fermi-x2:~/Ivan/test_fatica$ uname -a
Linux fermi-x2 2.6.32-5-amd64 #1 SMP Wed Aug 25 13:59:41 UTC 2010 x86_64 GNU/Linux

I upload here all the files as I can add them by the form:

igirotto@fermi-x2:~/Ivan/test_fatica$ cat Makefile
# Build the Fortran driver (dgemm_speed) together with the C wrapper object.
# CUDA_MAKEFILE_I, CUDA_MAKEFILE_L and MKL_LIB are expected to come from the
# environment (the transcript sets them by loading a cuda module).

# "all" and "clean" do not name files, so keep make from confusing them with
# files of the same name.
.PHONY: all clean

all: dgemm_speed

dgemm_speed: dgemm_speed.f90 dgemm_ivan.o
	ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o $(CUDA_MAKEFILE_L) -lcudart -lcufft -lcublas -L$(MKL_LIB) -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5

dgemm_ivan.o: dgemm_ivan.c
	gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o $(CUDA_MAKEFILE_I)

# -f keeps "make clean" from failing when nothing has been built yet; the
# binary is removed too so a toolkit switch always relinks from scratch.
clean:
	rm -f dgemm_ivan.o dgemm_speed
igirotto@fermi-x2:~/Ivan/test_fatica$ cat dgemm_ivan.c
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>

#include “cuda_runtime.h”
#include “cublas.h”

/*define huge scratch array on GPU */
void *dev_scratch;

void cublas_dgemm_ivan_ (const char *transa, const char *transb, const int *m,
const int *n, const int *k, const double *alpha,
const double *A, const int *lda, const double *B,
const int *ldb, const double *beta, double *C, const int *ldc)
{
int m_gpu, n_gpu, k_gpu;
double *devPtrA, *devPtrB, *devPtrC;
static float split=0.7;
cublasStatus status;

cudaSetDevice(0);
cublasInit();

/* allocate scratch space for library on device */
cudaMalloc ((void**)&dev_scratch, 1.0*134217728 *8 );

// New split
k_gpu = *k ;

n_gpu = *n ;
m_gpu = ceil(*m /64*split) *64;

 devPtrA=(double *)dev_scratch;
 status = cublasSetMatrix (m_gpu, k_gpu, sizeof(A[0]), A, *lda, devPtrA, m_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
   printf ( "!!!! device access error (write A ) %d\n", status);
 }

 devPtrB=devPtrA+m_gpu * k_gpu;
 status = cublasSetMatrix (k_gpu, n_gpu, sizeof(B[0]), B, *ldb, devPtrB, k_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
    printf ("!!!! device access error (write B) %d\n", status);
 }

 devPtrC = devPtrB+k_gpu * n_gpu;
 status = cublasSetMatrix (m_gpu, n_gpu, sizeof(C[0]), C, *ldc, devPtrC, m_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
   printf ("!!!! device access error (write C) %d\n", status);
 }

 exit(1);

}
igirotto@fermi-x2:~/Ivan/test_fatica$ cat dgemm_speed.f90
! Minimal driver for the C routine CUBLAS_DGEMM_IVAN: allocates three
! square double-precision matrices, fills them with constants and passes
! them to the routine using the standard DGEMM argument order.
program matrix_multiply

implicit none

integer, parameter :: fp_kind = kind(0.0d0)
real (fp_kind), dimension(:,:), allocatable :: A, B, C
real (fp_kind):: alpha=1._fp_kind,beta=1._fp_kind
integer:: m1,m2,m3
integer:: ierr

m1=1280
m2=m1
m3=m1

allocate(A(m1,m2), B(m2,m3), C(m1,m3), stat=ierr)
if (ierr /= 0) stop 'matrix allocation failed'

! Initialize the matrices A,B and C
A=1._fp_kind
B=2._fp_kind
C=3._fp_kind

! Character arguments must use plain ASCII quotes to be valid Fortran.
call CUBLAS_DGEMM_IVAN('n','n',m1,m3,m2,alpha,A,m1,B,m2,beta,C,m1)

deallocate(A,B,C)

end program matrix_multiply

I take it the question you are actually trying to ask (albeit in the most obfuscated of fashions) is why CUBLAS is returning CUBLAS_STATUS_MAPPING_ERROR with CUDA 3.2rc and not with CUDA 3.1?

My guess is that you haven’t installed the 260-series drivers, and that there is an underlying conflict between the toolkit version and the driver.

I take it the question you are actually trying to ask (albeit in the most obfuscated of fashions) is why CUBLAS is returning CUBLAS_STATUS_MAPPING_ERROR with CUDA 3.2rc and not with CUDA 3.1?

My guess is that you haven’t installed the 260-series drivers, and that there is an underlying conflict between the toolkit version and the driver.

I take it the question you are actually trying to ask (albeit in the most obfuscated of fashions) is why CUBLAS is returning CUBLAS_STATUS_MAPPING_ERROR with CUDA 3.2rc and not with CUDA 3.1?

My guess is that you haven’t installed the 260-series drivers, and that there is an underlying conflict between the toolkit version and the driver.

You got the right question. BTW, the driver is installed as I reported. I’ve also tested the sdk/3.2 examples that are all working fine. Do you know any other way that can permit me to check if the driver works fine?

Thanks for the answer,

Ivan

igirotto@fermi-x2:~/Ivan/test_fatica$ cat /proc/driver/nvidia/version

NVRM version: NVIDIA UNIX x86_64 Kernel Module 260.24 Thu Sep 9 17:01:12 PDT 2010

GCC version: gcc version 4.3.5 (Debian 4.3.5-2)

You got the right question. BTW, the driver is installed as I reported. I’ve also tested the sdk/3.2 examples that are all working fine. Do you know any other way that can permit me to check if the driver works fine?

Thanks for the answer,

Ivan

igirotto@fermi-x2:~/Ivan/test_fatica$ cat /proc/driver/nvidia/version

NVRM version: NVIDIA UNIX x86_64 Kernel Module 260.24 Thu Sep 9 17:01:12 PDT 2010

GCC version: gcc version 4.3.5 (Debian 4.3.5-2)

From the code, can I assume that the error comes from cublasSetMatrix, not cublasSgemm?
Which call to cublasSetMatrix fails?

From the code, can I assume that the error comes from cublasSetMatrix, not cublasSgemm?
Which call to cublasSetMatrix fails?

Yes, I believe it comes from the cublasSetMatrix call, as the title of this topic says. The call that fails is the third one (matrix C), as reported by the output of the program.

Ivan

Yes, I believe it comes from the cublasSetMatrix call, as the title of this topic says. The call that fails is the third one (matrix C), as reported by the output of the program.

Ivan

cublasSetMatrix returns CUBLAS_STATUS_MAPPING_ERROR if the underlying cudaMemcpy or cudaMemcpy2D fails.

You should try to get the actual cudaError_t by calling cudaGetLastError();

cudaMalloc ((void**)&dev_scratch, 1.0*134217728 *8 ); ← this hardcoded floating point value is very suspicious.

The actual source code of cublasSetMatrix is something like this:

cudaError_t cudaStat = cudaSuccess;

const char *sp = (const char *)A;

char *dp = (char *)B;

if ((lda <= 0) || (ldb <= 0) || (elemSize <= 0) || (rows < 0) || (cols<0)){

    return CUBLAS_STATUS_INVALID_VALUE;

}

/* early out if nothing to do */

if ((rows == 0) || (cols == 0)) {

    return CUBLAS_STATUS_SUCCESS;

}

if ((rows == lda) && (rows == ldb)) {

    cudaStat = cudaMemcpy (dp, sp, 

                           (size_t)rows * (size_t)cols * (size_t)elemSize, 

                           cudaMemcpyHostToDevice);

}  else {

    cudaStat = cudaMemcpy2D (dp, (size_t)ldb * (size_t)elemSize, 

                             sp, (size_t)lda * (size_t)elemSize,

                             (size_t)rows * (size_t)elemSize, cols,

                             cudaMemcpyHostToDevice);

}

if (cudaStat != cudaSuccess) {

    return CUBLAS_STATUS_MAPPING_ERROR;

}

return CUBLAS_STATUS_SUCCESS;

With this, you should be able to find your problem quickly.

cublasSetMatrix returns CUBLAS_STATUS_MAPPING_ERROR if the underlying cudaMemcpy or cudaMemcpy2D fails.

You should try to get the actual cudaError_t by calling cudaGetLastError();

cudaMalloc ((void**)&dev_scratch, 1.0*134217728 *8 ); ← this hardcoded floating point value is very suspicious.

The actual source code of cublasSetMatrix is something like this:

cudaError_t cudaStat = cudaSuccess;

const char *sp = (const char *)A;

char *dp = (char *)B;

if ((lda <= 0) || (ldb <= 0) || (elemSize <= 0) || (rows < 0) || (cols<0)){

    return CUBLAS_STATUS_INVALID_VALUE;

}

/* early out if nothing to do */

if ((rows == 0) || (cols == 0)) {

    return CUBLAS_STATUS_SUCCESS;

}

if ((rows == lda) && (rows == ldb)) {

    cudaStat = cudaMemcpy (dp, sp, 

                           (size_t)rows * (size_t)cols * (size_t)elemSize, 

                           cudaMemcpyHostToDevice);

}  else {

    cudaStat = cudaMemcpy2D (dp, (size_t)ldb * (size_t)elemSize, 

                             sp, (size_t)lda * (size_t)elemSize,

                             (size_t)rows * (size_t)elemSize, cols,

                             cudaMemcpyHostToDevice);

}

if (cudaStat != cudaSuccess) {

    return CUBLAS_STATUS_MAPPING_ERROR;

}

return CUBLAS_STATUS_SUCCESS;

With this, you should be able to find your problem quickly.