Strange behaviour setMatrix 3.2 v 3.1

igirotto · September 28, 2010, 5:00am

Dear all,

I wrote a driver to intercept calls to the gemm functions, which ran fine until version 3.2 of CUBLAS. I rebuilt a small test case that reproduces the error so that you can see what is happening. I would really appreciate any help in fixing this problem.

On hand for any information you need.

Regards,

Girotto Ivan

igirotto@fermi-x2:~/Ivan/test_fatica$ module purge
igirotto@fermi-x2:~/Ivan/test_fatica$ module load cuda/3.1
igirotto@fermi-x2:~/Ivan/test_fatica$ make clean
rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ make
gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o -I/ichec/packages/cuda/3.1/cuda/include -I/ichec/packages/cuda/3.1/C/common/inc/
ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o -L/ichec/packages/cuda/3.1/cuda/lib64 -L/ichec/packages/cuda/3.1/C/lib -L/ichec/packages/cuda/3.1/C/common/lib/linux/ -L/ichec/packages/cuda/3.1/OpenCL/common/lib/Linux64/ -lcudart -lcufft -lcublas -L/ichec/packages/intel/Compiler/11.1/069/mkl/lib/em64t -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5
igirotto@fermi-x2:~/Ivan/test_fatica$ ./dgemm_speed
igirotto@fermi-x2:~/Ivan/test_fatica$ module purge
igirotto@fermi-x2:~/Ivan/test_fatica$ module load cuda/3.2-rc
igirotto@fermi-x2:~/Ivan/test_fatica$ make clean
rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ make
gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o -I/ichec/packages/cuda/3.2-rc/cuda/include -I/ichec/packages/cuda/3.2-rc/C/common/inc/
ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o -L/ichec/packages/cuda/3.2-rc/cuda/lib64 -L/ichec/packages/cuda/3.2-rc/C/lib -L/ichec/packages/cuda/3.2-rc/C/common/lib/linux/ -L/ichec/packages/cuda/3.2-rc/OpenCL/common/lib/Linux64/ -lcudart -lcufft -lcublas -L/ichec/packages/intel/Compiler/11.1/069/mkl/lib/em64t -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5
igirotto@fermi-x2:~/Ivan/test_fatica$ ./dgemm_speed
!!! device access error (write C) 11
igirotto@fermi-x2:~/Ivan/test_fatica$

igirotto@fermi-x2:~/Ivan/test_fatica$ gcc --version
gcc (Debian 4.3.5-2) 4.3.5
Copyright © 2008 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

igirotto@fermi-x2:~/Ivan/test_fatica$ ifort --version
ifort (IFORT) 11.1 20100203
Copyright © 1985-2010 Intel Corporation. All rights reserved.

igirotto@fermi-x2:~/Ivan/test_fatica$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 260.24 Thu Sep 9 17:01:12 PDT 2010
GCC version: gcc version 4.3.5 (Debian 4.3.5-2)
igirotto@fermi-x2:~/Ivan/test_fatica$ uname -a
Linux fermi-x2 2.6.32-5-amd64 #1 SMP Wed Aug 25 13:59:41 UTC 2010 x86_64 GNU/Linux

I upload here all the files as I can add them by the form:

igirotto@fermi-x2:~/Ivan/test_fatica$ cat Makefile
# Builds the Fortran test program dgemm_speed and links it with the C driver
# object dgemm_ivan.o.
# CUDA_MAKEFILE_I, CUDA_MAKEFILE_L and MKL_LIB are not defined in this file;
# presumably they come from the environment (e.g. the loaded modules) - confirm.
# Recipe lines must begin with a TAB character.
.PHONY: all clean

all: dgemm_speed

# Compile the Fortran program and link it against cudart/cufft/cublas and MKL.
dgemm_speed: dgemm_speed.f90 dgemm_ivan.o
	ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o $(CUDA_MAKEFILE_L) -lcudart -lcufft -lcublas -L$(MKL_LIB) -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5

# Compile the C driver as position-independent code.
dgemm_ivan.o: dgemm_ivan.c
	gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o $(CUDA_MAKEFILE_I)

# Remove the driver object file only.
clean:
	rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ cat dgemm_ivan.c
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>

#include "cuda_runtime.h"
#include "cublas.h"

/*define huge scratch array on GPU */
void *dev_scratch;

void cublas_dgemm_ivan_ (const char *transa, const char *transb, const int *m,
const int *n, const int *k, const double *alpha,
const double *A, const int *lda, const double *B,
const int *ldb, const double *beta, double *C, const int *ldc)
{
int m_gpu, n_gpu, k_gpu;
double *devPtrA, *devPtrB, *devPtrC;
static float split=0.7;
cublasStatus status;

cudaSetDevice(0);
cublasInit();

/* allocate scratch space for library on device */
cudaMalloc ((void**)&dev_scratch, 1.0*134217728 *8 );

// New split
k_gpu = *k ;

n_gpu = *n ;
m_gpu = ceil(*m /64*split) *64;

 devPtrA=(double *)dev_scratch;
 status = cublasSetMatrix (m_gpu, k_gpu, sizeof(A[0]), A, *lda, devPtrA, m_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
   printf ( "!!!! device access error (write A ) %d\n", status);
 }

 devPtrB=devPtrA+m_gpu * k_gpu;
 status = cublasSetMatrix (k_gpu, n_gpu, sizeof(B[0]), B, *ldb, devPtrB, k_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
    printf ("!!!! device access error (write B) %d\n", status);
 }

 devPtrC = devPtrB+k_gpu * n_gpu;
 status = cublasSetMatrix (m_gpu, n_gpu, sizeof(C[0]), C, *ldc, devPtrC, m_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
   printf ("!!!! device access error (write C) %d\n", status);
 }

 exit(1);

}
igirotto@fermi-x2:~/Ivan/test_fatica$ cat dgemm_speed.f90
! Minimal reproducer: allocates three square double-precision matrices, fills
! them with constants and passes them to the C driver CUBLAS_DGEMM_IVAN using
! the standard DGEMM argument list.
program matrix_multiply

implicit none

! Kind of a double-precision real.
integer, parameter :: fp_kind = kind(0.0d0)
real (fp_kind), dimension(:,:), allocatable :: A, B, C
real (fp_kind) :: alpha=1._fp_kind, beta=1._fp_kind
integer :: m1, m2, m3

! Square problem: A is m1 x m2, B is m2 x m3, C is m1 x m3.
m1=1280
m2=m1
m3=m1

allocate(A(m1,m2))
allocate(B(m2,m3))
allocate(C(m1,m3))

! Initialize the matrices A,B and C
A=1._fp_kind
B=2._fp_kind
C=3._fp_kind

! Plain ASCII apostrophes delimit the character arguments.
call CUBLAS_DGEMM_IVAN('n','n',m1,m3,m2,alpha,A,m1,B,m2,beta,C,m1)

! As posted, the C driver ends with exit(1), so this line is not reached.
deallocate(A,B,C)

end program matrix_multiply

igirotto · September 28, 2010, 5:00am

Dear all,

I wrote a driver to intercept calls to the gemm functions, which ran fine until version 3.2 of CUBLAS. I rebuilt a small test case that reproduces the error so that you can see what is happening. I would really appreciate any help in fixing this problem.

On hand for any information you need.

Regards,

Girotto Ivan

igirotto@fermi-x2:~/Ivan/test_fatica$ module purge
igirotto@fermi-x2:~/Ivan/test_fatica$ module load cuda/3.1
igirotto@fermi-x2:~/Ivan/test_fatica$ make clean
rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ make
gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o -I/ichec/packages/cuda/3.1/cuda/include -I/ichec/packages/cuda/3.1/C/common/inc/
ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o -L/ichec/packages/cuda/3.1/cuda/lib64 -L/ichec/packages/cuda/3.1/C/lib -L/ichec/packages/cuda/3.1/C/common/lib/linux/ -L/ichec/packages/cuda/3.1/OpenCL/common/lib/Linux64/ -lcudart -lcufft -lcublas -L/ichec/packages/intel/Compiler/11.1/069/mkl/lib/em64t -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5
igirotto@fermi-x2:~/Ivan/test_fatica$ ./dgemm_speed
igirotto@fermi-x2:~/Ivan/test_fatica$ module purge
igirotto@fermi-x2:~/Ivan/test_fatica$ module load cuda/3.2-rc
igirotto@fermi-x2:~/Ivan/test_fatica$ make clean
rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ make
gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o -I/ichec/packages/cuda/3.2-rc/cuda/include -I/ichec/packages/cuda/3.2-rc/C/common/inc/
ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o -L/ichec/packages/cuda/3.2-rc/cuda/lib64 -L/ichec/packages/cuda/3.2-rc/C/lib -L/ichec/packages/cuda/3.2-rc/C/common/lib/linux/ -L/ichec/packages/cuda/3.2-rc/OpenCL/common/lib/Linux64/ -lcudart -lcufft -lcublas -L/ichec/packages/intel/Compiler/11.1/069/mkl/lib/em64t -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5
igirotto@fermi-x2:~/Ivan/test_fatica$ ./dgemm_speed
!!! device access error (write C) 11
igirotto@fermi-x2:~/Ivan/test_fatica$

igirotto@fermi-x2:~/Ivan/test_fatica$ gcc --version
gcc (Debian 4.3.5-2) 4.3.5
Copyright © 2008 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

igirotto@fermi-x2:~/Ivan/test_fatica$ ifort --version
ifort (IFORT) 11.1 20100203
Copyright © 1985-2010 Intel Corporation. All rights reserved.

igirotto@fermi-x2:~/Ivan/test_fatica$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 260.24 Thu Sep 9 17:01:12 PDT 2010
GCC version: gcc version 4.3.5 (Debian 4.3.5-2)
igirotto@fermi-x2:~/Ivan/test_fatica$ uname -a
Linux fermi-x2 2.6.32-5-amd64 #1 SMP Wed Aug 25 13:59:41 UTC 2010 x86_64 GNU/Linux

I upload here all the files as I can add them by the form:

igirotto@fermi-x2:~/Ivan/test_fatica$ cat Makefile
# Builds the Fortran test program dgemm_speed and links it with the C driver
# object dgemm_ivan.o.
# CUDA_MAKEFILE_I, CUDA_MAKEFILE_L and MKL_LIB are not defined in this file;
# presumably they come from the environment (e.g. the loaded modules) - confirm.
# Recipe lines must begin with a TAB character.
.PHONY: all clean

all: dgemm_speed

# Compile the Fortran program and link it against cudart/cufft/cublas and MKL.
dgemm_speed: dgemm_speed.f90 dgemm_ivan.o
	ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o $(CUDA_MAKEFILE_L) -lcudart -lcufft -lcublas -L$(MKL_LIB) -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5

# Compile the C driver as position-independent code.
dgemm_ivan.o: dgemm_ivan.c
	gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o $(CUDA_MAKEFILE_I)

# Remove the driver object file only.
clean:
	rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ cat dgemm_ivan.c
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdio.h>

#include "cuda_runtime.h"
#include "cublas.h"

/*define huge scratch array on GPU */
void *dev_scratch;

void cublas_dgemm_ivan_ (const char *transa, const char *transb, const int *m,
const int *n, const int *k, const double *alpha,
const double *A, const int *lda, const double *B,
const int *ldb, const double *beta, double *C, const int *ldc)
{
int m_gpu, n_gpu, k_gpu;
double *devPtrA, *devPtrB, *devPtrC;
static float split=0.7;
cublasStatus status;

cudaSetDevice(0);
cublasInit();

/* allocate scratch space for library on device */
cudaMalloc ((void**)&dev_scratch, 1.0*134217728 *8 );

// New split
k_gpu = *k ;

n_gpu = *n ;
m_gpu = ceil(*m /64*split) *64;

 devPtrA=(double *)dev_scratch;
 status = cublasSetMatrix (m_gpu, k_gpu, sizeof(A[0]), A, *lda, devPtrA, m_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
   printf ( "!!!! device access error (write A ) %d\n", status);
 }

 devPtrB=devPtrA+m_gpu * k_gpu;
 status = cublasSetMatrix (k_gpu, n_gpu, sizeof(B[0]), B, *ldb, devPtrB, k_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
    printf ("!!!! device access error (write B) %d\n", status);
 }

 devPtrC = devPtrB+k_gpu * n_gpu;
 status = cublasSetMatrix (m_gpu, n_gpu, sizeof(C[0]), C, *ldc, devPtrC, m_gpu);
 if (status != CUBLAS_STATUS_SUCCESS) {
   printf ("!!!! device access error (write C) %d\n", status);
 }

 exit(1);

}
igirotto@fermi-x2:~/Ivan/test_fatica$ cat dgemm_speed.f90
! Minimal reproducer: allocates three square double-precision matrices, fills
! them with constants and passes them to the C driver CUBLAS_DGEMM_IVAN using
! the standard DGEMM argument list.
program matrix_multiply

implicit none

! Kind of a double-precision real.
integer, parameter :: fp_kind = kind(0.0d0)
real (fp_kind), dimension(:,:), allocatable :: A, B, C
real (fp_kind) :: alpha=1._fp_kind, beta=1._fp_kind
integer :: m1, m2, m3

! Square problem: A is m1 x m2, B is m2 x m3, C is m1 x m3.
m1=1280
m2=m1
m3=m1

allocate(A(m1,m2))
allocate(B(m2,m3))
allocate(C(m1,m3))

! Initialize the matrices A,B and C
A=1._fp_kind
B=2._fp_kind
C=3._fp_kind

! Plain ASCII apostrophes delimit the character arguments.
call CUBLAS_DGEMM_IVAN('n','n',m1,m3,m2,alpha,A,m1,B,m2,beta,C,m1)

! As posted, the C driver ends with exit(1), so this line is not reached.
deallocate(A,B,C)

end program matrix_multiply

avidday · September 28, 2010, 8:40am

I take it the question you are actually trying to ask (albeit in the most obfuscated of fashions) is why CUBLAS is returning CUBLAS_STATUS_MAPPING_ERROR with CUDA 3.2rc and not with CUDA 3.1?

I will take a guess that the answer is you haven’t installed series 260 drivers, and there is an underlying toolkit version versus driver conflict.

avidday · September 28, 2010, 8:40am

I take it the question you are actually trying to ask (albeit in the most obfuscated of fashions) is why CUBLAS is returning CUBLAS_STATUS_MAPPING_ERROR with CUDA 3.2rc and not with CUDA 3.1?

I will take a guess that the answer is you haven’t installed series 260 drivers, and there is an underlying toolkit version versus driver conflict.

avidday · September 28, 2010, 8:40am

I take it the question you are actually trying to ask (albeit in the most obfuscated of fashions) is why CUBLAS is returning CUBLAS_STATUS_MAPPING_ERROR with CUDA 3.2rc and not with CUDA 3.1?

I will take a guess that the answer is you haven’t installed series 260 drivers, and there is an underlying toolkit version versus driver conflict.

igirotto · September 28, 2010, 2:23pm

You got the right question. BTW, the driver is installed as I reported. I’ve also tested the sdk/3.2 examples that are all working fine. Do you know any other way that can permit me to check if the driver works fine?

Thanks for the answer,

Ivan

igirotto@fermi-x2:~/Ivan/test_fatica$ cat /proc/driver/nvidia/version

NVRM version: NVIDIA UNIX x86_64 Kernel Module 260.24 Thu Sep 9 17:01:12 PDT 2010

GCC version: gcc version 4.3.5 (Debian 4.3.5-2)

igirotto · September 28, 2010, 2:23pm

You got the right question. BTW, the driver is installed as I reported. I’ve also tested the sdk/3.2 examples that are all working fine. Do you know any other way that can permit me to check if the driver works fine?

Thanks for the answer,

Ivan

igirotto@fermi-x2:~/Ivan/test_fatica$ cat /proc/driver/nvidia/version

NVRM version: NVIDIA UNIX x86_64 Kernel Module 260.24 Thu Sep 9 17:01:12 PDT 2010

GCC version: gcc version 4.3.5 (Debian 4.3.5-2)

philippev · September 28, 2010, 6:59pm

From the code, can I assume that the error comes from cublasSetMatrix, not cublasSgemm?
Which call to cublasSetMatrix fails?

philippev · September 28, 2010, 6:59pm

From the code, can I assume that the error comes from cublasSetMatrix, not cublasSgemm?
Which call to cublasSetMatrix fails?

igirotto · September 29, 2010, 3:41am

Yes, I guess it comes from the cublasSetMatrix call, as the title of this topic says. The one that fails is the third one (matrix C), as reported by the output of the program.

Ivan

igirotto · September 29, 2010, 3:41am

Yes, I guess it comes from the cublasSetMatrix call, as the title of this topic says. The one that fails is the third one (matrix C), as reported by the output of the program.

Ivan

philippev · October 22, 2010, 11:22pm

cublasSetMatrix returns CUBLAS_STATUS_MAPPING_ERROR if the underlying cudaMemcpy or cudaMemcpy2D fails.

You should try to get the actual cudaError_t by calling cudaGetLastError();

cudaMalloc ((void**)&dev_scratch, 1.0*134217728 *8 ); ← this hardcoded floating point value is very suspicious.

The actual source code of cublasSetMatrix is something like this:

cudaError_t cudaStat = cudaSuccess;

const char *sp = (const char *)A;

char *dp = (char *)B;

if ((lda <= 0) || (ldb <= 0) || (elemSize <= 0) || (rows < 0) || (cols<0)){

    return CUBLAS_STATUS_INVALID_VALUE;

}

/* early out if nothing to do */

if ((rows == 0) || (cols == 0)) {

    return CUBLAS_STATUS_SUCCESS;

}

if ((rows == lda) && (rows == ldb)) {

    cudaStat = cudaMemcpy (dp, sp, 

                           (size_t)rows * (size_t)cols * (size_t)elemSize, 

                           cudaMemcpyHostToDevice);

}  else {

    cudaStat = cudaMemcpy2D (dp, (size_t)ldb * (size_t)elemSize, 

                             sp, (size_t)lda * (size_t)elemSize,

                             (size_t)rows * (size_t)elemSize, cols,

                             cudaMemcpyHostToDevice);

}

if (cudaStat != cudaSuccess) {

    return CUBLAS_STATUS_MAPPING_ERROR;

}

return CUBLAS_STATUS_SUCCESS;

With this, you should be able to find your problem quickly.

philippev · October 22, 2010, 11:22pm

cublasSetMatrix returns CUBLAS_STATUS_MAPPING_ERROR if the underlying cudaMemcpy or cudaMemcpy2D fails.

You should try to get the actual cudaError_t by calling cudaGetLastError();

cudaMalloc ((void**)&dev_scratch, 1.0*134217728 *8 ); ← this hardcoded floating point value is very suspicious.

The actual source code of cublasSetMatrix is something like this:

cudaError_t cudaStat = cudaSuccess;

const char *sp = (const char *)A;

char *dp = (char *)B;

if ((lda <= 0) || (ldb <= 0) || (elemSize <= 0) || (rows < 0) || (cols<0)){

    return CUBLAS_STATUS_INVALID_VALUE;

}

/* early out if nothing to do */

if ((rows == 0) || (cols == 0)) {

    return CUBLAS_STATUS_SUCCESS;

}

if ((rows == lda) && (rows == ldb)) {

    cudaStat = cudaMemcpy (dp, sp, 

                           (size_t)rows * (size_t)cols * (size_t)elemSize, 

                           cudaMemcpyHostToDevice);

}  else {

    cudaStat = cudaMemcpy2D (dp, (size_t)ldb * (size_t)elemSize, 

                             sp, (size_t)lda * (size_t)elemSize,

                             (size_t)rows * (size_t)elemSize, cols,

                             cudaMemcpyHostToDevice);

}

if (cudaStat != cudaSuccess) {

    return CUBLAS_STATUS_MAPPING_ERROR;

}

return CUBLAS_STATUS_SUCCESS;

With this, you should be able to find your problem quickly.

Topic		Replies	Views
CUBLAS_STATUS_MAPPING_ERROR in cublasGetMatrix() after cublasDgemm() GPU-Accelerated Libraries	10	9340	February 21, 2013
cublasDgemm returns wrong results for large matrix dimensions? CUDA Programming and Performance	12	3240	November 30, 2010
CUBLAS_STATUS_MAPPING_ERROR in cublasGetMatrix() CUDA Programming and Performance	0	5839	July 20, 2009
cublasSgemm() alway fail during compute intensify task CUDA Programming and Performance	14	4638	January 8, 2015
cublasZgemm fails on FERMI but not on TESLA CUBLAS_STATUS_NOT_INITIALIZED even if 'cublasInit()& CUDA Programming and Performance	2	5924	February 17, 2011
Help Matrix Multiplication using cuBLAS CUDA Programming and Performance	10	23932	July 24, 2010
CUBLAS_STATUS_MAPPING_ERROR with basic matrix multiplication code GPU-Accelerated Libraries	2	1080	April 4, 2017
strange things of cublas 3.0 on RHEL 5.3 CUDA Programming and Performance	2	3826	April 14, 2010
strange things of cublas 3.0 on RHEL 5.3 CUDA Programming and Performance	0	1137	April 8, 2010
strange things of cublas 3.0 on RHEL 5.3 CUDA Programming and Performance	0	2691	April 8, 2010

Strange behaviour setMatrix 3.2 v 3.1

Related topics