Dear all,
I wrote a driver that intercepts calls to the gemm functions. It runs fine with CUBLAS 3.1 but fails since version 3.2 of CUBLAS. I have reduced it to a small test case that reproduces the error, so that you can see what happens. I would really appreciate any help in fixing this problem.
I am available for any further information you need.
Regards,
Girotto Ivan
igirotto@fermi-x2:~/Ivan/test_fatica$ module purge
igirotto@fermi-x2:~/Ivan/test_fatica$ module load cuda/3.1
igirotto@fermi-x2:~/Ivan/test_fatica$ make clean
rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ make
gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o -I/ichec/packages/cuda/3.1/cuda/include -I/ichec/packages/cuda/3.1/C/common/inc/
ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o -L/ichec/packages/cuda/3.1/cuda/lib64 -L/ichec/packages/cuda/3.1/C/lib -L/ichec/packages/cuda/3.1/C/common/lib/linux/ -L/ichec/packages/cuda/3.1/OpenCL/common/lib/Linux64/ -lcudart -lcufft -lcublas -L/ichec/packages/intel/Compiler/11.1/069/mkl/lib/em64t -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5
igirotto@fermi-x2:~/Ivan/test_fatica$ ./dgemm_speed
igirotto@fermi-x2:~/Ivan/test_fatica$ module purge
igirotto@fermi-x2:~/Ivan/test_fatica$ module load cuda/3.2-rc
igirotto@fermi-x2:~/Ivan/test_fatica$ make clean
rm dgemm_ivan.o
igirotto@fermi-x2:~/Ivan/test_fatica$ make
gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o -I/ichec/packages/cuda/3.2-rc/cuda/include -I/ichec/packages/cuda/3.2-rc/C/common/inc/
ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o -L/ichec/packages/cuda/3.2-rc/cuda/lib64 -L/ichec/packages/cuda/3.2-rc/C/lib -L/ichec/packages/cuda/3.2-rc/C/common/lib/linux/ -L/ichec/packages/cuda/3.2-rc/OpenCL/common/lib/Linux64/ -lcudart -lcufft -lcublas -L/ichec/packages/intel/Compiler/11.1/069/mkl/lib/em64t -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5
igirotto@fermi-x2:~/Ivan/test_fatica$ ./dgemm_speed
!!! device access error (write C) 11
igirotto@fermi-x2:~/Ivan/test_fatica$
igirotto@fermi-x2:~/Ivan/test_fatica$ gcc --version
gcc (Debian 4.3.5-2) 4.3.5
Copyright © 2008 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
igirotto@fermi-x2:~/Ivan/test_fatica$ ifort --version
ifort (IFORT) 11.1 20100203
Copyright © 1985-2010 Intel Corporation. All rights reserved.
igirotto@fermi-x2:~/Ivan/test_fatica$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 260.24 Thu Sep 9 17:01:12 PDT 2010
GCC version: gcc version 4.3.5 (Debian 4.3.5-2)
igirotto@fermi-x2:~/Ivan/test_fatica$ uname -a
Linux fermi-x2 2.6.32-5-amd64 #1 SMP Wed Aug 25 13:59:41 UTC 2010 x86_64 GNU/Linux
I am posting all the files inline here, as I could not attach them through the form:
igirotto@fermi-x2:~/Ivan/test_fatica$ cat Makefile
# Builds the dgemm_speed reproducer: a Fortran driver linked against a C
# wrapper that uses the CUDA runtime and CUBLAS.
#
# CUDA_MAKEFILE_I, CUDA_MAKEFILE_L and MKL_LIB are not defined in this file;
# presumably they come from the environment (e.g. the loaded cuda module) --
# verify before building.

# all and clean are commands, not files.
.PHONY: all clean

all: dgemm_speed

# Recipe lines below MUST start with a TAB character.
dgemm_speed: dgemm_speed.f90 dgemm_ivan.o
	ifort -O3 -o dgemm_speed dgemm_speed.f90 dgemm_ivan.o $(CUDA_MAKEFILE_L) -lcudart -lcufft -lcublas -L$(MKL_LIB) -lmkl_lapack -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -liomp5

dgemm_ivan.o: dgemm_ivan.c
	gcc -O3 -c -fPIC dgemm_ivan.c -o dgemm_ivan.o $(CUDA_MAKEFILE_I)

# -f: do not fail when the files are already gone. The executable is removed
# too, so that switching CUDA versions never leaves a stale binary behind.
clean:
	rm -f dgemm_ivan.o dgemm_speed
igirotto@fermi-x2:~/Ivan/test_fatica$ cat dgemm_ivan.c
#define _GNU_SOURCE
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "cublas.h"
/*define huge scratch array on GPU */
void *dev_scratch;
void cublas_dgemm_ivan_ (const char *transa, const char *transb, const int *m,
const int *n, const int *k, const double *alpha,
const double *A, const int *lda, const double *B,
const int *ldb, const double *beta, double *C, const int *ldc)
{
int m_gpu, n_gpu, k_gpu;
double *devPtrA, *devPtrB, *devPtrC;
static float split=0.7;
cublasStatus status;
cudaSetDevice(0);
cublasInit();
/* allocate scratch space for library on device */
cudaMalloc ((void**)&dev_scratch, 1.0*134217728 *8 );
// New split
k_gpu = *k ;
n_gpu = *n ;
m_gpu = ceil(*m /64*split) *64;
devPtrA=(double *)dev_scratch;
status = cublasSetMatrix (m_gpu, k_gpu, sizeof(A[0]), A, *lda, devPtrA, m_gpu);
if (status != CUBLAS_STATUS_SUCCESS) {
printf ( "!!!! device access error (write A ) %d\n", status);
}
devPtrB=devPtrA+m_gpu * k_gpu;
status = cublasSetMatrix (k_gpu, n_gpu, sizeof(B[0]), B, *ldb, devPtrB, k_gpu);
if (status != CUBLAS_STATUS_SUCCESS) {
printf ("!!!! device access error (write B) %d\n", status);
}
devPtrC = devPtrB+k_gpu * n_gpu;
status = cublasSetMatrix (m_gpu, n_gpu, sizeof(C[0]), C, *ldc, devPtrC, m_gpu);
if (status != CUBLAS_STATUS_SUCCESS) {
printf ("!!!! device access error (write C) %d\n", status);
}
exit(1);
}
igirotto@fermi-x2:~/Ivan/test_fatica$ cat dgemm_speed.f90
! Minimal driver for the CUBLAS_DGEMM_IVAN reproducer.
!
! Allocates three square double-precision matrices of order 1280, fills them
! with constants and passes them to the external routine CUBLAS_DGEMM_IVAN
! using a DGEMM-style argument list.
program matrix_multiply
  implicit none

  ! Kind of a double-precision real.
  integer, parameter :: fp_kind = kind(0.0d0)

  real (fp_kind), dimension(:,:), allocatable :: A, B, C
  real (fp_kind) :: alpha=1._fp_kind, beta=1._fp_kind
  integer :: m1, m2, m3

  m1=1280
  m2=m1
  m3=m1

  allocate(A(m1,m2))
  allocate(B(m2,m3))
  allocate(C(m1,m3))

  ! Initialize the matrices A,B and C
  A=1._fp_kind
  B=2._fp_kind
  C=3._fp_kind

  ! Character arguments must be delimited by plain ASCII quotes.
  call CUBLAS_DGEMM_IVAN('n','n',m1,m3,m2,alpha,A,m1,B,m2,beta,C,m1)

  deallocate(A,B,C)
end program matrix_multiply