Hi,
lately I started to use the cuda-memcheck tool to look for out-of-bounds accesses etc…
I found that this check crashes my application. The problem seems to be in a call to the CUBLAS
cgemm routine. The result is calculated correctly and the program runs fine. However, if I run
cuda-memcheck I get a segmentation fault.
I’m running the program on a Tesla C2050 with CUDA 4.0.
It would be great if somebody could verify this problem.
Cheers,
Eric
The program I used is the following:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <cublas.h>
#include <cuda_runtime_api.h>
// Transposition flags for the cublasCgemm call in main():
// 'T' = transpose the first operand, 'N' = use the second operand as-is.
// NOTE(review): for complex data a conjugate transpose would be 'C' — 'T' is
// a plain transpose; confirm which one is actually intended.
const char TransA = 'T'; const char TransB = 'N';
// Complex data type: single-precision complex from the CUDA toolkit
// (.x = real part, .y = imaginary part)
typedef cuComplex Complex;
// Report the most recent CUDA runtime error, if any.
// Diagnostics go to stderr (not stdout) so they are not interleaved with the
// program's matrix output. Note that cudaGetLastError() also CLEARS the
// sticky error state, so each failure is reported once.
__inline__ void CUDA_CHECK_ERROR(void)
{
cudaError_t cuErr = cudaGetLastError();
if(cuErr != cudaSuccess)
fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(cuErr));
}
// Report the most recent CUBLAS status, if it is not success.
// cublasGetError() reads and resets the per-thread error state of the legacy
// CUBLAS API. The status is printed to stderr with a label (a bare "%i" on
// stdout was indistinguishable from program output); the cast to int keeps
// the printf conversion portable for the enum type.
__inline__ void CUBLAS_CHECK_ERROR(void)
{
cublasStatus_t cubErr = cublasGetError();
if(cubErr != CUBLAS_STATUS_SUCCESS)
fprintf(stderr, "CUBLAS error: %i\n", (int)cubErr);
}
// Print the top-left N x M block of a column-major complex matrix to stdout,
// one text row per matrix row. lda is the leading dimension (the stride, in
// elements, between consecutive columns); callers pass lda >= N.
void print_matrix (const Complex *A, const int lda, const int N, const int M)
{
for (int row = 0; row < N; ++row) {
for (int col = 0; col < M; ++col) {
const Complex v = A[col*lda + row];
printf("%8.2f+%8.2f*I\t", v.x, v.y);
}
printf("\n");
}
}
// Minimal repro: C := alpha * A^T * A + beta * C via legacy cublasCgemm,
// using an M x M matrix stored with leading dimension ld > M.
int main(int argc, char* argv[]) {
const int M = 2;
const int ld = 4;                 // leading dimension: (ld - M) padding rows per column
const int Msize = ld*M;           // total elements per buffer, including padding
const Complex alpha = {.x=1.0f, .y=2.0f};
const Complex beta  = {.x=1.0f, .y=2.0f};
Complex *A_D, *C_D; // pointers to DEVICE memory
Complex *A, *C; // pointers to HOST memory
cublasInit(); CUBLAS_CHECK_ERROR();
// Allocate arrays on DEVICE
cudaMalloc ((void **) &A_D, (size_t)(sizeof(Complex) * Msize)); CUDA_CHECK_ERROR();
cudaMalloc ((void **) &C_D, (size_t)(sizeof(Complex) * Msize)); CUDA_CHECK_ERROR();
// Allocate arrays on HOST. calloc zero-fills the buffers so the padding
// elements (rows M..ld-1 of each column) are defined before the whole
// buffer is copied to the device -- plain malloc left them uninitialized,
// which memory-checking tools legitimately flag.
A = (Complex*) calloc((size_t)Msize, sizeof(Complex));
C = (Complex*) calloc((size_t)Msize, sizeof(Complex));
if (A == NULL || C == NULL) {
fprintf(stderr, "host allocation failed\n");
return EXIT_FAILURE;
}
// Initialize host data (only the M x M top-left block is meaningful)
for (int i=0; i < M; ++i) {
for (int j=0; j < M; ++j) {
A [j*ld + i].x = 2.0f; A [j*ld + i].y = 2.0f;
C [j*ld + i].x = 4.0f; C [j*ld + i].y = 4.0f;
}
}
printf("\nMatrix A\n"); print_matrix (A,ld,M,M);
printf("\nMatrix C before CGEMM\n"); print_matrix (C,ld,M,M);
// Copy input-data from host memory to device memory
cudaMemcpy (A_D, A, (size_t)(sizeof(Complex)*Msize), cudaMemcpyHostToDevice); CUDA_CHECK_ERROR();
cudaMemcpy (C_D, C, (size_t)(sizeof(Complex)*Msize), cudaMemcpyHostToDevice); CUDA_CHECK_ERROR();
// Call the CGEMM kernel (legacy column-major CUBLAS API; alpha/beta by value)
printf ("\nbefore CGEMM call\n");
cublasCgemm (TransA,TransB,M,M,M,alpha,A_D,ld,A_D,ld,beta,C_D,ld); CUBLAS_CHECK_ERROR();
printf ("\nafter CGEMM call\n");
// Copy result back to host (the blocking memcpy also synchronizes with the
// asynchronously launched GEMM)
cudaMemcpy (C, C_D, (size_t)(sizeof(Complex)*Msize), cudaMemcpyDeviceToHost); CUDA_CHECK_ERROR();
printf("\nMatrix C after CGEMM \n"); print_matrix (C,ld,M,M);
// Clean-Up
cudaFree (C_D); cudaFree (A_D);
free (C); free (A);
cublasShutdown (); CUBLAS_CHECK_ERROR();
return EXIT_SUCCESS;
}
which was compiled with
# Location of the CUDA toolkit
CUDA_PATH = /opt/cuda
CUDA_INC_PATH = $(CUDA_PATH)/include
CUDA_LIB_PATH = $(CUDA_PATH)/lib64
LIBRARY = -L/usr/lib64 -L$(CUDA_LIB_PATH)
INCLUDE = -I$(CUDA_INC_PATH)
NVCC = $(CUDA_PATH)/bin/nvcc
# -g -G: host + device debug info; -arch sm_20: Fermi (matches a Tesla C2050)
NVCCFLAGS = -g -G -v -arch sm_20 -Xptxas="-v"
NVCCOPS = -gencode arch=compute_20,code=compute_20 --maxrregcount 63 -Xcompiler -std=gnu99

# 'all' and 'clean' produce no file of that name; declare them phony so a
# stray file called "all" or "clean" cannot shadow the targets.
.PHONY: all clean

all: test

test: cgemm.cu
	$(NVCC) $(NVCCFLAGS) $(NVCCOPS) $(INCLUDE) $(LIBRARY) -lcublas -lcudart -o test cgemm.cu

clean:
	rm -f *.o *.bak test
Running it I get the following output:
which gives the correct result. If I run it under cuda-memcheck,
the program crashes and I see the following line in the kernel messages