Try calling cublasgemmBatch in OpenACC

I tried using OpenACC to call the cublasgemmBatch function in cubals, but I tried many times and always prompted me


Failing in Thread: 1

Call to cuStreamSynchronize returned error 700: Illegal address during kernel execution

But the same code can be compiled and executed using the NVCC compiler, which I believe may be due to incorrect function calls, but I have tried many times and still keep making mistakes

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define M 2
#define N 10
#define K 5
#define BATCH_SIZE 2

int main()
{
    cudaError_t cudaStatus;
    cublasStatus_t cublasStatus;
    cublasHandle_t handle;

    float*h_A=(float*)malloc(BATCH_SIZE*M * N*sizeof(float)); 
    float*h_B=(float*)malloc(BATCH_SIZE*N * K*sizeof(float)); 
    float*h_C=(float*)malloc(BATCH_SIZE*M * K*sizeof(float));     

for(int i=0;i<BATCH_SIZE;i++)
{
	for(int j=0;j<M;j++)
	for(int k=0;k<N;k++)
	h_A[i*M*N+j*N+k]=j*N+k;
	
	for(int j=0;j<N;j++)
	for(int k=0;k<K;k++)
	h_B[i*K*N+j*K+k]=j*K+k;	

	for(int j=0;j<M;j++)
	for(int k=0;k<K;k++)
	h_A[i*M*K+j*K+k]=j*K+k;
	
}

    cublasStatus = cublasCreate(&handle);

    const float alpha = 1.0f;
    const float beta = 0.0f;
    const cublasOperation_t transa = CUBLAS_OP_N;
    const cublasOperation_t transb = CUBLAS_OP_N;

 
    #pragma acc enter data copyin(h_A[0:BATCH_SIZE*M*N],h_B[0:BATCH_SIZE*N*K],h_C[0:BATCH_SIZE*M*K])
    #pragma acc host_data use_device(h_A,h_B,h_C)
    cublasStatus = cublasSgemmBatched(handle, transa, transb, K, M, N, &alpha, &h_B, K, &h_A, N, &beta, &h_C, K, BATCH_SIZE);


#pragma acc exit data copyout(h_C[0:BATCH_SIZE*M*K])
  
    for (int i = 0; i < BATCH_SIZE; i++) {
        for (int j = 0; j < M * K; j++) {
            printf("%f ", h_C[i*M*K+j]);
        }
        printf("\n");
    }
    cublasStatus = cublasDestroy(handle);

    return 0;
}

I believe the cublas batched routines are expecting an array of pointers size to the batch, Converting to use a 2D multi-dimensional arrays seems to work as expected:

% cat test2.cpp
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#define M 2
#define N 10
#define K 5
#define BATCH_SIZE 2

int main()
{
    cudaError_t cudaStatus;
    cublasStatus_t cublasStatus;
    cublasHandle_t handle;

    float**h_A=(float**)malloc(BATCH_SIZE*sizeof(float*));
    float**h_B=(float**)malloc(BATCH_SIZE*sizeof(float*));
    float**h_C=(float**)malloc(BATCH_SIZE*sizeof(float*));

for(int i=0;i<BATCH_SIZE;i++)
{
        h_A[i] = (float*) malloc(M * N*sizeof(float));
        h_B[i] = (float*) malloc(N * K*sizeof(float));
        h_C[i] = (float*) malloc(M * K*sizeof(float));

        for(int j=0;j<M;j++)
        for(int k=0;k<N;k++)
        h_A[i][(j*N)+k]=2;

        for(int j=0;j<N;j++)
        for(int k=0;k<K;k++)
        h_B[i][(j*K)+k]=4;

        for(int j=0;j<M;j++)
        for(int k=0;k<K;k++)
        h_C[i][(j*M)+k]=0;

}
    cublasStatus = cublasCreate(&handle);

    const float alpha = 1.0f;
    const float beta = 0.0f;
    const cublasOperation_t transa = CUBLAS_OP_N;
    const cublasOperation_t transb = CUBLAS_OP_N;

    #pragma acc enter data copyin(h_A[0:BATCH_SIZE][:M*N],h_B[0:BATCH_SIZE][:N*K],h_C[0:BATCH_SIZE][:M*K])
    #pragma acc host_data use_device(h_A,h_B,h_C)
    {
    cublasStatus = cublasSgemmBatched(handle, transa, transb, K, M, N, &alpha, h_B, K, h_A, N, &beta, h_C, K, BATCH_SIZE);
    }

#pragma acc exit data copyout(h_C[0:BATCH_SIZE][:M*K]) delete(h_A,h_B)

    for (int i = 0; i < BATCH_SIZE; i++) {
        for (int j = 0; j < M * K; j++) {
            printf("%f ", h_C[i][j]);
        }
        printf("\n");
    }
    cublasStatus = cublasDestroy(handle);

    return 0;
}
% nvc++ -w -acc -cuda -cudalib=cublas test2.cpp ; a.out
80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000
80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000

Hope this helps,
Mat

Thank you Mat, I don’t understand cublas Batch well enough, but from your example, it seems that this is not suitable for one-dimensional Pointers