I tried using OpenACC to call the cublasSgemmBatched function from cuBLAS, but no matter how many times I tried, I always got this error:
Failing in Thread: 1
Call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
However, the same code compiles and runs correctly with the NVCC compiler, so I suspect my function call is incorrect — but after many attempts I still cannot get it to work.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define M 2
#define N 10
#define K 5
#define BATCH_SIZE 2
int main()
{
cudaError_t cudaStatus;
cublasStatus_t cublasStatus;
cublasHandle_t handle;
float*h_A=(float*)malloc(BATCH_SIZE*M * N*sizeof(float));
float*h_B=(float*)malloc(BATCH_SIZE*N * K*sizeof(float));
float*h_C=(float*)malloc(BATCH_SIZE*M * K*sizeof(float));
for(int i=0;i<BATCH_SIZE;i++)
{
for(int j=0;j<M;j++)
for(int k=0;k<N;k++)
h_A[i*M*N+j*N+k]=j*N+k;
for(int j=0;j<N;j++)
for(int k=0;k<K;k++)
h_B[i*K*N+j*K+k]=j*K+k;
for(int j=0;j<M;j++)
for(int k=0;k<K;k++)
h_A[i*M*K+j*K+k]=j*K+k;
}
cublasStatus = cublasCreate(&handle);
const float alpha = 1.0f;
const float beta = 0.0f;
const cublasOperation_t transa = CUBLAS_OP_N;
const cublasOperation_t transb = CUBLAS_OP_N;
#pragma acc enter data copyin(h_A[0:BATCH_SIZE*M*N],h_B[0:BATCH_SIZE*N*K],h_C[0:BATCH_SIZE*M*K])
#pragma acc host_data use_device(h_A,h_B,h_C)
cublasStatus = cublasSgemmBatched(handle, transa, transb, K, M, N, &alpha, &h_B, K, &h_A, N, &beta, &h_C, K, BATCH_SIZE);
#pragma acc exit data copyout(h_C[0:BATCH_SIZE*M*K])
for (int i = 0; i < BATCH_SIZE; i++) {
for (int j = 0; j < M * K; j++) {
printf("%f ", h_C[i*M*K+j]);
}
printf("\n");
}
cublasStatus = cublasDestroy(handle);
return 0;
}
I believe the cuBLAS batched routines expect an array of pointers sized to the batch. Converting the code to use 2D multi-dimensional arrays works as expected:
% cat test2.cpp
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define M 2
#define N 10
#define K 5
#define BATCH_SIZE 2
/*
 * Batched SGEMM via OpenACC + cuBLAS using per-batch pointer arrays.
 *
 * cublasSgemmBatched expects, for each of A/B/C, a device-resident array of
 * BATCH_SIZE device matrix pointers. Allocating h_A/h_B/h_C as float** and
 * shaping them in the OpenACC data clauses lets the runtime build exactly
 * that: `host_data use_device` then yields the device pointer-array
 * addresses the batched call needs.
 *
 * cuBLAS is column-major; the row-major C(MxK) = A(MxN) * B(NxK) is done as
 * C^T = B^T * A^T, i.e. dims (K, M, N) with B passed first.
 */
int main()
{
    cublasStatus_t cublasStatus;
    cublasHandle_t handle;

    /* One host pointer array per operand, sized to the batch. */
    float **h_A = (float**)malloc(BATCH_SIZE * sizeof(float*));
    float **h_B = (float**)malloc(BATCH_SIZE * sizeof(float*));
    float **h_C = (float**)malloc(BATCH_SIZE * sizeof(float*));

    for (int i = 0; i < BATCH_SIZE; i++) {
        h_A[i] = (float*)malloc(M * N * sizeof(float));
        h_B[i] = (float*)malloc(N * K * sizeof(float));
        h_C[i] = (float*)malloc(M * K * sizeof(float));
        for (int j = 0; j < M; j++)
            for (int k = 0; k < N; k++)
                h_A[i][j*N + k] = 2;
        for (int j = 0; j < N; j++)
            for (int k = 0; k < K; k++)
                h_B[i][j*K + k] = 4;
        /* Row stride of C is K, not M — the original `j*M+k` left elements
         * 7..9 of each C matrix uninitialized (masked only because beta=0). */
        for (int j = 0; j < M; j++)
            for (int k = 0; k < K; k++)
                h_C[i][j*K + k] = 0;
    }

    cublasStatus = cublasCreate(&handle);

    const float alpha = 1.0f;
    const float beta  = 0.0f;
    const cublasOperation_t transa = CUBLAS_OP_N;
    const cublasOperation_t transb = CUBLAS_OP_N;

#pragma acc enter data copyin(h_A[0:BATCH_SIZE][:M*N], h_B[0:BATCH_SIZE][:N*K], h_C[0:BATCH_SIZE][:M*K])
#pragma acc host_data use_device(h_A, h_B, h_C)
    {
        /* B first: computes C^T = B^T * A^T in column-major terms. */
        cublasStatus = cublasSgemmBatched(handle, transa, transb, K, M, N,
                                          &alpha, h_B, K, h_A, N,
                                          &beta, h_C, K, BATCH_SIZE);
    }
#pragma acc exit data copyout(h_C[0:BATCH_SIZE][:M*K]) delete(h_A, h_B)

    for (int i = 0; i < BATCH_SIZE; i++) {
        for (int j = 0; j < M * K; j++)
            printf("%f ", h_C[i][j]);
        printf("\n");
    }

    cublasStatus = cublasDestroy(handle);

    /* Release host storage (the original leaked every allocation). */
    for (int i = 0; i < BATCH_SIZE; i++) {
        free(h_A[i]);
        free(h_B[i]);
        free(h_C[i]);
    }
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
% nvc++ -w -acc -cuda -cudalib=cublas test2.cpp ; a.out
80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000
80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000 80.000000
Hope this helps,
Mat
Thank you Mat. I don’t understand the cuBLAS batched routines well enough yet, but from your example it seems they are not suitable for plain one-dimensional pointers.