I’ve actually tried something similar on a small scale source code that does exactly that, and it seemed to work:
/* Includes, system */
#include "stdafx.h"
#include "nsim_matrix.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas.h>
/* Matrix size */
#define N (5)
/* Main */
int main(int argc, char** argv)
{
cublasStatus status;
cudaError state;
float* h_A;
float* h_B;
float* h_C;
float* h_result;
float* h_C_ref;
float* d_A = 0;
float* d_B = 0;
float* d_C = 0;
float* d_result = 0;
float alpha = 1.0f;
float beta = 0.0f;
int n2 = N * N;
int i;
float error_norm;
float ref_norm;
float diff;
/* Initialize CUBLAS */
printf("simpleCUBLAS test running..\n");
status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
/* Allocate host memory for the matrices */
h_A = (float*)malloc(n2 * sizeof(h_A[0]));
if (h_A == 0) {
fprintf (stderr, "!!!! host memory allocation error (A)\n");
return EXIT_FAILURE;
}
h_B = (float*)malloc(n2 * sizeof(h_B[0]));
if (h_B == 0) {
fprintf (stderr, "!!!! host memory allocation error (B)\n");
return EXIT_FAILURE;
}
h_C = (float*)malloc(N * sizeof(h_C[0]));
if (h_C == 0) {
fprintf (stderr, "!!!! host memory allocation error (C)\n");
return EXIT_FAILURE;
}
h_result = (float*) malloc(N * sizeof(h_result[0]));
if (h_result == 0) {
fprintf (stderr, "!!!! host memory allocation error (result)\n");
return EXIT_FAILURE;
}
/* Fill the matrices with test data */
for (i = 0; i < n2; i++) {
h_A[i] = rand() % 10 + 1;
h_B[i] = rand() % 10 + 1;
}
for (i = 0; i < N; i++)
h_C[i] = rand() % 10 + 1;
/* Allocate device memory for the matrices */
state = cudaMalloc((void**)&d_A, n2*sizeof(d_A[0]));
if (state != cudaSuccess) {
fprintf (stderr, "!!!! device memory allocation error (A)\n");
return EXIT_FAILURE;
}
state = cudaMalloc((void**)&d_B, n2*sizeof(d_B[0]));
if (state != cudaSuccess) {
fprintf (stderr, "!!!! device memory allocation error (B)\n");
return EXIT_FAILURE;
}
state = cudaMalloc((void**)&d_C, N*sizeof(d_C[0]));
if (state != cudaSuccess) {
fprintf (stderr, "!!!! device memory allocation error (C)\n");
return EXIT_FAILURE;
}
state = cudaMalloc((void**)&d_result, N*sizeof(d_result[0]));
if (state != cudaSuccess) {
fprintf (stderr, "!!!! device memory allocation error (result)\n");
return EXIT_FAILURE;
}
/* Initialize the device matrices with the host matrices */
state = cudaMemcpy(d_A, h_A, n2*sizeof(h_A[0]), cudaMemcpyHostToDevice);
if (state != cudaSuccess) {
fprintf (stderr, "!!!! device access error (write A)\n");
return EXIT_FAILURE;
}
state = cudaMemcpy(d_B, h_B, n2*sizeof(h_B[0]), cudaMemcpyHostToDevice);
if (state != cudaSuccess) {
fprintf (stderr, "!!!! device access error (write B)\n");
return EXIT_FAILURE;
}
state = cudaMemcpy(d_C, h_C, N*sizeof(h_C[0]), cudaMemcpyHostToDevice);
if (state != cudaSuccess) {
fprintf (stderr, "!!!! device access error (write C)\n");
return EXIT_FAILURE;
}
/* Performs operation using plain C code */
//simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
sgemv('n', N, 1, 1.0, &h_A[3*N], N, h_C, 1, 0.0, h_result, 1);
h_C_ref = h_C;
/* Clear last error */
cublasGetError();
/* Performs operation using cublas */
cublasSgemv('n', N, 1, 1.0, &d_A[3*N], N, d_C, 1, 0.0, d_result, 1);
//cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);
status = cublasGetError();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
/* Allocate host memory for reading back the result from device memory */
h_C = (float*)malloc(N * sizeof(h_C[0]));
if (h_C == 0) {
fprintf (stderr, "!!!! host memory allocation error (C)\n");
return EXIT_FAILURE;
}
/* Read the result back */
state = cudaMemcpy(h_C, d_result, N*sizeof(h_C[0]), cudaMemcpyDeviceToHost);
if (state != cudaSuccess) {
fprintf (stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
/* Memory clean up */
free(h_A);
free(h_B);
free(h_C);
free(h_result);
free(h_C_ref);
state = cudaFree(d_A);
if (status != cudaSuccess) {
fprintf (stderr, "!!!! memory free error (A)\n");
return EXIT_FAILURE;
}
state = cudaFree(d_B);
if (status != cudaSuccess) {
fprintf (stderr, "!!!! memory free error (B)\n");
return EXIT_FAILURE;
}
state = cudaFree(d_C);
if (status != cudaSuccess) {
fprintf (stderr, "!!!! memory free error (C)\n");
return EXIT_FAILURE;
}
state = cudaFree(d_result);
if (status != cudaSuccess) {
fprintf (stderr, "!!!! memory free error (result)\n");
return EXIT_FAILURE;
}
/* Shutdown */
status = cublasShutdown();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! shutdown error (A)\n");
return EXIT_FAILURE;
}
if (argc > 1) {
if (!strcmp(argv[1], "-noprompt") ||
!strcmp(argv[1], "-qatest") )
{
return EXIT_SUCCESS;
}
}
else
{
printf("\nPress ENTER to exit...\n");
getchar();
}
return EXIT_SUCCESS;
}
The problem could be something else, but at least the answer has been found experimentally.