device memory not allocated error in emu mode using cublas

I am running the beginner cublas program.





[b]#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cublas.h>

#define M 6
#define N 5

#define IDX2C(i,j,ld) (((j)*(ld))+(i))

/*
 * Scale two strided runs of the column-major matrix m (leading dimension
 * ldm), both beginning at element (p, q):
 *   - n - p elements spaced ldm apart are multiplied by alpha;
 *   - ldm - p consecutive elements are multiplied by beta.
 * The start address is formed with pointer arithmetic only; m itself is
 * never read here, it is just forwarded to cublasSscal.
 */
void modify (float *m, int ldm, int n, int p, int q, float alpha,float beta)
{
    float *start = m + IDX2C(p, q, ldm);

    cublasSscal(n - p, alpha, start, ldm);
    cublasSscal(ldm - p, beta, start, 1);
}

/*
 * Builds an M x N column-major matrix on the host, copies it to the device,
 * scales part of it with modify(), copies the result back and prints it.
 *
 * Returns 0 on success, 1 if host allocation, CUBLAS initialisation, device
 * allocation or either matrix transfer fails.
 */
int main(int argc, char *argv[])
{
    int i, j;
    int rc = 1;
    cublasStatus stat;
    float *devPtrA = 0;
    float *a = 0;

    (void)argc;
    (void)argv;

    /* The cast is kept on purpose: .cu files are compiled as C++ by nvcc. */
    a = (float *)malloc (M * N * sizeof (*a));
    if (!a) {
        printf ("host memory allocation failed");
        return 1;
    }
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            a[IDX2C(i,j,M)] = (float)(i * M + j + 1);
        }
    }

    /* Check initialisation explicitly: if it fails (e.g. the wrong CUBLAS
     * library was linked) every later call fails too, and the first visible
     * symptom would otherwise be a misleading allocation error. */
    stat = cublasInit();
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed");
        free (a);
        return 1;
    }

    /* Element size is sizeof(*a) (a float), not sizeof(a) (a pointer). */
    stat = cublasAlloc (M * N, sizeof(*a), (void **)&devPtrA);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("device memory allocation failed");
        goto out_shutdown;
    }

    stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed");
        goto out_free_device;
    }

    modify (devPtrA, M, N, 1, 2, 16.0f, 12.0f);

    stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data upload failed");
        goto out_free_device;
    }

    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            printf ("%7.0f", a[IDX2C(i,j,M)]);
        }
        printf ("\n");
    }
    rc = 0;

out_free_device:
    cublasFree (devPtrA);
out_shutdown:
    cublasShutdown();
    free (a);
    return rc;
}
[/b]





I am compiling using the command
nvcc -deviceemu -o modify modify.cu -lcublas

When I run it, I get the error
device memory not allocated…

Any suggestions?

You need to link with the CUBLAS emulation library (libcublasemu). I am pretty sure if you check the return status of the cublasInit() call, it will be returning an error.

You should be aware that while your code will probably work OK in emulation mode, the indexing into the device array inside your modify() function will fail if you try compiling it and running it on a real GPU.

Yes, the cublasInit() call also failed.

Can you tell me how to link with the CUBLAS emulation library?

And why does my indexing fail on a real GPU?

nvcc -deviceemu -o modify modify.cu -lcublasemu
/*
 * Scales two runs of the matrix m, both starting at index IDX2C(p,q,ldm):
 * one of n-p elements with stride ldm (factor alpha) and one of ldm-p
 * elements with stride 1 (factor beta).
 *
 * Note: `&m[k]` is equivalent to `m + k`; it computes an address and does
 * not itself read through m. The resulting pointer is only passed on to
 * cublasSscal.
 */
void modify (float *m, int ldm, int n, int p, int q, float alpha,float beta)

{

/* n-p elements, ldm apart, multiplied by alpha */
cublasSscal (n-p, alpha, &m[IDX2C(p,q,ldm)], ldm);

/* ldm-p contiguous elements multiplied by beta */
cublasSscal (ldm-p, beta, &m[IDX2C(p,q,ldm)], 1);

}

When you run this on a device, m is a pointer in device memory, not host memory, so the pointer dereferencing in the two cublasSscal calls will fail.