I tried multiply 2 matrix in GPU memory with this code:
test=(float*)malloc(Ne*Ne*sizeof(test[0]));
initmatrix(test,Ne,3.0); //initialize first matrix with 3 each element
cudaMallocPitch((void**)&d_wmas, &pitch, Ne*Ne*sizeof(float),Nl);
cublasAlloc(Ne*Ne,sizeof(d_test[0]),(void**)&d_test);
cublasSetVector(Ne*Ne,sizeof(d_wmas[0]),test,1,d_wmas,1);
initmatrix(test,Ne,2.0);
cublasSetVector(Ne*Ne,sizeof(test[0]),test,1,*(&d_wmas+Ne*Ne),1);
cudaThreadSynchronize();
cublasSgemm('n','n',Ne,Ne,Ne,1.0,d_wmas,Ne,*(&d_wmas+Ne*Ne),Ne,0.0,d_test,Ne);
status = cublasGetError();
  if (status != CUBLAS_STATUS_SUCCESS) {
    fprintf (stderr, "!!!! kernel execution error.\n");
 Â
  }
cudaThreadSynchronize();
cublasGetVector(Ne*Ne,sizeof(d_test[0]),d_test,1,test,1);
cudaThreadSynchronize();
And I got just second matrix returned in the test array. Am I doing something wrong? I checked that both first matrix writes in *d_wmas and second in (&d_wmas+NeNe) correct.