I have one row major matrix of size M*N and want to apply dgeam subroutine of cublas to this matrix to have one column major ordered matrix. Yet when i print the transposed matrix, i see that a resultant matrix is produced whose each element is 0 and I could not find out what i am doing wrong and why each entry of the matrix is assigned to 0.

```
cublasHandle_t handle;
cublasStatus_t status;
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", status, __LINE__);
exit(EXIT_FAILURE);
}
/* Transpose of matrix V , because it is row rank */
int M = 8 ;
int N = 12;
unsigned int size_V=M*N;
unsigned int mem_size_V=sizeof(double)*size_V;
double* h_V;
h_V=(double*)malloc(mem_size_V);
for(int i=0; i<M; i++){
for(int j= 0; j<N; j++){
//row major
h_V[j*n+i] = i*j;
}
}
double* d_V;
CudaSafeCall(cudaMalloc((void**) &d_V, mem_size_V));
const double alf = 1.0;
const double bet = 0.0;
const double *alpha = &alf;
const double *beta = &bet;
double* clone;
clone=(double*)malloc(mem_size_V);
double* clone_d ;
CudaSafeCall(cudaMalloc((void**) &clone_d, mem_size_V));
CudaSafeCall(cudaMemcpy(clone_d, clone, mem_size_V, cudaMemcpyHostToDevice));
dim3 grid(1,1,1);
dim3 block(16,1,1);
gpuCopy<<<grid, block>>>(clone_d,d_V,M,N);
CudaSafeCall(cudaMemcpy(clone, clone_d, mem_size_V, cudaMemcpyDeviceToHost));
// copy matrix is correct
for(int b; b<10; b++)
std::cout << clone[b] << '\t' << std::endl;
CudaCheckError();
CudaSafeCall(cudaMemcpy(clone_d, clone, mem_size_V, cudaMemcpyHostToDevice));
CublasSafeCall(cublasDgeam( handle, CUBLAS_OP_T, CUBLAS_OP_N, M, N, alpha, clone_d, N, beta, clone_d, M, d_V, M));
CudaSafeCall(cudaMemcpy(h_V, d_V, mem_size_V, cudaMemcpyDeviceToHost));
// each entry in tranposed matrix is 0
for(int b; b<10; b++)
std::cout << h_V[b] << '\t' << std::endl;
CudaCheckError();
cudaDeviceSynchronize();
```