I am trying to execute the CUBLAS TRSM function (AX = B, solving for X, with the left-side, lower-triangular, non-transposed, unit-diagonal combination) in CUDA version 4.2. But no matter what I try, the output remains the same as the input matrix B. I’d appreciate it if anyone could help me figure it out. Here is my program:

```
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Fill `data[0 .. size-1]` with pseudo-random whole numbers stored as floats.
// Each value is rand()/RAND_MAX scaled by 10 and truncated toward zero,
// so every entry lies in the range 0..10. Used for the right-hand side B.
void randomInit(float* data, int size)
{
    int idx = 0;
    while (idx < size) {
        // One rand() draw per element, in index order.
        float unit = rand() / (float)RAND_MAX;
        int digit = (int) (unit * 10);
        data[idx] = digit;
        ++idx;
    }
}
// Fill the n x n matrix `data` (element (row, col) lives at data[row*n + col])
// as a unit lower-triangular matrix:
//   - entries below the diagonal get a pseudo-random whole number in 0..10,
//   - the diagonal is 1,
//   - entries above the diagonal are 0.
void randomLowerTriangle(float* data, int n)
{
    for (int row = 0; row < n; ++row) {
        float* rowStart = data + (row * n);

        // Strictly-lower part: one rand() draw per entry, left to right.
        for (int col = 0; col < row; ++col) {
            float unit = rand() / (float) RAND_MAX;
            int digit = (int) (unit * 10);
            rowStart[col] = digit;
        }

        // Unit diagonal.
        rowStart[row] = 1;

        // Strictly-upper part is zero.
        for (int col = row + 1; col < n; ++col) {
            rowStart[col] = 0;
        }
    }
}
// Abort with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Abort with a readable message when a cuBLAS call fails.
static void checkCublas(cublasStatus_t status, const char* what)
{
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cuBLAS error during %s (status %d)\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}

// Print a rows x cols host matrix stored row-major, one matrix row per line.
static void printMatrix(const char* title, const float* data, int rows, int cols)
{
    printf("%s[%d X %d]:\n", title, rows, cols);
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++)
            printf("%.3f\t", data[r * cols + c]);
        printf("\n");
    }
}

// Solve A * X = B with cuBLAS TRSM, where A is an m x m unit lower-triangular
// matrix and B is m x n, both generated on the host in ROW-MAJOR order.
// Prints A, B and the solution X. Returns 0 on success; exits with
// EXIT_FAILURE if any allocation, CUDA call or cuBLAS call fails.
int main(int argc , char **argv)
{
    (void)argc;
    (void)argv;

    const int m = 5, n = 5;
    const float alpha = 1.0f;

    const size_t size_A = (size_t)m * m;
    const size_t mem_size_A = sizeof(float) * size_A;
    const size_t size_B = (size_t)m * n;
    const size_t mem_size_B = sizeof(float) * size_B;

    float* h_A = (float*)malloc(mem_size_A);
    float* h_B = (float*)malloc(mem_size_B);
    float* h_CUBLAS = (float*)malloc(mem_size_B);
    if (h_A == NULL || h_B == NULL || h_CUBLAS == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_B);
        free(h_CUBLAS);
        return EXIT_FAILURE;
    }

    printf("Assigning random numbers to A and B\n");
    randomLowerTriangle(h_A, m);
    randomInit(h_B, (int)size_B);

    // A is square (m x m); B is m x n.
    printMatrix("Matrix A", h_A, m, m);
    printMatrix("Matrix B", h_B, m, n);

    // allocate device memory
    float* d_A = NULL;
    float* d_B = NULL;
    printf("Allocating memory on device for A and B\n");
    checkCuda(cudaMalloc((void**) &d_A, mem_size_A), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**) &d_B, mem_size_B), "cudaMalloc(d_B)");
    checkCuda(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice), "copy B to device");

    cublasHandle_t handle;
    checkCublas(cublasCreate_v2(&handle), "cublasCreate");

    // cuBLAS interprets every matrix as COLUMN-major, while the host buffers
    // above are row-major. Seen through cuBLAS' eyes:
    //   * the m x m row-major lower-triangular A is A^T, an UPPER-triangular matrix
    //     with leading dimension m;
    //   * the m x n row-major B is B^T, an n x m matrix with leading dimension n.
    // Passing SIDE_LEFT / FILL_MODE_LOWER therefore makes cuBLAS read only the
    // (all-zero) strict upper part of the row-major A plus the implied unit
    // diagonal, i.e. the identity, which is why X came back equal to B.
    // Transposing A * X = B gives X^T * A^T = B^T, so the equivalent
    // column-major call is a RIGHT-side solve with the UPPER triangle on an
    // n x m right-hand side. The result overwrites d_B as X^T column-major,
    // which is exactly X in row-major order.
    checkCublas(cublasStrsm_v2(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                               CUBLAS_DIAG_UNIT, n, m, &alpha, d_A, m, d_B, n),
                "cublasStrsm");

    // Surface any asynchronous execution error before reading the result back.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    checkCuda(cudaMemcpy(h_CUBLAS, d_B, mem_size_B, cudaMemcpyDeviceToHost), "copy X to host");

    printMatrix("Matrix X (CUBLAS)", h_CUBLAS, m, n);

    free(h_A);
    free(h_B);
    free(h_CUBLAS);
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCublas(cublasDestroy_v2(handle), "cublasDestroy");
    checkCuda(cudaDeviceReset(), "cudaDeviceReset");
    return 0;
}
```