cublasSgemv ordering problems?

I am trying to use cuBLAS GEMV to compute a matrix-vector operation. To be sure that the concept is correct, I built a version using only the CPU and got the correct results; however, when using the GPU cuBLAS GEMV I get invalid results. I think it may be an ordering (row-major vs. column-major) issue, but I don't know for sure.

The code is listed below. I apologize if it is messy, the project I am working on involves a lot of classes but the source of the problem is the kernel listed in the code. I commented-out the CPU code that works.

If anyone can think of a reason why cublasSgemv gives the wrong answer while the CPU version doesn't, please let me know. Any hint/idea would be greatly appreciated.

/* Solution is held at T->L()  array which is n*n*n in length */
    /* T is a pointer to node class */
    // NOTE(review): this fragment is truncated as posted — several loops below
    // are missing their closing braces, and the cudaMemcpy calls are missing
    // their source argument, so it does not compile verbatim. Comments below
    // flag the concrete defects to fix.

    chebyshev* cheb = T->cheb;
    precomputation* pcomp = T->pcomp;
    int n = cheb->n;                    // Chebyshev order per dimension
    int N = n*n*n;                      // points per node (3-D tensor grid)
    int LIDX = T->which();              // index into precomputed M2L tables
    // NOTE(review): `<<` binds looser than `*`, so this is
    // 1 << ((-lambda) * level). If kern->lambda > 0 the shift count is
    // negative → undefined behavior. Confirm the intended scaling and add
    // parentheses, e.g. 1 << ((-T->kern->lambda) * T->level()) only if the
    // count is provably >= 0.
    float lam = (1<<(-T->kern->lambda)*T->level());
    node *J;

    // A is intended as a row*col dense matrix; 216 = 6^3 interaction slots.
    int row = 216*N, col = N;
    std::vector<float> A(row*col);
    std::vector<float> x(col);          // NOTE(review): only `col` ones; see GEMV note below
    std::vector<float> y(col);

    // Zero-fill A (redundant: std::vector<float> value-initializes to 0.0f).
    for(int i(0); i < row; ++i){
        for(int j(0); j < col; ++j){
            A[col*i + j] = 0.0f;
            // NOTE(review): closing braces lost in the paste.

    // store in ROW-MAJOR order ...
    int I = 0;
    // NOTE(review): `I` is declared but no `++I` is visible in the surviving
    // text — as posted every (l, j) pair writes row 0. Presumably the
    // increment was lost with the truncated braces; confirm.
    for(int l(0); l < N; ++l) {
        x[l] = 1.0f;                    // x is all-ones, so x multiplies out to 1 below
        y[l] = T->L()[l];               // seed y with the current solution (matches beta = 1 later)
        for(int j(0); j < 216; ++j) {
            if(T->interactants()[j].pointer != NULL) {
                J = T->interactants()[j].pointer;
                for(int m(0); m < N; ++m) {
                    float mat = pcomp->matrix_M2L[LIDX][j][m][l];
                    // Fill stride is `col` (row-major, row length col) ...
                    A[col*I + m] = J->M()[m]*mat;

    // COLUMN-MAJOR access using CPU - gives correct results ...
    // NOTE(review): ... but the read stride here is `row`, not `col`. The
    // fill (col*I + m) and this read (row*i + j) only address the same
    // elements if row == col, which is false (row = 216*col). One of the two
    // strides is wrong even though the CPU result looks correct — verify.
    // Also x[i] should almost certainly be x[j] for a true mat-vec; it is
    // only harmless here because x is all ones.
    for(int i(0); i < col; ++i) {
        for(int j(0); j < row; ++j) {
            y[i] += (lam*A[row*i + j]*x[i]);

    for(int i(0); i < N; ++i) {
        T->L()[i] = y[i];   // store back for solution

    float *d_A, *d_x, *d_y;
    cublasHandle_t handle;
    // BUG(review): `handle` is never initialized — cublasCreate(&handle) must
    // be called before cublasSgemv, otherwise the call fails
    // (CUBLAS_STATUS_NOT_INITIALIZED) or behaves unpredictably. Also note no
    // CUDA/cuBLAS return codes are checked anywhere in this fragment.
    int size = row*col;
    // NOTE(review): row*col = 216*N*N can overflow 32-bit int for modest n;
    // prefer size_t for element counts.

    cudaMalloc((void**)&d_A,  size*sizeof(float));
    cudaMalloc((void**)&d_x,  col*sizeof(float));
    cudaMalloc((void**)&d_y,  col*sizeof(float));
    // BUG(review): the source argument is missing from all three copies as
    // posted — presumably A.data(), x.data(), y.data() respectively.
    cudaMemcpy(d_A,, size*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x,, col*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y,, col*sizeof(float), cudaMemcpyHostToDevice);
    float alf = lam, beta = 1.0f;       // beta = 1 accumulates into y, matching the CPU `+=`

    // NOTE(review): `start`/`stop` are used but no cudaEventCreate is visible
    // in this fragment — confirm they are created elsewhere.
    cudaEventRecord(start, 0);

    // call to cublas returns INCORRECT results 
    // BUG(review): dimension mismatch. cuBLAS is column-major; with
    // CUBLAS_OP_T and (m, n) = (col, row), Sgemv reads x[0..m) = x[0..col)
    // but writes y[0..n) = y[0..row) — i.e. 216*N outputs into d_y, which was
    // allocated with only `col` elements: an out-of-bounds device write, and
    // a different operation from the CPU loop (which produces `col` outputs).
    // To reproduce the CPU loop (y[i] = sum_j lam*A[row*i + j], i < col),
    // view the flat buffer as a column-major row x col matrix and call:
    //   cublasSgemv(handle, CUBLAS_OP_T, row, col, &alf, d_A, row,
    //               d_x, 1, &beta, d_y, 1);
    // with d_x holding `row` ones (x must be resized to length row).
    cublasSgemv(handle, CUBLAS_OP_T, col, row, &alf, d_A, col, d_x, 1, &beta, d_y, 1);

    cudaEventRecord(stop, 0);
    // BUG(review): cudaEventElapsedTime requires the stop event to have
    // completed — add cudaEventSynchronize(stop) first, or the timing (and
    // the D2H copy's view of d_y on some paths) is unreliable.
    cudaEventElapsedTime(&curtime, start, stop);
    time += curtime;
    // NOTE(review): no cudaFree / cublasDestroy is visible — if this runs per
    // node it leaks device memory each call.
    cudaMemcpy(T->L(), d_y, col*sizeof(float), cudaMemcpyDeviceToHost);

Edit: Could the cublasSgemv call be returning incorrect values due to a mis-match with C/C++ style array and the fact that cublas follows FORTRAN style? If so, is there examples with how to call cublasSgemv with C-style arrays?