Problem with CUBLAS New problem

Hello,

I’ve just started using CUBLAS, and I’ve already got a question

The following code:

[codebox]

/* Includes, system */

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

/* Includes, cuda */

#include "cublas.h"

/* Main */

/*
 * Builds an N x N identity matrix A and two all-ones vectors b and x on the
 * host, copies them to the GPU with CUBLAS, runs one cublasSgemv call on
 * them, copies x back and prints b and x side by side.
 *
 * argc/argv are unused. Always returns 0; the CUBLAS status codes are stored
 * in `status` but not inspected.
 */
int main(int argc, char** argv) {
    int N = 16;
    int square = N * N;   /* number of floats in the N x N matrix */
    cublasStatus status;
    /* Signed indices: they are compared against the int N and printed with %i. */
    int i, j;

    /* Vectors: b / x are allocated with cublasAlloc, bHost / xHost with malloc */
    float* b;
    float* bHost;
    float* x;
    float* xHost;

    /* Matrices */
    float* A;
    float* AHost;

    bHost = (float*) malloc(N * sizeof(*bHost));
    xHost = (float*) malloc(N * sizeof(*xHost));
    AHost = (float*) malloc(square * sizeof(*AHost));

    /* AHost = identity, bHost = xHost = all ones. */
    for (i = 0; i < N; ++i) {
        for (j = 0; j < N; ++j) {
            AHost[i * N + j] = (i == j) ? 1 : 0;
            /* printf("A[%i, %i] = %f\t", i, j, AHost[i * N + j]); */
        }
        /* printf("\n"); */
        bHost[i] = (float)1;
        xHost[i] = (float)1;
    }

    status = cublasInit();

    /* Element size is sizeof(float) (via *b, *x, *A), not the pointer size;
       cublasAlloc receives the address of each pointer as void**. */
    status = cublasAlloc(N, sizeof(*b), (void**)&b);
    status = cublasAlloc(N, sizeof(*x), (void**)&x);
    status = cublasAlloc(square, sizeof(*A), (void**)&A);

    /* The matrix is uploaded as one flat vector of N * N floats. */
    status = cublasSetVector(square, sizeof(*A), AHost, 1, A, 1);
    status = cublasSetVector(N, sizeof(*b), bHost, 1, b, 1);
    status = cublasSetVector(N, sizeof(*x), xHost, 1, x, 1);

    /* Intended operation: x = 10 * A * b + 9 * x.
       NOTE: the 8th argument (incx, the increment used to step through b) is
       N here. This is the call the thread is about, so it is kept exactly as
       posted. */
    cublasSgemv('n', N, N, 10.0f, A, N, b, N, 9, x, 1);

    /* cublasSaxpy('n', -1, b, 1, x, 1); */

    cudaThreadSynchronize();
    status = cublasGetError();

    status = cublasGetVector(N, sizeof(*x), x, 1, xHost, 1);

    for (i = 0; i < N; ++i) {
        printf("bHost[%i] = %f\txHost[%i] = %f\n", i, bHost[i], i, xHost[i]);
    }

    status = cublasFree(b);
    status = cublasFree(x);
    status = cublasFree(A);
    status = cublasShutdown();

    free(bHost);
    free(xHost);
    free(AHost);

    return 0;
}

[/codebox]

outputs :

[codebox]

bHost[0] = 1.000000 xHost[0] = 19.000000

bHost[1] = 1.000000 xHost[1] = 9.697553

bHost[2] = 1.000000 xHost[2] = 12.984366

bHost[3] = 1.000000 xHost[3] = 10.762107

bHost[4] = 1.000000 xHost[4] = 19.000000

bHost[5] = 1.000000 xHost[5] = 169.000000

bHost[6] = 1.000000 xHost[6] = 329.000000

bHost[7] = 1.000000 xHost[7] = 489.000000

bHost[8] = 1.000000 xHost[8] = 19.000000

bHost[9] = 1.000000 xHost[9] = 9.000000

bHost[10] = 1.000000 xHost[10] = 9.000000

bHost[11] = 1.000000 xHost[11] = 9.000000

bHost[12] = 1.000000 xHost[12] = 9.000000

bHost[13] = 1.000000 xHost[13] = 9.000000

bHost[14] = 1.000000 xHost[14] = 9.000000

bHost[15] = 1.000000 xHost[15] = 9.000000

[/codebox]

All I’m trying to do is compute 10 * Identity * x + 9 * x, where x is a column vector of N ones, and store the result back into x.

Hence, the result should be a vector x with all of its values equal to 19.000, but …

Do you have any idea ?

I use CUDA 3.0 on a Linux x64 box with a 9400

Thanks in advance.

Wrong parameter; try the following:

cublasSgemv('n', N, N, 10.0f, A, N, b, 1, 9.0f, x, 1);

cublasSgemv (char trans, int m, int n, float alpha,

const float *A, int lda, const float *x,

int incx, float beta, float *y, int incy)

Thanks !

I’ve got another dummy question.

I’m now using Saxpy in the following code :

[codebox]/* Includes, system */

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

/* Includes, cuda */

#include "cublas.h"

//#define DEBUG

/* Main */

/*
 * Uploads an all-ones vector into rInit and t on the GPU, copies rInit into r
 * with cublasScopy, prints rInit and r, runs one cublasSaxpy call on t and r,
 * then prints rInit and r again.
 *
 * argc/argv are unused. Always returns 0; the CUBLAS status codes are stored
 * in `status` but not inspected.
 */
int main(int argc, char** argv) {
    int N = 4;
    cublasStatus status;
    /* Signed index: it is compared against the int N and printed with %i. */
    int i;

    /* Vectors: r, rInit and t are allocated with cublasAlloc;
       tmpHost is the single host buffer used for upload and read-back. */
    float* r;
    float* rInit;
    float* t;
    float* tmpHost;

    /* Scalars */
    float omega = 5.0f;

    tmpHost = (float*) malloc(N * sizeof(*tmpHost));

    for (i = 0; i < N; ++i) {
        tmpHost[i] = 1.0f;
    }

    status = cublasInit();

    /* Element size is sizeof(float) (via *r), not the pointer size;
       cublasAlloc receives the address of each pointer as void**. */
    status = cublasAlloc(N, sizeof(*r), (void**)&r);
    status = cublasAlloc(N, sizeof(*rInit), (void**)&rInit);
    status = cublasAlloc(N, sizeof(*t), (void**)&t);

    status = cublasSetVector(N, sizeof(*rInit), tmpHost, 1, rInit, 1);
    status = cublasSetVector(N, sizeof(*t), tmpHost, 1, t, 1);

    /* r = rInit */
    cublasScopy(N, rInit, 1, r, 1);

    /* State before the axpy. */
    cublasGetVector(N, sizeof(*rInit), rInit, 1, tmpHost, 1);
    for (i = 0; i < N; ++i) {
        printf("%s[%i] = %f \n", "rInit", i, tmpHost[i]);
    }

    cublasGetVector(N, sizeof(*rInit), r, 1, tmpHost, 1);
    for (i = 0; i < N; ++i) {
        printf("%s[%i] = %f \n", "r", i, tmpHost[i]);
    }

    /* Intended operation: r = r - omega * t.
       NOTE: the first argument of cublasSaxpy is the element count; the
       character 'n' is passed here instead of N. This is the call the thread
       is about, so it is kept exactly as posted. */
    cublasSaxpy('n', -omega, t, 1, r, 1);

    /* State after the axpy. */
    cublasGetVector(N, sizeof(*rInit), rInit, 1, tmpHost, 1);
    for (i = 0; i < N; ++i) {
        printf("%s[%i] = %f \n", "rInit", i, tmpHost[i]);
    }

    cublasGetVector(N, sizeof(*r), r, 1, tmpHost, 1);
    for (i = 0; i < N; ++i) {
        printf("%s[%i] = %f \n", "r", i, tmpHost[i]);
    }

    status = cublasFree(r);
    status = cublasFree(rInit);
    status = cublasFree(t);
    status = cublasShutdown();

    free(tmpHost);

    return 0;
}

[/codebox]

which produces :

[codebox]

rInit[0] = 1.000000

rInit[1] = 1.000000

rInit[2] = 1.000000

rInit[3] = 1.000000

r[0] = 1.000000

r[1] = 1.000000

r[2] = 1.000000

r[3] = 1.000000

rInit[0] = -27.184694

rInit[1] = 11.722600

rInit[2] = 5.718123

rInit[3] = 15.436247

r[0] = -4.000000

r[1] = -4.000000

r[2] = -4.000000

r[3] = -4.000000

[/codebox]

The ‘r’ vector is correct (r = r - 5.0f * t), but why does this operation change ‘rInit’?

I only do a copy at the beginning (‘rInit → r’), that’s all; I don’t touch rInit afterwards.

Thanks in advance

Wrong parameter; try:

cublasSaxpy(N, -omega, t, 1, r, 1);

cublasSaxpy (int n, float alpha, const float *x,

int incx, float *y, int incy)

The character 'n' has the value 110, which means that you perform the axpy on 110 elements instead of N.

Logically speaking, you should get a segmentation fault, but nothing happens

because a CUDA memory allocation is rounded up to a basic chunk size; I believe the chunk size is 256 bytes.

That is why you get no segmentation fault: the out-of-range writes land in memory that is still allocated, which is how ‘rInit’ gets overwritten.