Problem with CUBLAS New problem

qsdqsd12 · July 3, 2010, 11:31am

Hello,

I’ve just started using CUBLAS, and I’ve already got a question

The following code:

[codebox]

/* Includes, system */

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

/* Includes, cuda */

#include "cublas.h"

/* Main */

/*
 * Demo program from the question.
 *
 * Builds an N x N identity matrix and two all-ones vectors of length N on the
 * host, uploads them with the legacy cuBLAS helpers, issues one cublasSgemv
 * call that writes into x, downloads x and prints it next to bHost.
 *
 * The cublasSgemv arguments are intentionally left exactly as posted: that
 * call is what the thread is about and it is what produced the listed output.
 *
 * Returns 0 unconditionally; `status` is overwritten by every call and is
 * never inspected, so failures of individual cuBLAS calls go unnoticed.
 */
int main(int argc, char** argv) {
    int N = 16;
    int square = N * N;
    cublasStatus status;
    unsigned int i, j;

    /* Vectors (device pointer / host buffer pairs) */
    float* b;
    float* bHost;
    float* x;
    float* xHost;

    /* Matrices (device pointer / host buffer pair) */
    float* A;
    float* AHost;

    bHost = (float*) malloc(N * sizeof(*bHost));
    xHost = (float*) malloc(N * sizeof(*xHost));
    AHost = (float*) malloc(square * sizeof(*AHost));

    /* Host initialisation: A = identity, b = x = all ones. */
    for (i = 0; i < N; ++i) {
        for (j = 0; j < N; ++j) {
            if (i == j) {
                AHost[i * N + j] = 1;
            } else {
                AHost[i * N + j] = 0;
            }
            /* printf("A[%i, %i] = %f\t", i, j, AHost[i * N + j]); */
        }
        /* printf("\n"); */
        bHost[i] = (float)1;
        xHost[i] = (float)1;
    }

    status = cublasInit();

    /* Device allocations: N floats for b and x, N*N floats for A. */
    status = cublasAlloc(N, sizeof(*b), (void**)&b);
    status = cublasAlloc(N, sizeof(*x), (void**)&x);
    status = cublasAlloc(square, sizeof(*A), (void**)&A);

    /* Host -> device copies; A is transferred as one flat vector of N*N floats. */
    status = cublasSetVector(square, sizeof(*A), AHost, 1, A, 1);
    status = cublasSetVector(N, sizeof(*b), bHost, 1, b, 1);
    status = cublasSetVector(N, sizeof(*x), xHost, 1, x, 1);

    /*
     * Argument order is (trans, m, n, alpha, A, lda, x, incx, beta, y, incy),
     * so this call passes alpha = 10, lda = N, input vector b with incx = N,
     * beta = 9 and output vector x with incy = 1.
     *
     * NOTE(review): incx = N makes the routine step through b in strides of N
     * elements although only N floats were allocated for it. The reply in
     * this thread changes the call to (..., b, 1, 9.0f, x, 1).
     */
    cublasSgemv('n', N, N, 10.0f, A, N, b, N, 9, x, 1);

    /* cublasSaxpy('n', -1, b, 1, x, 1); */

    cudaThreadSynchronize();

    /* The value fetched here is overwritten by the next call without being read. */
    status = cublasGetError();

    /* Device -> host copy of the result vector. */
    status = cublasGetVector(N, sizeof(*x), x, 1, xHost, 1);

    for (i = 0; i < N; ++i) {
        printf("bHost[%i] = %f\txHost[%i] = %f\n", i, bHost[i], i, xHost[i]);
    }

    /* Release device memory, shut cuBLAS down, then release host memory. */
    status = cublasFree(b);
    status = cublasFree(x);
    status = cublasFree(A);
    status = cublasShutdown();

    free(bHost);
    free(xHost);
    free(AHost);

    return 0;
}

[/codebox]

outputs :

[codebox]

bHost[0] = 1.000000 xHost[0] = 19.000000

bHost[1] = 1.000000 xHost[1] = 9.697553

bHost[2] = 1.000000 xHost[2] = 12.984366

bHost[3] = 1.000000 xHost[3] = 10.762107

bHost[4] = 1.000000 xHost[4] = 19.000000

bHost[5] = 1.000000 xHost[5] = 169.000000

bHost[6] = 1.000000 xHost[6] = 329.000000

bHost[7] = 1.000000 xHost[7] = 489.000000

bHost[8] = 1.000000 xHost[8] = 19.000000

bHost[9] = 1.000000 xHost[9] = 9.000000

bHost[10] = 1.000000 xHost[10] = 9.000000

bHost[11] = 1.000000 xHost[11] = 9.000000

bHost[12] = 1.000000 xHost[12] = 9.000000

bHost[13] = 1.000000 xHost[13] = 9.000000

bHost[14] = 1.000000 xHost[14] = 9.000000

bHost[15] = 1.000000 xHost[15] = 9.000000

[/codebox]

All I’m trying to do is compute 10 * Identity * x + 9 * x, where x is a column vector of N ones, and assign that result to x.

Hence, the result should be a vector x with all its values equal to 19.000, but …

Do you have any idea ?

I use CUDA 3.0 on a Linux x64 box with a 9400

Thanks in advance.

LSChien · July 3, 2010, 2:44pm

Wrong parameter; try the following:

cublasSgemv('n', N, N, 10.0f, A, N, b, 1, 9.0f, x, 1);

cublasSgemv (char trans, int m, int n, float alpha,

const float *A, int lda, const float *x,

int incx, float beta, float *y, int incy)

qsdqsd12 · July 3, 2010, 3:03pm

Wrong parameter; try the following:

cublasSgemv('n', N, N, 10.0f, A, N, b, 1, 9.0f, x, 1);
cublasSgemv (char trans, int m, int n, float alpha,

const float *A, int lda, const float *x,

int incx, float beta, float *y, int incy)

Thanks !

qsdqsd12 · July 4, 2010, 1:23pm

I’ve got another dummy question.

I’m now using Saxpy in the following code :

[codebox]/* Includes, system */

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

/* Includes, cuda */

#include "cublas.h"

//#define DEBUG

/* Main */

/*
 * Second demo program from the thread.
 *
 * Uploads an all-ones vector of length N into rInit and t, copies rInit into
 * r with cublasScopy, prints rInit and r, issues one cublasSaxpy call on
 * (t, r), then prints rInit and r again.
 *
 * The cublasSaxpy arguments are intentionally left exactly as posted: that
 * call is what the follow-up question is about and it is what produced the
 * listed output.
 *
 * Returns 0 unconditionally; `status` is overwritten by every call and is
 * never inspected.
 */
int main(int argc, char** argv) {
    int N = 4;
    cublasStatus status;
    unsigned int i, j;
    unsigned int n;

    /* Vectors: r, rInit and t are device pointers; tmpHost is a host staging buffer. */
    float* r;
    float* rInit;
    float* t;
    float* tmpHost;

    /* Scalars */
    float omega = 5.0f;

    tmpHost = (float*) malloc(N * sizeof(*tmpHost));

    for (i = 0; i < N; ++i) {
        tmpHost[i] = 1.0f;
    }

    status = cublasInit();

    /* Device allocations: N floats each. */
    status = cublasAlloc(N, sizeof(*r), (void**)&r);
    status = cublasAlloc(N, sizeof(*rInit), (void**)&rInit);
    status = cublasAlloc(N, sizeof(*t), (void**)&t);

    /* Upload the all-ones host buffer into both rInit and t. */
    status = cublasSetVector(N, sizeof(*rInit), tmpHost, 1, rInit, 1);
    status = cublasSetVector(N, sizeof(*t), tmpHost, 1, t, 1);

    /* r <- rInit (device-to-device copy of N elements). */
    cublasScopy(N, rInit, 1, r, 1);

    /* Print both vectors before the saxpy; tmpHost is reused for each download. */
    cublasGetVector(N, sizeof(*rInit), rInit, 1, tmpHost, 1);
    for (i = 0; i < N; ++i) {
        printf("%s[%i] = %f \n", "rInit", i, tmpHost[i]);
    }

    cublasGetVector(N, sizeof(*rInit), r, 1, tmpHost, 1);
    for (i = 0; i < N; ++i) {
        printf("%s[%i] = %f \n", "r", i, tmpHost[i]);
    }

    /*
     * Argument order is (n, alpha, x, incx, y, incy).
     *
     * NOTE(review): the first argument is the element count, but the
     * character constant 'n' is passed here, so the count is that
     * character's integer value rather than N. The reply in this thread
     * changes the call to cublasSaxpy(N, -omega, t, 1, r, 1).
     */
    cublasSaxpy('n', -omega, t, 1, r, 1);

    /* Print both vectors again after the saxpy. */
    cublasGetVector(N, sizeof(*rInit), rInit, 1, tmpHost, 1);
    for (i = 0; i < N; ++i) {
        printf("%s[%i] = %f \n", "rInit", i, tmpHost[i]);
    }

    cublasGetVector(N, sizeof(*r), r, 1, tmpHost, 1);
    for (i = 0; i < N; ++i) {
        printf("%s[%i] = %f \n", "r", i, tmpHost[i]);
    }

    /* Release device memory, shut cuBLAS down, then release host memory. */
    status = cublasFree(r);
    status = cublasFree(rInit);
    status = cublasFree(t);
    status = cublasShutdown();

    free(tmpHost);

    return 0;
}

[/codebox]

which produces :

[codebox]

rInit[0] = 1.000000

rInit[1] = 1.000000

rInit[2] = 1.000000

rInit[3] = 1.000000

r[0] = 1.000000

r[1] = 1.000000

r[2] = 1.000000

r[3] = 1.000000

rInit[0] = -27.184694

rInit[1] = 11.722600

rInit[2] = 5.718123

rInit[3] = 15.436247

r[0] = -4.000000

r[1] = -4.000000

r[2] = -4.000000

r[3] = -4.000000

[/codebox]

The ‘r’ vector is correct (r = r - 5.0f * t), but why does this operation change ‘rInit’?

I just do a copy at the beginning (‘rInit → r’), that’s all; I don’t touch rInit afterwards.

Thanks in advance

LSChien · July 4, 2010, 2:51pm

Wrong parameter; try:

cublasSaxpy(N, -omega, t, 1, r, 1);

cublasSaxpy (int n, float alpha, const float *x,

int incx, float *y, int incy)

The character 'n' has the ASCII value 110, which means that you do axpy for 110 elements.

Logically speaking, you should get a segmentation fault, but nothing happens

because when you do a CUDA memory allocation, it allocates in units of a basic chunk size; I believe the chunk size should be 256 bytes.

That is why you get no segmentation fault.

Topic		Replies	Views
CUBLAS issues Some simple question about CUBLAS CUDA Programming and Performance	1	1267	August 22, 2011
Incorrect resut when using cublasSgemm and cublasSaxpy together CUDA Programming and Performance	1	519	March 29, 2018
Cublas saxpy error nvc, nvc++ and nvfortran	20	1329	October 21, 2021
Questions about cublasSaxpy CUDA Programming and Performance	15	5131	June 26, 2009
cublas - cublasSgemm - problem CUDA Programming and Performance	2	2124	March 16, 2010
cublas matrix-vector problem CUDA Programming and Performance	1	3069	May 15, 2009
Matrix Multiplication by cublasSgemm CUDA Programming and Performance	1	7522	March 26, 2010
beginner CUBLAS Sgemm question CUDA Programming and Performance	2	1688	March 9, 2010
The use of CUBLAS. Using CUBLAS in simple C / C++ code. CUDA Programming and Performance	7	26106	March 7, 2008
a cublas problem CUDA Programming and Performance	4	3473	August 3, 2011

Problem with CUBLAS New problem

Related topics