Hello,
I’ve just started using CUBLAS, and I already have a question.
The following code:
[codebox]
/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Includes, cuda */
#include <cuda_runtime.h>
#include "cublas.h"
/* Main */
int main(int argc, char** argv) {
int N = 16;
int square = N * N;
cublasStatus status;
unsigned int i, j;
/* Vectors */
float* b;
float* bHost;
float* x;
float* xHost;
/* Matrices */
float* A;
float* AHost;
bHost = (float*) malloc(N * sizeof(*bHost));
xHost = (float*) malloc(N * sizeof(*xHost));
AHost = (float*) malloc(square * sizeof(*AHost));
for (i = 0; i < N; ++i) {
for(j = 0; j < N; ++j) {
if(i == j) {
AHost[i * N + j] = 1;
} else {
AHost[i * N + j] = 0;
}
/* printf("A[%i, %i] = %f\t", i, j, AHost[i * N + j]); */
}
/* printf("\n"); */
bHost[i] = (float)1;
xHost[i] = (float)1;
}
status = cublasInit();
status = cublasAlloc(N, sizeof(B), (void*)&b);
status = cublasAlloc(N, sizeof(*x), (void**)&x);
status = cublasAlloc(square, sizeof(*A), (void**)&A);
status = cublasSetVector(square, sizeof(*A), AHost, 1, A, 1);
status = cublasSetVector(N, sizeof(*B), bHost, 1, b, 1);
status = cublasSetVector(N, sizeof(*x), xHost, 1, x, 1);
cublasSgemv('n', N, N, 10.0f, A, N, b, N, 9, x, 1);
/* cublasSaxpy('n', -1, b, 1, x, 1); */
cudaThreadSynchronize();
status = cublasGetError();
status = cublasGetVector(N, sizeof(*x), x, 1, xHost, 1);
for (i = 0; i < N; ++i) {
printf("bHost[%i] = %f\txHost[%i] = %f\n", i, bHost[i], i, xHost[i]);
}
status = cublasFree(B);
status = cublasFree(x);
status = cublasFree(A);
status = cublasShutdown();
free(bHost);
free(xHost);
free(AHost);
return 0;
}
[/codebox]
outputs:
[codebox]
bHost[0] = 1.000000 xHost[0] = 19.000000
bHost[1] = 1.000000 xHost[1] = 9.697553
bHost[2] = 1.000000 xHost[2] = 12.984366
bHost[3] = 1.000000 xHost[3] = 10.762107
bHost[4] = 1.000000 xHost[4] = 19.000000
bHost[5] = 1.000000 xHost[5] = 169.000000
bHost[6] = 1.000000 xHost[6] = 329.000000
bHost[7] = 1.000000 xHost[7] = 489.000000
bHost[8] = 1.000000 xHost[8] = 19.000000
bHost[9] = 1.000000 xHost[9] = 9.000000
bHost[10] = 1.000000 xHost[10] = 9.000000
bHost[11] = 1.000000 xHost[11] = 9.000000
bHost[12] = 1.000000 xHost[12] = 9.000000
bHost[13] = 1.000000 xHost[13] = 9.000000
bHost[14] = 1.000000 xHost[14] = 9.000000
bHost[15] = 1.000000 xHost[15] = 9.000000
[/codebox]
All I’m trying to do is compute 10 * Identity * x + 9 * x, where x is a column vector of N ones, and store that result back into x.
Hence, the result should be a vector x with all its values equal to 19.000, but, as the output above shows, it is not.
Do you have any idea what is going wrong?
I use CUDA 3.0 on a Linux x64 box with a 9400 GPU.
Thanks in advance.