Dear Cuda programmers.
I’ve just started my way with developing in Cuda and have stuck in a simple task.
Task: Write a Cuda code that will convert a square matrix into an upper triangular form. The same task in OpenMp went really smooth (thanks to my teammate)
#include <stdlib.h>
#include <omp.h>
#include “utp-2.h”
/*
- The second version parallizes both of the inner loops
*/
void upper_triangle_2 ( double **A, int N, int num_of_threads )
{
int i, j, k;
double ratio;
omp_set_num_threads(num_of_threads);
for ( i = 0; i < N-1; i++ )
{
#pragma omp parallel for private(k, ratio) shared(N)
for ( j = i+1; j < N; j++ )
{
ratio = A[j][i] / A[i][i];
#pragma omp parallel for
for ( k = i; k < N; k++ )
{
A[j][k] -= ratio * A[i][k];
}
}
}
}
But I’ve stuck in Cuda version of this task.
#include “…/…/lib/book.h”
#define N (100)
global void matrixtransform(double **MatA){
int i,j,k;
double ratio =1.0;
int ix = threadIdx.x + blockIdx.x * blockDim.x;
int iy = blockIdx.y * blockDim.y + threadIdx.y;
for (i=0; i<N-1; i++){
for (j =i+1; j<N; j++){
ratio= MatA[j+ix][i*iy]/MatA[i*iy][i*iy];
__syncthreads();
for (k=i; k < N; k++){
MatA[j][k]-=ratio*MatA[i][k];
__syncthreads();
}
}
}
}
int main (int argc, char *argv){
double a[N][N];
double **dev_a;
float elapsedTime;
printf("--- ADAS Upper Triangular Matrix Converter - Cuda Version---\n");
// allocate the memory on the CPU
// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * N * sizeof(double) ) );
// fill the arrays 'a' and 'b' on the CPU
//mat_rand(a, N, N);
for (int i=1; i<N; i++){
for(int k=1; k<N; k++ )
a[i][k] = k*1.0;
}
// copy the array 'a' to the GPU
HANDLE_ERROR( cudaMemcpy( dev_a, a, N * N* sizeof(double), cudaMemcpyHostToDevice ) );
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//matrixtransform<<<3,10>>>(dev_a);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("elapsed time = %g msec\n", elapsedTime);
// free the memory allocated on the GPU
HANDLE_ERROR( cudaFree( dev_a ) );
//HANDLE_ERROR( cudaFree( dev_b ) );
return 0;
}
When I compile the code with nvcc: nvcc -o filename filename.cu -lpthtread
I get following error: Cannot tell what pointer points to, assuming global memory space
Any ideas what should i do?
Thank you very much in advance