I am new to CUDA and I am trying to write power iteration method for calculating dominant eigen vector in CUDA and here is my approach
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#define N 256
#define THREADS_PER_BLOCK 256
__global__ void matrix_mul(float *d_A , float *d_x)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < N)
{
float sum = 0.0f;
#pragma unroll
for (int j = 0; j < N; j++)
{
sum += d_A[idx * N + j] * d_x[j];
__syncthreads();
}
d_x[idx] = sum;
__syncthreads();
}
}
__global__ void normalize(float *d_x)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if(idx<N)
{
d_x[idx] = d_x[idx] / d_x[N-1];
__syncthreads();
}
}
__global__ void eigenvalue(float *d_x , float *d_temp , float *d_result)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
__syncthreads();
if (idx == 0 )
{
float numerator = 0.0f;
float denominator = 0.0f;
#pragma unroll
for (int i = 0; i < N; i++)
{
numerator += (d_temp[i] * d_x[i]);
denominator += (d_x[i] * d_x[i]);
}
*d_result = numerator/denominator;
}
}
__global__ void copy(float *d_x , float *d_temp)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if(idx<N)
{
d_temp[idx] = d_x[idx];
__syncthreads();
}
}
void power_iteration(float *h_A , float *h_x)
{
float *d_A ;
cudaMalloc((void **)&d_A, N * N * sizeof(float));
float *d_x , *d_temp;
cudaMalloc((void **)&d_x, N * sizeof(float));
cudaMalloc((void **)&d_temp, N * sizeof(float));
cudaMemcpy(d_A, h_A, N*N*sizeof(float) , cudaMemcpyHostToDevice);
cudaMemcpy(d_x, h_x, N*sizeof(float) , cudaMemcpyHostToDevice);
cudaMemcpy(d_temp, h_x, N*sizeof(float) , cudaMemcpyHostToDevice);
float *d_result;
cudaMalloc((void **)&d_result, sizeof(float));
int numBlocks = (N+THREADS_PER_BLOCK-1) / (THREADS_PER_BLOCK);
int numThreads= (THREADS_PER_BLOCK);
float tolerance = 1e-4;
int iterations = 0;
float old = 0.0f;
float error = 0.0f;
float *h_result = (float *) malloc(sizeof(float));
while(error>tolerance || iterations ==0)
{
iterations++;
matrix_mul<<<numBlocks, numThreads>>>(d_A,d_x);
normalize<<<numBlocks, numThreads>>>(d_x);
eigenvalue<<<numBlocks, numThreads>>>(d_x , d_temp , d_result);
cudaMemcpy(h_result , d_result , sizeof(float) , cudaMemcpyDeviceToHost);
error = *h_result-old;
old = *h_result;
copy<<<numBlocks, numThreads>>>(d_x,d_temp);
}
printf("%d\n" ,iterations);
cudaDeviceSynchronize();
cudaMemcpy(h_x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost);
for(int i = 0 ; i<N ; i++)
{
printf("%f ", h_x[i]);
}
printf("\n");
cudaFree(d_x);
cudaFree(d_temp);
cudaFree(d_result);
free(h_result);
cudaFree(d_A);
}
int main() {
float *h_A;
h_A = (float *)malloc(N*N*sizeof(float));
for(int i = 0 ; i<N ; i++)
{
for(int j =0 ; j< N ; j++)
h_A[i*N+j] = (float) i/N;
}
float *h_x;
h_x = (float *)malloc(N*sizeof(float));
for (int i = 0; i < N; ++i) {
h_x[i] = 1.0;
}
power_iteration(h_A , h_x);
free(h_x);
free(h_A);
return 0;
}
Here I have divided the procedure into several kernels and launched them , this code works perfect for N<=256 but for N > 256 say 257 it fails , I am unable to find why can any one help