I am writing CUDA code, however my algorithm is not much parallel. I am comparing MATLAB vs CUDA performance and the former one results about three times faster.
Would you mind to look through the code and advice how may I improve my code? My task is to probe CUDA is faster than MATLAB. So it would be great if this CUDA code outperforms MATLAB.
Thank you
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define number 13
int *getGeneratedQ_MARK();
float * getGeneratedR_MARK(float *CoefficientMatrix);
__global__ void SIC(float *coefficients, float *R_Real, float *R_Imag, float *r_Mark_Real, float *r_Mark_Imag, int whichNumber) {
int index = blockIdx.x;
float mark[1];
float sumMarkCoefficient[1];
if (index == 0)
{
for (int i = number; i >= whichNumber; i--)
{
mark[0] = (*r_Mark_Real - sumMarkCoefficient[0]) / (*R_Real*coefficients[i]);
if (mark[0]>0)
mark[0] = 1;
else
mark[0] = -1;
if (i != whichNumber)
{
sumMarkCoefficient[0] = sumMarkCoefficient[0] + (*R_Real*mark[0] * coefficients[i]);
}
}
*r_Mark_Real = mark[0];
}
else
{
for (int i = number; i >= whichNumber; i--)
{
mark[0] = (*r_Mark_Imag - sumMarkCoefficient[0]) / (*R_Imag*coefficients[i]);
if (mark[0]>0)
mark[0] = 1;
else
mark[0] = -1;
if (i != whichNumber)
{
sumMarkCoefficient[0] = sumMarkCoefficient[0] + (*R_Imag*mark[0] * coefficients[i]);
}
}
*r_Mark_Imag = mark[0];
}
}
int main(void) {
float CoefficientMatrix[13] = {0.01f, 0.02f, 0.03f, 0.04f, 0.06f, 0.07f, 0.08f, 0.09f, 0.1f, 0.11f, 0.12f, 0.13f, 0.14f};
int *Q_mark;
float *R_Mark;
Q_mark = getGeneratedQ_MARK();
R_Mark = getGeneratedR_MARK(CoefficientMatrix);
float markWithCoefficient[number * 2];
for (int i = 0; i<number; i++) {
markWithCoefficient[i] = Q_mark[i] * sqrt(CoefficientMatrix[i]);
markWithCoefficient[i + number] = Q_mark[i + number] * sqrt(CoefficientMatrix[i]);
}
float superMARKReal = 0;
float superMARKImag = 0;
for (int i = 0; i < number; i++) {
superMARKReal = superMARKReal + markWithCoefficient[i];
superMARKImag = superMARKImag + markWithCoefficient[i + number];
}
float receivedMark[number * 2];
for (int i = 0; i < number; i++) {
receivedMark[i] = superMARKReal*R_Mark[i];
receivedMark[i + number] = superMARKImag*R_Mark[i + number];
}
float *dev_CoefficientMatrix;
float *dev_RMarkReal;
float *dev_RMarkImag;
float *dev_receivedMarkReal;
float *dev_receivedMarkImag;
cudaMalloc((void**)&dev_CoefficientMatrix, number * sizeof(float));
cudaMemcpy(dev_CoefficientMatrix, CoefficientMatrix, number * sizeof(float), cudaMemcpyHostToDevice);
//capacity calculation start
cudaEvent_t start, stop;
float elapsedTime;
for (int i = 0; i<number; i++){
cudaMalloc((void**)&dev_RMarkReal, sizeof(float));
cudaMalloc((void**)&dev_RMarkImag, sizeof(float));
cudaMalloc((void**)&dev_receivedMarkReal, sizeof(float));
cudaMalloc((void**)&dev_receivedMarkImag, sizeof(float));
cudaMemcpy(dev_RMarkReal, &R_Mark[i], sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_RMarkImag, &R_Mark[i + number], sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_receivedMarkReal, &receivedMark[i], sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_receivedMarkImag, &receivedMark[i + number], sizeof(float), cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
SIC << <1, 2 >> > (dev_CoefficientMatrix, dev_RMarkReal, dev_RMarkImag, dev_receivedMarkReal, dev_receivedMarkImag, i);
cudaMemcpy(&receivedMark[i], dev_receivedMarkReal, sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(&receivedMark[i + number], dev_receivedMarkImag, sizeof(float), cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("\n %d Time to generate: %.16f ms", i, elapsedTime);
}
cudaFree(dev_CoefficientMatrix);
cudaFree(dev_RMarkReal);
cudaFree(dev_RMarkImag);
cudaFree(dev_receivedMarkReal);
cudaFree(dev_receivedMarkImag);
printf("\n");
}
int *getGeneratedQ_MARK() {
int real = 0;
int imag = 0;
static int q_mark[number * 2];
for (int i = 0; i < number; i++) {
real = rand() % 19 + (-9);
imag = rand() % 19 + (-9);
if (real > 0)
real = 1;
else
real = -1;
if (imag > 0)
imag = 1;
else
imag = -1;
q_mark[i] = real;
q_mark[i + number] = imag;
}
return q_mark;
}
float *getGeneratedR_MARK(float *CoefficientMatrix) {
static float r_mark[number * 2];
int real = 0;
int imag = 0;
//srand(time(NULL));
srand((unsigned int)time(0));
real = rand() % 19 + (-9);
imag = rand() % 19 + (-9);
if (real < 0)
real = (-1)*real;
if (imag < 0)
imag = (-1)*imag;
for (int i = 0; i < number; i++) {
r_mark[i] = CoefficientMatrix[i] * 10 * sqrt(0.5f)*real;
r_mark[i + number] = CoefficientMatrix[i] * 10 * sqrt(0.5f)*imag;
}
return r_mark;
}