CUDA time improvement

I am writing CUDA code, but my algorithm is not very parallel. I am comparing MATLAB vs. CUDA performance, and MATLAB comes out about three times faster.

Would you mind looking through the code and advising how I might improve it? My task is to prove that CUDA is faster than MATLAB, so it would be great if this CUDA code outperformed MATLAB.

Thank you

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#define number 13
int *getGeneratedQ_MARK();
float * getGeneratedR_MARK(float *CoefficientMatrix);

__global__ void SIC(float *coefficients, float *R_Real, float *R_Imag, float *r_Mark_Real, float *r_Mark_Imag, int whichNumber) {
	// Thread 0 handles the real part, thread 1 the imaginary part.
	// With a <<<1, 2>>> launch, blockIdx.x is always 0, so the index must come from threadIdx.x.
	int index = threadIdx.x;
	float mark[1] = { 0.0f };
	float sumMarkCoefficient[1] = { 0.0f };	// must start at zero; it was previously read uninitialized
	if (index == 0)
	{
		// Start at number - 1: coefficients has only `number` elements.
		for (int i = number - 1; i >= whichNumber; i--)
		{
			mark[0] = (*r_Mark_Real - sumMarkCoefficient[0]) / (*R_Real*coefficients[i]);

			if (mark[0]>0)
				mark[0] = 1;
			else
				mark[0] = -1;

			if (i != whichNumber)
			{
				sumMarkCoefficient[0] = sumMarkCoefficient[0] + (*R_Real*mark[0] * coefficients[i]);
			}
		}
		*r_Mark_Real = mark[0];
	}
	else
	{
		for (int i = number - 1; i >= whichNumber; i--)
		{
			mark[0] = (*r_Mark_Imag - sumMarkCoefficient[0]) / (*R_Imag*coefficients[i]);

			if (mark[0]>0)
				mark[0] = 1;
			else
				mark[0] = -1;

			if (i != whichNumber)
			{
				sumMarkCoefficient[0] = sumMarkCoefficient[0] + (*R_Imag*mark[0] * coefficients[i]);
			}
		}
		*r_Mark_Imag = mark[0];
	}
}
int main(void) {
	float CoefficientMatrix[13] = {0.01f, 0.02f, 0.03f, 0.04f, 0.06f, 0.07f, 0.08f, 0.09f, 0.1f, 0.11f, 0.12f, 0.13f, 0.14f};
	int *Q_mark;
	float *R_Mark;

	Q_mark = getGeneratedQ_MARK();
	R_Mark = getGeneratedR_MARK(CoefficientMatrix);

	float markWithCoefficient[number * 2];
	for (int i = 0; i<number; i++) {
		markWithCoefficient[i] = Q_mark[i] * sqrt(CoefficientMatrix[i]);
		markWithCoefficient[i + number] = Q_mark[i + number] * sqrt(CoefficientMatrix[i]);
	}

	float superMARKReal = 0;
	float superMARKImag = 0;

	for (int i = 0; i < number; i++) {
		superMARKReal = superMARKReal + markWithCoefficient[i];
		superMARKImag = superMARKImag + markWithCoefficient[i + number];
	}

	float receivedMark[number * 2];
	for (int i = 0; i < number; i++) {
		receivedMark[i] = superMARKReal*R_Mark[i];
		receivedMark[i + number] = superMARKImag*R_Mark[i + number];
	}

	float  *dev_CoefficientMatrix;
	float *dev_RMarkReal;
	float *dev_RMarkImag;
	float *dev_receivedMarkReal;
	float *dev_receivedMarkImag;

	cudaMalloc((void**)&dev_CoefficientMatrix, number * sizeof(float));
	cudaMemcpy(dev_CoefficientMatrix, CoefficientMatrix, number * sizeof(float), cudaMemcpyHostToDevice);
	//capacity calculation start
	cudaEvent_t start, stop;
	float elapsedTime;
	// Allocate the single-float device buffers once, before the loop
	// (allocating them on every iteration leaks device memory and adds overhead).
	cudaMalloc((void**)&dev_RMarkReal, sizeof(float));
	cudaMalloc((void**)&dev_RMarkImag, sizeof(float));
	cudaMalloc((void**)&dev_receivedMarkReal, sizeof(float));
	cudaMalloc((void**)&dev_receivedMarkImag, sizeof(float));

	for (int i = 0; i < number; i++) {

		cudaMemcpy(dev_RMarkReal, &R_Mark[i], sizeof(float), cudaMemcpyHostToDevice);
		cudaMemcpy(dev_RMarkImag, &R_Mark[i + number], sizeof(float), cudaMemcpyHostToDevice);
		cudaMemcpy(dev_receivedMarkReal, &receivedMark[i], sizeof(float), cudaMemcpyHostToDevice);
		cudaMemcpy(dev_receivedMarkImag, &receivedMark[i + number], sizeof(float), cudaMemcpyHostToDevice);

		cudaEventCreate(&start);
		cudaEventCreate(&stop);
		cudaEventRecord(start, 0);

		SIC<<<1, 2>>>(dev_CoefficientMatrix, dev_RMarkReal, dev_RMarkImag, dev_receivedMarkReal, dev_receivedMarkImag, i);

		cudaMemcpy(&receivedMark[i], dev_receivedMarkReal, sizeof(float), cudaMemcpyDeviceToHost);
		cudaMemcpy(&receivedMark[i + number], dev_receivedMarkImag, sizeof(float), cudaMemcpyDeviceToHost);

		cudaEventRecord(stop, 0);
		cudaEventSynchronize(stop);

		cudaEventElapsedTime(&elapsedTime, start, stop);
		printf("\n %d Time to generate: %.16f ms", i, elapsedTime);
	}

	cudaFree(dev_CoefficientMatrix);
	cudaFree(dev_RMarkReal);
	cudaFree(dev_RMarkImag);
	cudaFree(dev_receivedMarkReal);
	cudaFree(dev_receivedMarkImag);
	printf("\n");
}

int *getGeneratedQ_MARK() {
	int real = 0;
	int imag = 0;
	static int q_mark[number * 2];

	for (int i = 0; i < number; i++) {

		real = rand() % 19 + (-9);
		imag = rand() % 19 + (-9);

		if (real > 0)
			real = 1;
		else
			real = -1;

		if (imag > 0)
			imag = 1;
		else
			imag = -1;

		q_mark[i] = real;
		q_mark[i + number] = imag;
	}
	return q_mark;
}

float *getGeneratedR_MARK(float *CoefficientMatrix) {
	static float r_mark[number * 2];

	int real = 0;
	int imag = 0;
	//srand(time(NULL));
	srand((unsigned int)time(0));
	real = rand() % 19 + (-9);
	imag = rand() % 19 + (-9);

	if (real < 0)
		real = (-1)*real;

	if (imag < 0)
		imag = (-1)*imag;

	for (int i = 0; i < number; i++) {
		r_mark[i] = CoefficientMatrix[i] * 10 * sqrt(0.5f)*real;
		r_mark[i + number] = CoefficientMatrix[i] * 10 * sqrt(0.5f)*imag;
	}

	return r_mark;
}

You’ll never get anything to run fast on the GPU this way:

SIC<<<1, 2>>> ...

The two most important principles for writing fast CUDA code are:

  1. Use enough threads to keep the machine busy
  2. Make efficient use of the memory subsystems (e.g. coalesced access)

You have violated principle number 1.

You want to aim for efficiently using something like 10,000 threads in a CUDA kernel call. Your kernel call uses 2 threads.
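Your problem only has 13 coefficients, so there is very little independent work to spread across threads. As a rough illustration (not your algorithm, just the launch pattern), a kernel that keeps the machine busy assigns one thread per data element and launches many blocks of many threads; adjacent threads then also touch adjacent elements, which gives coalesced access (principle 2):

#include "cuda_runtime.h"
#include <stdio.h>

// Illustration only: one thread per element.
__global__ void scale(float *data, float factor, int n) {
	int idx = blockIdx.x * blockDim.x + threadIdx.x;	// global thread index
	if (idx < n)
		data[idx] *= factor;	// adjacent threads access adjacent elements (coalesced)
}

int main(void) {
	const int n = 1 << 20;	// ~1M elements
	float *dev_data;
	cudaMalloc((void**)&dev_data, n * sizeof(float));
	cudaMemset(dev_data, 0, n * sizeof(float));

	int threads = 256;
	int blocks = (n + threads - 1) / threads;	// ~4096 blocks -> ~1M threads
	scale<<<blocks, threads>>>(dev_data, 2.0f, n);
	cudaDeviceSynchronize();

	cudaFree(dev_data);
	return 0;
}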

Thank you, txbob.

My question is how to break my task into parallel subtasks so that I can launch more than two threads.

Can I break the global task into 13 or 26 parallel subtasks?
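For example, is something like the sketch below the right direction? The names (SIC_all, dev_R, dev_receivedMark) are just placeholders for my existing buffers, and it reuses the same #define number 13 from my program; one thread would handle one (coefficient, real/imaginary) pair, so 26 threads in total, even though the loop inside each thread stays sequential:

// Rough sketch only: one thread per (coefficient, real/imag) pair -> 2 * number = 26 threads.
// R and r_Mark are laid out as [real 0..12, imag 0..12], like R_Mark and receivedMark above.
__global__ void SIC_all(float *coefficients, float *R, float *r_Mark) {
	int i = threadIdx.x % number;	// which coefficient (0..12)
	int part = threadIdx.x / number;	// 0 = real, 1 = imaginary
	int idx = part * number + i;

	float mark = 0.0f;
	float sum = 0.0f;
	for (int k = number - 1; k >= i; k--) {	// still sequential inside each thread
		mark = (r_Mark[idx] - sum) / (R[idx] * coefficients[k]);
		mark = (mark > 0.0f) ? 1.0f : -1.0f;
		if (k != i)
			sum += R[idx] * mark * coefficients[k];
	}
	r_Mark[idx] = mark;
}

// Launched once instead of 13 times:
//   SIC_all<<<1, 2 * number>>>(dev_CoefficientMatrix, dev_R, dev_receivedMark);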

Thank you