How to optimize this kernel

Hello All! Can you help please with the next kernel:

[codebox]

#define SHARED

device int FindAndDelete(float *okno, float element, int ArrayLenght, int width)

{

for (int p = 0; p < ArrayLenght; p++)

{

	if (okno[p * width] == element)

	{

		for (int k = p; k < (ArrayLenght - 1); k++)

			okno[k * width] = okno[(k + 1) * width];

		return p;

	}

}

return -1;

//return 0;

}

// Âñòà âëÿåò â óïîðÿäî÷åííûé ïî âîçðà ñòà íèþ ìà ññèâ çà äà ííîé äëèíû ýëåìåíò ñ çà äà ííûì çíà ÷åíèåì

device int InsertWithSort(float *okno, float element, int ArrayLenght, int width)

{

int index1 = 0;

int index2 = ArrayLenght;

while (index1 != index2)

{

	if (okno[((index1 + index2) / 2) * width] > element) 

		index2 = (index1 + index2) / 2;

    else 

		index1 = (index1 + index2) / 2 + 1;

}

// Âñòà âèòü ýëåìåíò â ïîëîæåíèå index1, ñäâèíóâ âñå ñëåäóþùèå ýëåìåíòû â ìà ññèâå íà îäíó ïîçèöèþ

for (int p = ArrayLenght; p > index1; p--)

	okno[p * width] = okno[(p - 1) * width];

okno[index1 * width] = element;

return index1;

}

extern “C” global void Detection(float *signal, float *okno, float *medium, int NumSweeps, int NumSamples, int window_size, int start_index, int end_index)

{

#if defined SHARED

__shared__ float windowmemory[16][208 + 1]; //+1 to eliminate bank conflicts

#else

#endif

int sweep = blockIdx.x * blockDim.x + threadIdx.x;



if (sweep >= NumSweeps) 

	return;

//1. çà ïîëíèòü Îêíî ñ ñîðòèðîâêîé

for (int sample = 0; sample < window_size; sample++)

	#if defined SHARED		

	InsertWithSort(windowmemory[threadIdx.x], signal[sample * NumSweeps + sweep], sample, 1);

	#else

	InsertWithSort(okno + sweep, signal[sample * NumSweeps + sweep], sample, NumSweeps);

	#endif



//2. ïîäñ÷èòà òü ñðåäíåå Îêíà 

float med = 0;

for (int sample = start_index; sample <= end_index; sample++)

	#if defined SHARED		

	med += windowmemory[threadIdx.x][sample];		

	#else

	med += okno[sample * NumSweeps + sweep];		

	#endif

	

medium[(window_size / 2) * NumSweeps + sweep] = med / (end_index - start_index + 1);

//3. ïîäñ÷¸ò ñðåäíåé ïðè äâèæåíèè Îêíà 

for (int sample = window_size; sample < NumSamples; sample++)

{

	//1. óäà ëèòü ñòà ðûé ýëåìåíò

	#if defined SHARED		

	int deleted_index = FindAndDelete(windowmemory[threadIdx.x], signal[(sample - window_size) * NumSweeps + sweep], window_size, 1);

	#else

	int deleted_index = FindAndDelete(okno + sweep, signal[(sample - window_size) * NumSweeps + sweep], window_size, NumSweeps);

	#endif

	//3. âñòà âèòü íîâûé ýëåìåíò

	#if defined SHARED		

	int inserted_index = InsertWithSort(windowmemory[threadIdx.x], signal[sample * NumSweeps + sweep], window_size - 1, 1);

	#else

	int inserted_index = InsertWithSort(okno + sweep, signal[sample * NumSweeps + sweep], window_size - 1, NumSweeps);

	#endif

	

	//4. ïåðåñ÷èòà òü ñðåäíåå

	if (deleted_index < start_index)

	{

		if (inserted_index < start_index) { }

		if (inserted_index >= start_index)

			if (inserted_index <= end_index)

			{

				#if defined SHARED		

				med -= windowmemory[threadIdx.x][(start_index - 1)];

				#else

				med -= okno[(start_index - 1) * NumSweeps + sweep];

				#endif

				med += signal[sample * NumSweeps + sweep];

			}

		if (inserted_index > end_index) 

		{ 

			#if defined SHARED		

			med -= windowmemory[threadIdx.x][(start_index - 1)];

			med += windowmemory[threadIdx.x][end_index];

			#else

			med -= okno[(start_index - 1) * NumSweeps + sweep];

			med += okno[end_index * NumSweeps + sweep];

			#endif

		}

	}

	if (deleted_index >= start_index)

		if (deleted_index <= end_index)

		{

			med -= signal[(sample - window_size) * NumSweeps + sweep];

			if (inserted_index <= start_index) 

				#if defined SHARED		

				med += windowmemory[threadIdx.x][start_index];

				#else

				med += okno[start_index * NumSweeps + sweep];

				#endif

				

			if (inserted_index > start_index)

				if (inserted_index <= end_index)

					med += signal[sample * NumSweeps + sweep];

			if (inserted_index > end_index) 

				#if defined SHARED		

				med += windowmemory[threadIdx.x][end_index];

				#else

				med += okno[end_index * NumSweeps + sweep];

				#endif

		}

	if (deleted_index > end_index)

	{

		if (inserted_index < start_index) 

		{

			#if defined SHARED		

			med -= windowmemory[threadIdx.x][(end_index + 1)];

			med += windowmemory[threadIdx.x][start_index];

			#else				

			med -= okno[(end_index + 1) * NumSweeps + sweep];

			med += okno[start_index * NumSweeps + sweep];

			#endif

		}

		if (inserted_index >= start_index)

			if (inserted_index <= end_index)

			{

				#if defined SHARED		

				med -= windowmemory[threadIdx.x][(end_index + 1)];

				#else

				med -= okno[(end_index + 1) * NumSweeps + sweep];

				#endif

				

				med += signal[sample * NumSweeps + sweep];

			}

		if (inserted_index > end_index) { }

	}

	

	medium[(sample - window_size / 2) * NumSweeps + sweep] = med / (end_index - start_index + 1);

}

//ñðåäíÿÿ äëÿ íà ÷à ëà 

for (int sample = 0; sample < window_size / 2; sample++)

	medium[sample * NumSweeps + sweep] = medium[(window_size / 2) * NumSweeps + sweep];

//ñðåäíÿÿ äëÿ êîíöà 

for (int sample = 0; sample < window_size / 2; sample++)

	medium[(NumSamples - 1 - sample) * NumSweeps + sweep] = medium[(NumSamples - 1 - window_size / 2) * NumSweeps + sweep];

}

[/codebox]

Hello All! Can you help please with the next kernel:

[codebox]

#define SHARED

device int FindAndDelete(float *okno, float element, int ArrayLenght, int width)

{

for (int p = 0; p < ArrayLenght; p++)

{

	if (okno[p * width] == element)

	{

		for (int k = p; k < (ArrayLenght - 1); k++)

			okno[k * width] = okno[(k + 1) * width];

		return p;

	}

}

return -1;

//return 0;

}

// Âñòà âëÿåò â óïîðÿäî÷åííûé ïî âîçðà ñòà íèþ ìà ññèâ çà äà ííîé äëèíû ýëåìåíò ñ çà äà ííûì çíà ÷åíèåì

device int InsertWithSort(float *okno, float element, int ArrayLenght, int width)

{

int index1 = 0;

int index2 = ArrayLenght;

while (index1 != index2)

{

	if (okno[((index1 + index2) / 2) * width] > element) 

		index2 = (index1 + index2) / 2;

    else 

		index1 = (index1 + index2) / 2 + 1;

}

// Âñòà âèòü ýëåìåíò â ïîëîæåíèå index1, ñäâèíóâ âñå ñëåäóþùèå ýëåìåíòû â ìà ññèâå íà îäíó ïîçèöèþ

for (int p = ArrayLenght; p > index1; p--)

	okno[p * width] = okno[(p - 1) * width];

okno[index1 * width] = element;

return index1;

}

extern “C” global void Detection(float *signal, float *okno, float *medium, int NumSweeps, int NumSamples, int window_size, int start_index, int end_index)

{

#if defined SHARED

__shared__ float windowmemory[16][208 + 1]; //+1 to eliminate bank conflicts

#else

#endif

int sweep = blockIdx.x * blockDim.x + threadIdx.x;



if (sweep >= NumSweeps) 

	return;

//1. çà ïîëíèòü Îêíî ñ ñîðòèðîâêîé

for (int sample = 0; sample < window_size; sample++)

	#if defined SHARED		

	InsertWithSort(windowmemory[threadIdx.x], signal[sample * NumSweeps + sweep], sample, 1);

	#else

	InsertWithSort(okno + sweep, signal[sample * NumSweeps + sweep], sample, NumSweeps);

	#endif



//2. ïîäñ÷èòà òü ñðåäíåå Îêíà 

float med = 0;

for (int sample = start_index; sample <= end_index; sample++)

	#if defined SHARED		

	med += windowmemory[threadIdx.x][sample];		

	#else

	med += okno[sample * NumSweeps + sweep];		

	#endif

	

medium[(window_size / 2) * NumSweeps + sweep] = med / (end_index - start_index + 1);

//3. ïîäñ÷¸ò ñðåäíåé ïðè äâèæåíèè Îêíà 

for (int sample = window_size; sample < NumSamples; sample++)

{

	//1. óäà ëèòü ñòà ðûé ýëåìåíò

	#if defined SHARED		

	int deleted_index = FindAndDelete(windowmemory[threadIdx.x], signal[(sample - window_size) * NumSweeps + sweep], window_size, 1);

	#else

	int deleted_index = FindAndDelete(okno + sweep, signal[(sample - window_size) * NumSweeps + sweep], window_size, NumSweeps);

	#endif

	//3. âñòà âèòü íîâûé ýëåìåíò

	#if defined SHARED		

	int inserted_index = InsertWithSort(windowmemory[threadIdx.x], signal[sample * NumSweeps + sweep], window_size - 1, 1);

	#else

	int inserted_index = InsertWithSort(okno + sweep, signal[sample * NumSweeps + sweep], window_size - 1, NumSweeps);

	#endif

	

	//4. ïåðåñ÷èòà òü ñðåäíåå

	if (deleted_index < start_index)

	{

		if (inserted_index < start_index) { }

		if (inserted_index >= start_index)

			if (inserted_index <= end_index)

			{

				#if defined SHARED		

				med -= windowmemory[threadIdx.x][(start_index - 1)];

				#else

				med -= okno[(start_index - 1) * NumSweeps + sweep];

				#endif

				med += signal[sample * NumSweeps + sweep];

			}

		if (inserted_index > end_index) 

		{ 

			#if defined SHARED		

			med -= windowmemory[threadIdx.x][(start_index - 1)];

			med += windowmemory[threadIdx.x][end_index];

			#else

			med -= okno[(start_index - 1) * NumSweeps + sweep];

			med += okno[end_index * NumSweeps + sweep];

			#endif

		}

	}

	if (deleted_index >= start_index)

		if (deleted_index <= end_index)

		{

			med -= signal[(sample - window_size) * NumSweeps + sweep];

			if (inserted_index <= start_index) 

				#if defined SHARED		

				med += windowmemory[threadIdx.x][start_index];

				#else

				med += okno[start_index * NumSweeps + sweep];

				#endif

				

			if (inserted_index > start_index)

				if (inserted_index <= end_index)

					med += signal[sample * NumSweeps + sweep];

			if (inserted_index > end_index) 

				#if defined SHARED		

				med += windowmemory[threadIdx.x][end_index];

				#else

				med += okno[end_index * NumSweeps + sweep];

				#endif

		}

	if (deleted_index > end_index)

	{

		if (inserted_index < start_index) 

		{

			#if defined SHARED		

			med -= windowmemory[threadIdx.x][(end_index + 1)];

			med += windowmemory[threadIdx.x][start_index];

			#else				

			med -= okno[(end_index + 1) * NumSweeps + sweep];

			med += okno[start_index * NumSweeps + sweep];

			#endif

		}

		if (inserted_index >= start_index)

			if (inserted_index <= end_index)

			{

				#if defined SHARED		

				med -= windowmemory[threadIdx.x][(end_index + 1)];

				#else

				med -= okno[(end_index + 1) * NumSweeps + sweep];

				#endif

				

				med += signal[sample * NumSweeps + sweep];

			}

		if (inserted_index > end_index) { }

	}

	

	medium[(sample - window_size / 2) * NumSweeps + sweep] = med / (end_index - start_index + 1);

}

//ñðåäíÿÿ äëÿ íà ÷à ëà 

for (int sample = 0; sample < window_size / 2; sample++)

	medium[sample * NumSweeps + sweep] = medium[(window_size / 2) * NumSweeps + sweep];

//ñðåäíÿÿ äëÿ êîíöà 

for (int sample = 0; sample < window_size / 2; sample++)

	medium[(NumSamples - 1 - sample) * NumSweeps + sweep] = medium[(NumSamples - 1 - window_size / 2) * NumSweeps + sweep];

}

[/codebox]

The task of kernel is to calculate average of signal. Signal is two dimensional array of size (NumSweeps x NumSamples). I need to calculate average for each array ([0][…], [1][…], so on).

Calculating average:

  1. moving window along signal

  2. sorting

  3. calculating average of middle of the window (from start_index to end_index)

So kernel works this way:

  1. place window at the beginning

  2. load signal into window with sorting (InsertWithSort method)

  3. calculate average

  4. store average

  5. cycle for all elements in signal

5.0 move window along signal

5.1 find and delete the oldest element

5.2 insert new element

5.3 recalculate average depending of index of deleteed and inserted element

The task of kernel is to calculate average of signal. Signal is two dimensional array of size (NumSweeps x NumSamples). I need to calculate average for each array ([0][…], [1][…], so on).

Calculating average:

  1. moving window along signal

  2. sorting

  3. calculating average of middle of the window (from start_index to end_index)

So kernel works this way:

  1. place window at the beginning

  2. load signal into window with sorting (InsertWithSort method)

  3. calculate average

  4. store average

  5. cycle for all elements in signal

5.0 move window along signal

5.1 find and delete the oldest element

5.2 insert new element

5.3 recalculate average depending of index of deleteed and inserted element