I wrote a signal-processing program that correlates an input signal of 3 million points with a filter vector of 1024 points; the output is a vector of roughly 3 million points.
For example: output[0] = input[0] * conj(filter[0]) + input[1] * conj(filter[1]) + … + input[1023] * conj(filter[1023])
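To pin down the arithmetic, here is a plain-C sketch of a single output sample exactly as the kernel below computes it (the helper name reference_sample is only for illustration; float2 comes from the CUDA headers):

/* Reference for one output sample: conjugate correlation over 1024 taps,
   normalized by the scaled power of the input window. */
float reference_sample(const float2 *input, const float2 *filter, int n)
{
    float re = 0.0f, im = 0.0f, power = 0.0f;
    for (int k = 0; k < 1024; k++)
    {
        /* (a+bi) * conj(c+di) = (ac+bd) + (bc-ad)i */
        re    += input[n + k].x * filter[k].x + input[n + k].y * filter[k].y;
        im    += input[n + k].y * filter[k].x - input[n + k].x * filter[k].y;
        power += input[n + k].x * input[n + k].x + input[n + k].y * input[n + k].y;
    }
    return (re * re + im * im) / (power * 16.32f);
}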
The kernel is launched 8 times. The input data are all single-precision complex numbers (float2). The program is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include <complex.h>
#define N 3000000
#define BLOCK_SIZE 1024
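/* Wall-clock timestamp in microseconds (via gettimeofday). */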
long cpuSecond()
{
struct timeval tp;
gettimeofday(&tp,NULL);
return((tp.tv_sec * 1000000) + tp.tv_usec);
}
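/* One thread per output sample: a 1024-tap conjugate correlation,
   normalized by the scaled power of the 1024-sample input window. */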
__global__ void auto_correlation(float2 *input, float2 *filter, float *output, int iLen)
{
int iThreadId = blockIdx.x * blockDim.x + threadIdx.x;
int iIdx = 0;
float2 sum1 = make_float2(0.0f, 0.0f); /* accumulators must be zero-initialized */
float sum2 = 0.0f;
if (iThreadId < iLen)
{
for (iIdx = 0; iIdx < 1024; iIdx++)
{
/* (a+bi) * conj(c+di) = (a+bi)*(c-di) = (ac+bd) +(bc-ad)i */
sum1.x += input[iThreadId + iIdx].x * filter[iIdx].x + input[iThreadId + iIdx].y * filter[iIdx].y;
sum1.y += input[iThreadId + iIdx].y * filter[iIdx].x - input[iThreadId + iIdx].x * filter[iIdx].y;
sum2 += input[iThreadId + iIdx].x * input[iThreadId + iIdx].x + input[iThreadId + iIdx].y * input[iThreadId + iIdx].y;
}
sum2 = sum2 * 16.32f; /* float literal avoids a double-precision multiply on the device */
output[iThreadId] = (sum1.x * sum1.x +sum1.y * sum1.y) / sum2;
}
}
int main(int argc, char *argv[])
{
int i = 0;
int dev = 0;
long exec_start = 0;
long exec_end = 0;
float2 *h_input;
float2 *h_filter;
float2 *d_input;
float2 *d_filter;
float *result;
float *d_result;
size_t size_x = N * sizeof(float2);
size_t size_y = BLOCK_SIZE * sizeof(float2);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
cudaSetDevice(dev);
cudaMalloc((void **)&d_input, size_x);
cudaMalloc((void **)&d_filter, size_y);
cudaMalloc((void **)&d_result, N * sizeof(float));
h_input = (float2*)malloc(size_x);
h_filter = (float2*)malloc(size_y);
result = (float*)malloc(N * sizeof(float));
for (i = 0; i < N; i++)
{
h_input[i].x = ((1000 + i) % 10000) / 100; /* integer division: whole-numbered test values 0..99 */
h_input[i].y = ((1000 + i) % 10000) / 100;
}
for (i = 0; i < BLOCK_SIZE; i++)
{
h_filter[i].x = ((1000 + i) % 10000)/100;
h_filter[i].y = ((1000 + i) % 10000)/100;
}
exec_start = cpuSecond();
int block = 1024;
int grid = (N + block - 1) / block; /* = 2930, enough blocks to cover all N samples */
cudaMemcpy(d_input, h_input, size_x, cudaMemcpyHostToDevice);
cudaMemcpy(d_filter, h_filter, size_y, cudaMemcpyHostToDevice);
for (i = 0; i < 8; i++)
{
auto_correlation<<<grid, block>>>(d_input, d_filter, d_result, N - 1023);
cudaDeviceSynchronize();
}
//cudaMemcpy(result, d_result, N * sizeof(float), cudaMemcpyDeviceToHost);
exec_end = cpuSecond();
cudaFree(d_input);
cudaFree(d_filter);
cudaFree(d_result);
free(h_input);
free(h_filter);
free(result);
cudaDeviceReset(); /* reset the device only after all CUDA resources have been freed */
printf("GPU time use %ld us\n", (exec_end - exec_start));
return 0;
}
This program takes about 0.7 seconds to execute (the timed region includes the two host-to-device copies and the eight kernel launches). Is there a way to optimize the program and shorten the execution time?
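For instance, since every thread reads the same 1024 filter taps, I was wondering whether caching the filter in __constant__ memory would help. A sketch of what I mean (untested; the kernel name and layout are my own):

__constant__ float2 c_filter[BLOCK_SIZE];

/* Host side: copy the taps once with
   cudaMemcpyToSymbol(c_filter, h_filter, BLOCK_SIZE * sizeof(float2)); */

__global__ void auto_correlation_const(float2 *input, float *output, int iLen)
{
    int iThreadId = blockIdx.x * blockDim.x + threadIdx.x;
    float2 sum1 = make_float2(0.0f, 0.0f);
    float sum2 = 0.0f;
    if (iThreadId < iLen)
    {
        for (int iIdx = 0; iIdx < 1024; iIdx++)
        {
            float2 in = input[iThreadId + iIdx]; /* load once, reuse for all three accumulators */
            float2 f  = c_filter[iIdx];          /* same tap for every thread in the warp: constant-cache broadcast */
            sum1.x += in.x * f.x + in.y * f.y;
            sum1.y += in.y * f.x - in.x * f.y;
            sum2   += in.x * in.x + in.y * in.y;
        }
        output[iThreadId] = (sum1.x * sum1.x + sum1.y * sum1.y) / (sum2 * 16.32f);
    }
}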