CUDA performance problem about matrix multiplication time optimization

I wrote a signal processing program that correlates an input signal of 3 million points with a vector of 1024 points, the output is a vector of 3 million points.
For example: output[0] = input[0] * filter[0] + input[1] * filter[1] + … + input[1023] * filter[1023]
This function is executed 8 times. The input data are all single-precision complex numbers (float2). The program is as follows:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include <complex.h>

#define N 3000000
#define BLOCK_SIZE 1024

/* Return the current wall-clock time in microseconds (host-side timer). */
long cpuSecond()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1000000L + tv.tv_usec;
}

/*
 * Sliding correlation of a complex input stream against a fixed 1024-point
 * complex filter, normalized by the local input power.
 *
 * One thread produces one output sample:
 *   output[t] = |sum_k input[t+k] * conj(filter[k])|^2 / (16.32 * sum_k |input[t+k]|^2)
 *
 * Launch: 1D grid/block; caller must pass iLen <= (input length - 1023) so
 * that input[iThreadId + 1023] stays in bounds.
 */
__global__ void auto_correlation(const float2 * __restrict__ input,
                                 const float2 * __restrict__ filter,
                                 float * __restrict__ output, int iLen)
{
    int iThreadId = blockIdx.x * blockDim.x + threadIdx.x;

    if (iThreadId < iLen)
    {
        /* Accumulators MUST be zero-initialized: the original code read them
         * with += while uninitialized, which is undefined behavior. */
        float2 sum1 = make_float2(0.0f, 0.0f);  /* complex correlation sum */
        float  sum2 = 0.0f;                     /* local input power sum   */

        for (int iIdx = 0; iIdx < 1024; iIdx++)
        {
            /* Load each element once into registers instead of re-reading
             * global memory three times per iteration. */
            float2 in = input[iThreadId + iIdx];
            float2 ft = filter[iIdx];

            /* (a+bi) * conj(c+di) = (a+bi)*(c-di) = (ac+bd) + (bc-ad)i */
            sum1.x += in.x * ft.x + in.y * ft.y;
            sum1.y += in.y * ft.x - in.x * ft.y;
            sum2   += in.x * in.x + in.y * in.y;
        }

        /* 16.32f: float literal avoids a silent promotion to double math. */
        sum2 = sum2 * 16.32f;
        output[iThreadId] = (sum1.x * sum1.x + sum1.y * sum1.y) / sum2;
    }
}

/*
 * Host driver: allocates host/device buffers, fills deterministic test data,
 * runs the correlation kernel 8 times, and reports elapsed wall-clock time.
 */
int main(int argc, char *argv[])   /* fixed signature: was `char* argv` */
{
    int i = 0;
    int dev = 0;
    long exec_start = 0;
    long exec_end = 0;
    float2 *h_input;
    float2 *h_filter;
    float2 *d_input;
    float2 *d_filter;
    float *result;
    float *d_result;
    size_t size_x = N * sizeof(float2);
    size_t size_y = BLOCK_SIZE * sizeof(float2);

    cudaSetDevice(dev);

    cudaMalloc((float2 **)&d_input, size_x);
    cudaMalloc((float2 **)&d_filter, size_y);
    cudaMalloc((float **)&d_result, N * sizeof(float));

    h_input  = (float2 *)malloc(size_x);
    h_filter = (float2 *)malloc(size_y);
    result   = (float *)malloc(N * sizeof(float));
    if (h_input == NULL || h_filter == NULL || result == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    /* Deterministic synthetic data; integer division by 100 is intentional
     * (values are truncated before the float assignment). */
    for (i = 0; i < N; i++)
    {
        h_input[i].x = ((1000 + i) % 10000) / 100;
        h_input[i].y = ((1000 + i) % 10000) / 100;
    }

    for (i = 0; i < BLOCK_SIZE; i++)
    {
        h_filter[i].x = ((1000 + i) % 10000) / 100;
        h_filter[i].y = ((1000 + i) % 10000) / 100;
    }

    exec_start = cpuSecond();

    int block = 1024;
    /* Derive the grid from N instead of hard-coding 2930 so the launch stays
     * correct if N changes; the kernel's iLen guard handles the tail. */
    int grid = (N + block - 1) / block;

    cudaMemcpy(d_input, h_input, size_x, cudaMemcpyHostToDevice);
    cudaMemcpy(d_filter, h_filter, size_y, cudaMemcpyHostToDevice);

    /* Kernels in the same (default) stream already execute in order, so one
     * synchronize after the loop is enough; syncing inside the loop only
     * serializes the host against each launch. */
    for (i = 0; i < 8; i++)
    {
        auto_correlation<<<grid, block>>>(d_input, d_filter, d_result, N - 1023);
    }
    cudaError_t err = cudaGetLastError();   /* catch launch-config errors */
    if (err != cudaSuccess)
    {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    cudaDeviceSynchronize();

    //cudaMemcpy(result, d_result, N * sizeof(float), cudaMemcpyDeviceToHost);

    exec_end = cpuSecond();

    /* Free device memory BEFORE resetting the device: the original code
     * called cudaDeviceReset() first, which destroys the context and makes
     * the subsequent cudaFree calls invalid. */
    cudaFree(d_input);
    cudaFree(d_filter);
    cudaFree(d_result);
    cudaDeviceReset();

    free(h_input);
    free(h_filter);
    free(result);

    printf("GPU time use %ld us\n", (exec_end - exec_start));
    return 0;
}

This program takes about 0.7 seconds to execute, and I wonder if there is a way to optimize the program and shorten the execution time.

Hi,

You can find our official sample for matrix multiplication below:

Thanks.

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.