Performance of mutual slice-wise vector distances

Hello,

I wrote a program that takes a number of vectors (with elements of type unsigned char). The vectors are subdivided into smaller vectors (slices). For one pair of vectors, the program selects every possible slice from one vector and every possible slice of the other vector and computes the distance (L1-norm) of each slice. The goal is, to find the minimum of these distances per vector pair (i.e., the slice-wise maximum similarity of any vector combination).

However, I found my CUDA implementation not being significantly faster than another implementation that runs on a CPU and uses AVX2 instructions. The number of vectors is 7000 in the first experiments and later may grow beyond 100’000. Since the number of combinations grows quadratically, computation time (counted in months or weeks) is really important here.

graphics card: Gainward Geforce 1080 GLH
CPU: Intel i7 7700k

Below you find the (condensed) source code which I compile via:
nvcc distances.cu --compiler-options=‘-O3’ -o distances

Basically, the program computes the upper triangular matrix of the mutual distance matrix (which is effectively stored as a vector),

I would very much appreciate suggestions on how to improve the computation time.

Thank you in advance!

#include <algorithm>
#include <limits>
#include <stdexcept>
#include <string>


struct Entry
{
    unsigned int m_size;
    unsigned char* m_data;
};

struct Database
{
    unsigned int m_size;
    Entry* m_entries;
};



__global__
void computeDistancesKernel(unsigned int entryCount, 
                            const Entry* entries, 
                            unsigned int dataSliceSize, 
                            unsigned int dataSliceShift, 
                            float* distances)
{
    int strideStartIdx = blockIdx.x * blockDim.x + threadIdx.x;
    int strideEndIdx = entryCount * entryCount / 2;

    int stride = blockDim.x * gridDim.x;
    for (int strideIdx = strideStartIdx; strideIdx < strideEndIdx; strideIdx += stride) {
        // x: first index within a (virtual) distance matrix
        // y: second index within a (virtual) distance matrix
        int x = strideIdx / entryCount;
        int y = strideIdx % entryCount;
        if (x == y) {
            // ignore
            continue; 
        } else if (x > y) {
            // reflect indices
            x = entryCount - x - 1;
            y = entryCount - y - 1;
        }

        //  holds: x < y < entryCount
        
        // compute linear index for the current index combination (x, y):
        unsigned int distIdx = x * (2 * entryCount - 3 - x) / 2 + y - 1;
        
        auto& entry1 = entries[x];
        auto& entry2 = entries[y];
        unsigned int compareSize = dataSliceSize;
        if (compareSize > entry1.m_size)
          compareSize = entry1.m_size;
        if (compareSize > entry2.m_size)
          compareSize = entry2.m_size;

        unsigned int endIdx1 = entry1.m_size - compareSize;
        unsigned int endIdx2 = entry2.m_size - compareSize;

        for (unsigned int idx1 = 0; idx1 <= endIdx1; idx1 += dataSliceShift) {
            auto data1 = &entry1.m_data[idx1];
            for (unsigned int idx2 = idx1 + 1; idx2 <= endIdx2; idx2 += dataSliceShift) {
                auto data2 = &entry2.m_data[idx2];
                float distance = 0;
                for (unsigned int i = 0; i < compareSize; ++i)
                    distance += fabsf(static_cast<float>(data1[i]) - static_cast<float>(data2[i]));
                distance /= static_cast<float>(compareSize);
                if (distance < distances[distIdx])
                    distances[distIdx] = distance;
            }
        }
    }
}


void computeDistances(Database& database, 
                      unsigned int dataSliceSize, 
                      unsigned int dataSliceShift, 
                      float* distances,
                      unsigned int distancesSize)
{
    // we are processing 4-tuples of bytes:
    dataSliceShift *= 4; 
    dataSliceSize *= 4;    
    
    // set distances to maximum:
    for (unsigned int i = 0; i < distancesSize; ++i)
        distances[i] = std::numeric_limits<float>::max();

    int numSMs;
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);

    unsigned int blockSize = 256;
    int numBlocks = std::min(32 * static_cast<unsigned int>(numSMs), (distancesSize + blockSize - 1) / blockSize);
    computeDistancesKernel<<<numBlocks, blockSize>>>(database.m_size,
                                                     database.m_entries,
                                                     dataSliceSize,
                                                     dataSliceShift,
                                                     distances);
    cudaDeviceSynchronize();

    auto cudaErr = cudaGetLastError();
    if (cudaErr != cudaSuccess) {
        const char* cudaErrStr = cudaGetErrorString(cudaErr);
        throw std::runtime_error(std::string("CUDA error: ") + std::string(cudaErrStr));
    }    
}

int main(int argc, char** argv)
{
    Database database;
    float* distances;    
    unsigned int pairCount;
       
    // initialize database:    
    database.m_size = 7000; // sample value
    cudaMallocManaged(&database.m_entries, database.m_size * sizeof(Entry));
    for (unsigned int i = 0; i < database.m_size; ++i) {
        Entry& entry = database.m_entries[i];        
        entry.m_size = 4000; // sample value; this may vary between entries
        cudaMallocManaged(&entry.m_data, entry.m_size);
        
        // <initialize the vector entry.m_data...>
    }
    
    // initialize result vector:
    pairCount = (database.m_size - 1) * database.m_size / 2;
    cudaMallocManaged(&distances, pairCount * sizeof(float));
    
    computeDistances(database, 66, 66, distances, pairCount);
    
    // <do something with result (distances)...>
    
    // free all resources:
    for (unsigned int i = 0; i < database.m_size; ++i) {
        cudaFree(database.m_entries[i].m_data);
    }
    cudaFree(database.m_entries);
    cudaFree(distances);
}