Hello,
i’m trying to speedup some function by moving them to GPU, but as a result, its much slower on GPU than on CPU.
Measured time on CPU is 103.796 msecs on GPU it is 8547.94 msecs.
im calling the function by:
SetBitAllGpu<<<50, 128, 0, streamSetBitGpu>>>(gpuBait,
gpuReducedMetaR, gpuReducedMetaRPerR, container->rCols, gpuMetaRSizes, container->bitarrayLength);
cudaDeviceSynchroice(); // wait until its finished
SetBarFirstValueGpu<<<50, 128, 0, streamSetBarFirstValueGpu>>>(gpuBar, gpuBait, container->bitarrayLength);
cudaDeviceSynchroice(); // wait until its finished
AndOpAllGpu<<<50, 128, 0, streamAndOpAllGpu>>>(gpuBar, gpuBait, container->ramboCols, container->bitarrayLength);
cudaDeviceSynchroice(); // wait until its finished
The methods are:
__global__ void SetBitAllGpu(char *bait, int *rMRs, uint* noOfrMRpr, uint R, uint* mRS, uint bal)
{
uint mrsc = 0;
for (uint r = 0; r < R; r++)
{
for (uint n = 0; n < noOfrMRpr[r]; n++)
{
for (uint s = 0; s < mRS[n]; s++)
{
bait[r*bal + (rMRs[mrsc + s] / 8)] |= (1 << (rMRs[mrsc + s] % 8));
__syncthreads();
}
__syncthreads();
mrsc += mRS[n];
__syncthreads();
}
__syncthreads();
}
__syncthreads();
}
__global__ void SetBarFirstValueGpu(char *bar, char *bait, int bitarrayLength)
{
for (int len = 0; len < bitarrayLength; len++)
{
bar[len] = bait[len]; //+ 1];
__syncthreads();
}
__syncthreads();
}
__global__ void AndOpAllGpu(char *bar, char *bait, int ramboCols, int bitarrayLength)
{
for (int baitNumber = 1; baitNumber < ramboCols; baitNumber++)
{
for (int len = 0; len < bitarrayLength; len++)
{
bar[len] &= bait[bitarrayLength * baitNumber + len]; //+ 1];
__syncthreads();
}
__syncthreads();
}
__syncthreads();
}