Hi there!
I’ve been using CUDA as part of my honours project, which focuses on optimizing an Aho-Corasick implementation that relies heavily on population count operations.
My program has two versions of the population count functions, which I switch between with #if defined(OPTIMIZATION_POPC). Both versions are shown below:
#if defined(OPTIMIZATION_POPC)
__device__ int
popcount_texture(unsigned temp){
    // This is the fastest popcount for 32-bit types, as detailed in the
    // "Software Optimization Guide for AMD64 Processors" (pp. 179-180)
    temp = temp - ((temp >> 1) & 0b01010101010101010101010101010101);
    temp = (temp & 0b00110011001100110011001100110011) + ((temp >> 2) & 0b00110011001100110011001100110011);
    return (((temp + (temp >> 4)) & 0b00001111000011110000111100001111) * 0b00000001000000010000000100000001) >> 24;
}

__device__ int // Count bits set in a given reduced node
popcount_node_gpu(unsigned bitmap[], int idx){
    int count = 0; // number of set bits
    for (int i = 0; i < DIV32(idx); ++i){
        //count += __popc(bitmap[i]); // This also doesn't work
        count += popcount_texture(bitmap[i]);
    }
    //count += __popc(bitmap[i] & ((1<<idx)-1)); // This also doesn't work
    count += popcount_texture(bitmap[idx/32] & ((1<<idx)-1));
    return count;
}
#else
// THESE ARE THE ONLY ONES THAT WORK? WHY?
__device__ int
popcount_texture(unsigned temp){
    int i = 0,     // index
        count = 0; // number of set bits
    do {
        if ((temp & (1 << i)) != 0){
            ++count;
        }
        ++i;
    } while (i < 32);
    return count;
}

__device__ int // Count bits set in a given reduced node
popcount_node_gpu(unsigned bitmap[], int idx){
    int i = 0,     // index
        count = 0; // number of set bits
    do {
        if ((bitmap[i/32] & (1 << (i % 32))) != 0){
            ++count;
        }
        ++i;
    } while (i < idx);
    return count;
}
#endif
I call these functions from my search_trie_global kernel (below) and a few others, and I have not been able to get the optimized versions working within my application at all.
The expected behaviour of the device popcount functions is to return the total population count of the given bitmap (an array of unsigned ints) up to a given bit index; instead they return 0. For example, for bitmap = {0xF0F0F0F0, 0x0000000F} and idx = 36 the expected result is 16 + 4 = 20.
I have tested all of these methods extensively on the CPU and found no issues, and I have also been able to get them working in small GPU test applications (such as this one: https://gist.github.com/AR-Calder/957f2bdcfe00a087ab57a73a13ddd33f), which is what confuses me so much. I really hope I’ve just missed something stupid, because the optimization gives a ~99% reduction in execution time on the CPU (500 ns → 6 ns) and I’d love to see how it performs in this context.
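The gist boils down to a standalone check along these lines (a simplified sketch written from memory rather than the exact file; the names and the main() harness here are illustrative, and the hex masks are just the binary literals above written in hex):

#include <cstdio>

// Simplified sketch of the standalone test: each thread counts the bits of a
// single word with the SWAR popcount and writes the result back to the host.
__device__ int popcount_texture_test(unsigned temp){
    temp = temp - ((temp >> 1) & 0x55555555u);
    temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u);
    return (((temp + (temp >> 4)) & 0x0F0F0F0Fu) * 0x01010101u) >> 24;
}

__global__ void popcount_test_kernel(const unsigned *in, int *out, int n){
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) out[tid] = popcount_texture_test(in[tid]);
}

int main(){
    const int n = 4;
    unsigned h_in[n] = {0x00000000u, 0x00000001u, 0xF0F0F0F0u, 0xFFFFFFFFu};
    int h_out[n];
    unsigned *d_in;
    int *d_out;
    cudaMalloc(&d_in, n * sizeof(unsigned));
    cudaMalloc(&d_out, n * sizeof(int));
    cudaMemcpy(d_in, h_in, n * sizeof(unsigned), cudaMemcpyHostToDevice);
    popcount_test_kernel<<<1, n>>>(d_in, d_out, n);
    cudaMemcpy(h_out, d_out, n * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i){
        printf("0x%08x -> %d\n", h_in[i], h_out[i]); // expect 0, 1, 16, 32
    }
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

That standalone version returns the expected counts; the full kernel where the same logic fails is below.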
__global__ void
search_trie_global(NodeReduced * __restrict__ trie_array, const char * __restrict__ input_text, const unsigned size_text, unsigned * out) {
    // Get ID of current thread
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= size_text) return;

    unsigned popcount = 0;
    NodeReduced * current_node = &trie_array[0];
    unsigned ascii = input_text[tid];
    for (int i = 0; tid + i < size_text; ++i){
        // Get character for comparison
        ascii = input_text[tid + i];
        if ((current_node->bitmap[ascii/32] & (1 << (ascii%32))) != 0){
            // Match, get next node
            popcount = popcount_node_gpu(current_node->bitmap, ascii);
            current_node = &trie_array[(current_node->offset) + popcount];
        } else {
            // No match, no point in checking any further
            break;
        }
    }
    out[tid] = (current_node->offset == NodeState::end);
}
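For context, NodeReduced is essentially a 256-bit child bitmap (8 x 32-bit words, indexed by ascii/32) plus an offset into trie_array. A stripped-down version of the layout (the sentinel value used for NodeState::end here is a placeholder, not the real one):

// Stripped-down sketch of the node layout the kernels assume: one presence
// bit per possible byte value, plus the index of the node's first child.
enum NodeState : unsigned { end = 0xFFFFFFFFu }; // placeholder sentinel value

struct NodeReduced {
    unsigned bitmap[8]; // child bitmap: bit (ascii % 32) of word (ascii / 32)
    unsigned offset;    // index of first child in trie_array, or NodeState::end
};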
Here is another usage example, search_trie_texture:
__global__ void
search_trie_texture(const cudaTextureObject_t texture_trie, const char * __restrict__ input_text, const int size_text, unsigned * out) {
    // Get ID of current thread
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= size_text) return;

    int popcount = 0;
    int node_idx = 0;
    int ascii = 0;
    int ascii_idx = 0;
    for (int i = tid; i < size_text; ++i){
        // Get character for comparison
        ascii = input_text[i];
        ascii_idx = ascii/32;
        if ((tex1Dfetch<int>(texture_trie, node_idx + ascii_idx) & (1 << (ascii % 32))) != 0){
            // Match, get next node
            for (int j = 0; j < ascii_idx; ++j){
                popcount += popcount_texture(tex1Dfetch<int>(texture_trie, node_idx + j));
            }
            popcount += popcount_texture(tex1Dfetch<int>(texture_trie, node_idx + ascii_idx) & ((1<<MOD32(ascii))-1));
            node_idx = TEXTURE_WIDTH*(tex1Dfetch<int>(texture_trie, node_idx + OFFSET_IDX) + popcount);
        } else {
            // No match, no point in checking any further
            break;
        }
        popcount = 0;
    }
    out[tid] = (tex1Dfetch<int>(texture_trie, node_idx + OFFSET_IDX) == NodeState::end);
}
When testing these functions I reuse the same input data, so there shouldn’t be any difference in output between the two versions.
I’ve been trying to diagnose the issue for several days now and I really don’t understand why it works in my test case but not in the full implementation. Can anyone offer any insight into what on earth is going on?
I’ve run it under cuda-memcheck, which reported 0 errors. My GPU drives the primary display on my system, so (as far as I know) I am unable to debug with cuda-gdb. Not sure if it matters, but I am using NVCC 10.0 on Fedora 28.
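Since cuda-gdb is off the table for me, the only device-side inspection I can think of is printf from a throwaway kernel that exercises popcount_node_gpu on a single node, something like the sketch below (the kernel name and launch shape are made up; NodeReduced and popcount_node_gpu are the ones defined above):

#include <cstdio>

// Throwaway debug kernel: print the bitmap word and the popcount result for
// one node and one character, so the two #if paths can be compared by value.
__global__ void debug_popcount(NodeReduced * __restrict__ trie_array, unsigned ascii){
    NodeReduced *node = &trie_array[0];
    printf("word %u = 0x%08x, popcount up to bit %u = %d\n",
           ascii / 32, node->bitmap[ascii / 32], ascii,
           popcount_node_gpu(node->bitmap, ascii));
}

// e.g. debug_popcount<<<1, 1>>>(device_trie_pointer, 'a'); right before the search kernel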