Hello everyone,
I met a problem when I copy the hash table from gpu to cpu. I tested the hash table in gpu and it has been properly built. However, when I try to copy the whole table from gpu to cpu using the function copy_table_to_host given in the following, the entry->next always points to itself. I don’t know why this happens. Does anyone know how to solve this problem?
#define imin(a,b) (a<b?a:b)
const int N = 1024 * 1024;
const int threadsPerBlock = 256;
const int blocksPerGrid = imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);
struct Lock {
int *mutex;
Lock(void) {
int state = 0;
cudaMalloc((void**)&mutex, sizeof(int));
cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
}
__device__ void lock(void) {
while (atomicCAS(mutex, 0, 1) != 0);
__threadfence();
}
__device__ void unlock(void) {
atomicExch(mutex, 0);
__threadfence();
}
};
struct Entry{
unsigned int key;
float value;
Entry *next;
};
struct Table{
size_t count;
Entry **entries;
Entry *pool;
};
__device__ __host__ size_t hash(unsigned int value, size_t count){
return value % count;
}
void initialize_table(Table &table, int nb_entries, int nb_elements){
table.count = nb_entries;
cudaMalloc((void**)&table.entries, nb_entries*sizeof(Entry*));
cudaMemset(table.entries, 0, nb_entries*sizeof(Entry*));
cudaMalloc((void**)&table.pool, nb_elements*sizeof(Entry));
}
void free_table(Table &table)
{
cudaFree(table.pool);
cudaFree(table.entries);
}
void copy_table_to_host(const Table &table, Table &hostTable){
hostTable.count = table.count;
hostTable.entries = (Entry**)calloc(table.count, sizeof(Entry*));
hostTable.pool = (Entry*)malloc(N*sizeof(Entry));
cudaMemcpy(hostTable.entries, table.entries, table.count*sizeof(Entry*),cudaMemcpyDeviceToHost);
cudaMemcpy(hostTable.pool, table.pool, N * sizeof(Entry), cudaMemcpyDeviceToHost);
for (int i = 0; i<N; i++){
if (hostTable.pool[i].next != NULL)
hostTable.pool[i].next = (Entry*)((size_t)hostTable.pool[i].next - (size_t)table.pool + (size_t)hostTable.pool);
}
for (int i=0;i<table.count;i++){
if (hostTable.entries[i] != NULL)
hostTable.entries[i] = (Entry*)((size_t)hostTable.entries[i] - (size_t)table.pool + (size_t)hostTable.pool);
}
}
__global__ void add_to_table(unsigned int *keys, float *values, Table table, Lock *lock)
{
int tid = threadIdx.x + blockIdx.x*gridDim.x;
int stride = blockDim.x*gridDim.x;
while (tid<N){
unsigned int key = keys[tid];
size_t hashValue = hash(key, table.count);
for (int i=0;i<32;i++)
{
if ((tid % 32) == i)
{
Entry *location = &(table.pool[tid]);
location->key = key;
location->value = values[tid];
lock[hashValue].lock();
location->next = table.entries[hashValue];
table.entries[hashValue] = location;
lock[hashValue].unlock();
}
}
tid += stride;
}
}
void hash_table_test()
{
unsigned int *buffer = (unsigned int*)malloc(N*sizeof(unsigned int));
for (int i = 0; i<N; i++) buffer[i] = (unsigned int)i;
float *values;
values = (float*)malloc(N*sizeof(float));
for (int i = 0; i<N; i++)
values[i] = (float)i*0.01;
unsigned int *dev_keys;
float *dev_values;
cudaMalloc((void**)&dev_keys, N * sizeof(unsigned int));
cudaMalloc((void**)&dev_values, N * sizeof(float));
cudaMemcpy(dev_keys,buffer, N * sizeof(unsigned int),cudaMemcpyHostToDevice);
cudaMemcpy(dev_values, values, N * sizeof(float), cudaMemcpyHostToDevice);
Table table;
initialize_table(table, HASH_ENTRIES, N);
Lock lock[HASH_ENTRIES];
Lock *dev_lock;
cudaMalloc((void**)&dev_lock, HASH_ENTRIES * sizeof(Lock));
cudaMemcpy(dev_lock, lock, HASH_ENTRIES * sizeof(Lock), cudaMemcpyHostToDevice);
add_to_table << < blocksPerGrid, threadsPerBlock >> > (dev_keys, dev_values, table, dev_lock);
Table hostTable;
copy_table_to_host(table, hostTable);
free_table(table);
cudaFree(dev_lock);
cudaFree(dev_keys);
cudaFree(dev_values);
free(buffer);
}