Copy from Host > Device and Device > Host works once?

Hello,

I am currently working on a project that involves CUDA. I have very little CUDA experience, so this problem is confusing me a bit.

The problem is that I am using the GPU for a hash table (currently I am just using 1 block and 1 thread…want to get this to work first). Anyway, I can get the data to copy to the GPU to initialize everything. However, when I read in a file and try to insert the values into the hash table, the table I copy back from the GPU to the CPU isn’t being updated. In fact, it looks like unintialized data. Any help would be appreciated. Thanks.

various:

const int TABLESIZE = 50; // ( 1024 * 1024 * 1 );

const int BLOCKS = 512;

struct HashEntry

{

    char key[51];

    int keysize;

    int values[512];

    int count;

};

main function:

int main( void )

{

    HashEntry HashTable[TABLESIZE]; //= new HashEntry[TABLESIZE];

    HashEntry *GPUtable;

long unsigned int tablesize = TABLESIZE * sizeof( HashEntry );

ifstream inFile( "input.txt" );

if( inFile.fail() )

    {

	std::cout << "ERROR: cannot open if.txt!"  << std::endl;

	exit(1);

    }

cudaMalloc( (void**) &GPUtable, tablesize );

    cudaMemcpy( GPUtable, HashTable, tablesize, cudaMemcpyHostToDevice );     

initializeTable2<<<1, 1>>>( GPUtable );

cudaMemcpy( HashTable, GPUtable, tablesize, cudaMemcpyDeviceToHost );

    cudaFree( GPUtable );

buildHashTable( HashTable, tablesize, inFile );

cudaFree( GPUtable );	

return 0;

}

buildHashTable function:

void buildHashTable( HashEntry Table[], int tablesize, ifstream& inFile )

{

double duration;

    int aKeySize;

    int val;

std::string key;

    char* aKey;

HashEntry *GPUtable;

    HashEntry Table2[TABLESIZE];

cudaMalloc( (void**) &GPUtable, tablesize );

    cudaMemcpy( GPUtable, Table, tablesize, cudaMemcpyHostToDevice );	

Timer timer;

    timer.start();

while( inFile >> key >> val ) 

    {

	aKey = (char *)key.c_str();

	aKeySize = key.length();

	

	// Only have one block and one thread do the insert to avoid 

	// race conditions

	insert<<<1, 1>>>( GPUtable, aKey, aKeySize, &val );

    }

duration = timer.stop();

    timer.printTime( duration );

int value;

cudaMemcpy( &Table2, GPUtable, tablesize, cudaMemcpyDeviceToHost );

for( int i = 0; i < 50; i++ )

    {

        key = Table2[i].key;

        value = Table2[i].values[0];

        std::cout << i << ": key = " << key <<  ", value = "  << value 

                  << ", count = " << Table2[i].count <<  std::endl;

}

cudaFree( GPUtable );

}

initializeTable2 function:

__global__ void initializeTable2( HashEntry *Table )

{

    int lowerBound = blockIdx.x * BLOCKS;

    int upperBound = ( blockIdx.x + 1 ) * BLOCKS;

char *empty = "empty1";

for( int i = 0; i < 50 && i < TABLESIZE; i++ )

    {

        Table[i].count = 5;

        stringCopy( Table[i].key, empty, 5 );

        Table[i].keysize = 0;

        for( int j = 0; j < 512; j++ )

        {

            Table[i].values[j] = -1;

        }

    }

    //__syncthreads();

}

insert function:

__global__ void insert( HashEntry *Table, const char *key, int keysize, int *value )

{

    int index;

    int *res;

int i = 0; 

Table[1].count++;

for( i = 0; i < 25 ; i++ )

    {

        //Table[1].count++;

	hashFunction( key, keysize, i, &index );

	if( Table[0].count == 0 )

	{

    	    stringCopy( Table[0].key, (char *) key, keysize );

    	    Table[0].values[ Table[0].count ] = *value;

     	    Table[0].count++;

	    break;

    	}

	else if( Table[index].count < 512 )

	{ 

	    stringCompare( Table[index].key, Table[index].keysize, (char *) key, keysize, res );

    	    if( *res == 1 )

	    {

		Table[index].values[ Table[index].count ] = *value;

		Table[index].count++;

		break;

	    }

	}

    }

}

stringCopy function:

__device__ void stringCopy( char *dest, char *src, int srclength )

{

    for( int i = 0; i < srclength && i < 50; i++ )

    {

	*dest = *src;

	dest++;

	src++;

    }	

*dest = '\0'; //Null terminator

}

I did not include all functions because right now I am not actually using my hash function, I am just trying to hard code values into the hash table to return to the CPU. Thanks again!

Some generic advice:

Check the return codes from all calls to CUDA functions for errors.
Run the program with cuda-memcheck to see if you have out-of-bounds memory accesses.

Thanks for the speedy reply. It looks like when I do the final cudaMemcpy in my build function, I am getting a cudaErrorLaunchFailure error.

Edit: Figured out the problem. In my buildHashTable function I wasn’t actually copying the key, keysize, and value to the GPU, so I was using addresses that referred to the host.