Hi all,
I have a program structured roughly as follows:
// Host-side driver (pseudocode sketch): copies the two shared arrays and the input data
// to device memory, launches my_kernel, and copies the result back to the host.
int main()
{
// These two arrays are used by all threads
src_array1[256] = {,,,,,};
src_array2[256] = {,,,,,};
// The data is of moderate size; each thread operates on 16 bytes of it
src_data[1 MB] = {,,,,,};
src_result[1 MB];
// Allocate device memory for the two arrays
cudaMalloc( (void**)&array1, array1Size);
cudaMalloc( (void**)&array2, array2Size);
// Allocate device memory for the input data and the result
cudaMalloc( (void**)&data, dataSize );
cudaMalloc( (void**)&result, resultSize );
// Copy the input data and both arrays from host memory to device memory
cudaMemcpy( data, src_data, dataSize, cudaMemcpyHostToDevice );
cudaMemcpy( array1, src_array1, array1Size, cudaMemcpyHostToDevice );
cudaMemcpy( array2, src_array2, array2Size, cudaMemcpyHostToDevice );
// Launch the kernel with 3000 blocks of 16 threads each
// NOTE(review): 3000 blocks x 16 threads x 16 bytes per thread = 768,000 bytes, which is
// less than the 1 MB declared for src_data -- confirm the launch covers all of the data.
dim3 dimGrid(3000,1,1);
dim3 dimBlock(16,1,1);
my_kernel<<< dimGrid, dimBlock >>>(result,data,array1,array2);
// Wait on the host until the kernel has finished
// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime API;
// cudaDeviceSynchronize() is the documented replacement -- confirm against the toolkit in use.
cudaThreadSynchronize();
// After processing, the kernel's output is in `result` in global memory; copy it back to the host
cudaMemcpy(src_result,result, resultSize, cudaMemcpyDeviceToHost );
}
__global__ void my_kernel(*result, *data, *array1, *array2)
{
//processing of data by using array1 and array2
.....
....
Problem 1: Reading array1 and array2 from global memory here takes a lot of time.
Problem 2: array1 and array2 are effectively lookup tables, so I need to access them very often.
....
....
// Copying the processed data to result in global memory
......
Problem 3: Writing the results back to global memory also takes a lot of time.
.....
}
I need suggestions regarding memory management. Most of the run time is spent accessing array1 and array2 and writing the processed data back to global memory.
Thanks in advance!