Dynamic Memory Allocation Problems

Hi *,

I am trying to setup a data model for the Viterbi algorithm.
And I am having trouble with it.

this is my gpu_context structure:

typedef float hmm_probability_t;
typedef int hmm_observation_t;
typedef int hmm_hidden_state_t;
typedef float start_probabilities_t;
typedef float transition_matrix_t;
typedef float emission_matrix_t;

typedef struct gpgpu_context{
    FILE *fd;
    int numClients;
	int clientInputs;
	int clientOutputs;
	int nFrames;

	cudaStream_t *stream;
	cudaError_t *result;

	int **cnt_h;
	int **cnt_d;

	hmm_hidden_state_t	**backtrack_states_h;
	hmm_probability_t	**max_path_probability_h;

	hmm_hidden_state_t	**backtrack_states_d;
	hmm_probability_t	**max_path_probability_d;

} gpgpu_context_t;

__constant__ hmm_hidden_state_t hmm_hidden_states[ HIDDENSTATES ];
__constant__ hmm_observation_t hmm_observations[ OBSERVATIONS ];
__constant__ start_probabilities_t hmm_start_probabilities[ HIDDENSTATES ];
__constant__ transition_matrix_t hmm_transition_matrix[ HIDDENSTATES * HIDDENSTATES ];
__constant__ emission_matrix_t hmm_emission_matrix[ HIDDENSTATES * OBSERVATIONS ];

The gpu_context is allocated and initialized in the calling C application, as well as all the **pointer.

int numClients = 1;
	/*
	 * 		Only Pointer to Arrays
	 */
	gpu_ctx->stream = (cudaStream_t*)malloc( numClients * sizeof(cudaStream_t));
	gpu_ctx->result = (cudaError_t*)malloc( numClients * sizeof(cudaError_t));

	int counterSize = sizeof(int) * 20;
	gpu_ctx->cnt_h = (int**)malloc( numClients * counterSize );
	gpu_ctx->cnt_d = (int**)malloc( numClients * counterSize );

	for(int m=0; m<gpu_ctx->numClients; m++){
		gpu_ctx->result[m] = cudaStreamCreate(&gpu_ctx->stream[m]);

		checkCudaErrors(cudaMallocHost((void**)&gpu_ctx->cnt_h[m], counterSize ));
		checkCudaErrors(cudaMalloc((void**)&gpu_ctx->cnt_d[m], counterSize ));

		checkCudaErrors(cudaMemset(gpu_ctx->cnt_h[m], 0, counterSize ));
		checkCudaErrors(cudaMemset(gpu_ctx->cnt_d[m], 0, counterSize ));

		cudaError_t err = cudaGetLastError();
		if (err != cudaSuccess)
		    printf("Error: %s\n", cudaGetErrorString(err));
		else
			printf("Success\n");

	}

The HMM, on which the Viterbi algorithm shall operate, is initialized with test values.

gpu_ctx->backtrack_states_h = (hmm_hidden_state_t**)malloc( numClients * sizeof(hmm_hidden_state_t) * OBSERVATIONS );
	gpu_ctx->max_path_probability_h = (hmm_probability_t**)malloc( numClients * sizeof(hmm_probability_t) * 2 );

	gpu_ctx->backtrack_states_d = (hmm_hidden_state_t**)malloc( numClients * sizeof(hmm_hidden_state_t) * OBSERVATIONS );
	gpu_ctx->max_path_probability_d = (hmm_probability_t**)malloc( numClients * sizeof(hmm_probability_t) * 2 );

	for(int m=0; m<numClients; m++){
		printf("Malloc HMM %d\n", m);

		checkCudaErrors( cudaMalloc( (void**)hmm_observations, sizeof(hmm_observation_t) * OBSERVATIONS));
		checkCudaErrors( cudaMalloc( (void**)hmm_hidden_states, sizeof(hmm_hidden_state_t)* HIDDENSTATES));
		checkCudaErrors( cudaMalloc( (void**)hmm_start_probabilities, sizeof(start_probabilities_t) * HIDDENSTATES));
		checkCudaErrors( cudaMalloc( (void**)hmm_transition_matrix, sizeof(transition_matrix_t) * HIDDENSTATES * HIDDENSTATES));
		checkCudaErrors( cudaMalloc( (void**)hmm_emission_matrix, sizeof(emission_matrix_t) * HIDDENSTATES * OBSERVATIONS ) );

		hmm_observation_t observations[ OBSERVATIONS ] = {1, 2, 3, 4, 5, 6};
		hmm_hidden_state_t hidden_states[ HIDDENSTATES ] = {1, 2, 3};
		start_probabilities_t start_probabilities[ HIDDENSTATES ] = {0.5, 0.25, 0.25};
		transition_matrix_t transition_matrix[ HIDDENSTATES * HIDDENSTATES ] = {
				 0.1,   0.45,  0.45,
				 0.45,  0.1,   0.45,
				 0.45,  0.45,  0.1
		};
		emission_matrix_t emission_matrix[ HIDDENSTATES * OBSERVATIONS ] = {
				   0.5,     0.1,   0.1,   0.1,   0.1,  0.1  ,
				   0.001,   0.2,   0.2,   0.2,   0.2,  0.19 ,
				   0.001,   0.2,   0.2,   0.19,  0.2,  0.2
		};
		checkCudaErrors(cudaMemcpyToSymbol(hmm_observations, observations,( sizeof(hmm_observation_t) * OBSERVATIONS )));
		checkCudaErrors(cudaMemcpyToSymbol(hmm_hidden_states, hidden_states,( sizeof(hmm_hidden_state_t) * HIDDENSTATES )));
		checkCudaErrors(cudaMemcpyToSymbol(hmm_start_probabilities, start_probabilities,( sizeof(start_probabilities_t) * HIDDENSTATES )));
		checkCudaErrors(cudaMemcpyToSymbol(hmm_transition_matrix, transition_matrix,( sizeof(transition_matrix_t) * HIDDENSTATES * HIDDENSTATES )));
		checkCudaErrors(cudaMemcpyToSymbol(hmm_emission_matrix, emission_matrix,( sizeof(emission_matrix_t) * HIDDENSTATES * OBSERVATIONS )));

		int backt_len = sizeof(hmm_hidden_state_t) * OBSERVATIONS;
		int prob_len = sizeof(hmm_probability_t) * 2;

		checkCudaErrors(cudaMallocHost((void**)&gpu_ctx->backtrack_states_h[m], backt_len ));
		checkCudaErrors(cudaMalloc((void**)&gpu_ctx->backtrack_states_d[m], backt_len ));

		checkCudaErrors(cudaMemset(gpu_ctx->backtrack_states_h[m], 0, backt_len ));
		checkCudaErrors(cudaMemset(gpu_ctx->backtrack_states_d[m], 0, backt_len ));

		checkCudaErrors(cudaMallocHost((void**)&gpu_ctx->max_path_probability_h[m], prob_len ));
		checkCudaErrors(cudaMalloc((void**)&gpu_ctx->max_path_probability_d[m], prob_len ));

		checkCudaErrors(cudaMemset(gpu_ctx->max_path_probability_h[m], 0, prob_len ));
		checkCudaErrors(cudaMemset(gpu_ctx->max_path_probability_d[m], 0, prob_len ));
	}
__global__ void viterbiKernel(int clientNumber, int* cnt_d, hmm_hidden_state_t* backtrack_path_d, hmm_probability_t *max_probability_d) {
	viterbiNode_t trellis[ OBSERVATIONS * HIDDENSTATES ];

	*max_probability_d = 10.75;
	/*
	 * Init first Viterbi Stage
	 */
	for(int _state = 0; _state < HIDDENSTATES; _state++){
		cnt_d[0]++;
		trellis[ _state ].probability = hmm_start_probabilities[ _state ] * hmm_emission_matrix[ _state + OBSERVATIONS ];
		trellis[ _state ].previous_state = - 1;
		trellis[ _state ].state = hmm_start_probabilities[ _state ];
	}

	/*
	 * Run Viterbi Forward Algorithm for t > 0
	 */
	for(int t = 1; t < OBSERVATIONS; t++){
		cnt_d[1]++;
		for(int _state = 0; _state < HIDDENSTATES; _state++){
			float max_tr_probability = 0;
			cnt_d[2]++;
			for(int _prev_state = 0; _prev_state < HIDDENSTATES; _prev_state++){
				cnt_d[3]++;
				float tmp_prob = trellis[ (t - 1) * HIDDENSTATES + _prev_state ].probability * hmm_transition_matrix[ _prev_state + _state * HIDDENSTATES ];
				if( tmp_prob > max_tr_probability ){
					cnt_d[4]++;
					max_tr_probability = tmp_prob;
					break;
				}
			}

			for(int _prev_state = 0; _prev_state < HIDDENSTATES; _prev_state++){
				cnt_d[5]++;
				if( max_tr_probability == trellis[ (t - 1) * HIDDENSTATES + _prev_state ].probability * hmm_transition_matrix[ _prev_state + _state * HIDDENSTATES ] ){
					cnt_d[6]++;
					trellis[ t * HIDDENSTATES + _state ].probability = max_tr_probability ;///* MEMORY CRASH*/ * hmm_emission_matrix[ _state + t * OBSERVATIONS ];
					trellis[ t * HIDDENSTATES + _state ].previous_state = _prev_state;
					trellis[ t * HIDDENSTATES + _state ].state = hmm_hidden_states[ _state ];
					break;
				}
			}
		}
	}

	/*
	 *  Find the highest probability
	 */
	for(int _state = HIDDENSTATES; _state > 0; _state--){
		cnt_d[7]++;
		float tmp_prob = trellis[ OBSERVATIONS * HIDDENSTATES - _state ].probability;
		if( tmp_prob > 0 ){ ///*MEMORY CRASH*/*max_probability_d ){
			cnt_d[8]++;
			*max_probability_d = tmp_prob;
			break;
		}
	}

	cnt_d[12]++;
}

The Viterbi algorithm is under development and not correct yet.
The actual problems are invalid address accesses by the kernel.
I have two lines, one working, one crashing:

*max_probability_d = 10.75;

works fine.

*max_probability_d = tmp_prob;

crashes.

I tried debugging it with cuda-memcheck, gdb, valgrind, but I am not able to determine the problem.

Are there any suggestions?

BR,
Christoph Kuhr