Hi *,
I am trying to setup a data model for the Viterbi algorithm.
And I am having trouble with it.
this is my gpu_context structure:
typedef float hmm_probability_t;
typedef int hmm_observation_t;
typedef int hmm_hidden_state_t;
typedef float start_probabilities_t;
typedef float transition_matrix_t;
typedef float emission_matrix_t;
typedef struct gpgpu_context{
FILE *fd;
int numClients;
int clientInputs;
int clientOutputs;
int nFrames;
cudaStream_t *stream;
cudaError_t *result;
int **cnt_h;
int **cnt_d;
hmm_hidden_state_t **backtrack_states_h;
hmm_probability_t **max_path_probability_h;
hmm_hidden_state_t **backtrack_states_d;
hmm_probability_t **max_path_probability_d;
} gpgpu_context_t;
__constant__ hmm_hidden_state_t hmm_hidden_states[ HIDDENSTATES ];
__constant__ hmm_observation_t hmm_observations[ OBSERVATIONS ];
__constant__ start_probabilities_t hmm_start_probabilities[ HIDDENSTATES ];
__constant__ transition_matrix_t hmm_transition_matrix[ HIDDENSTATES * HIDDENSTATES ];
__constant__ emission_matrix_t hmm_emission_matrix[ HIDDENSTATES * OBSERVATIONS ];
The gpu_context is allocated and initialized in the calling C application, as well as all the **pointer.
int numClients = 1;
/*
* Only Pointer to Arrays
*/
gpu_ctx->stream = (cudaStream_t*)malloc( numClients * sizeof(cudaStream_t));
gpu_ctx->result = (cudaError_t*)malloc( numClients * sizeof(cudaError_t));
int counterSize = sizeof(int) * 20;
gpu_ctx->cnt_h = (int**)malloc( numClients * counterSize );
gpu_ctx->cnt_d = (int**)malloc( numClients * counterSize );
for(int m=0; m<gpu_ctx->numClients; m++){
gpu_ctx->result[m] = cudaStreamCreate(&gpu_ctx->stream[m]);
checkCudaErrors(cudaMallocHost((void**)&gpu_ctx->cnt_h[m], counterSize ));
checkCudaErrors(cudaMalloc((void**)&gpu_ctx->cnt_d[m], counterSize ));
checkCudaErrors(cudaMemset(gpu_ctx->cnt_h[m], 0, counterSize ));
checkCudaErrors(cudaMemset(gpu_ctx->cnt_d[m], 0, counterSize ));
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
else
printf("Success\n");
}
The HMM, on which the Viterbi algorithm shall operate, is initialized with test values.
gpu_ctx->backtrack_states_h = (hmm_hidden_state_t**)malloc( numClients * sizeof(hmm_hidden_state_t) * OBSERVATIONS );
gpu_ctx->max_path_probability_h = (hmm_probability_t**)malloc( numClients * sizeof(hmm_probability_t) * 2 );
gpu_ctx->backtrack_states_d = (hmm_hidden_state_t**)malloc( numClients * sizeof(hmm_hidden_state_t) * OBSERVATIONS );
gpu_ctx->max_path_probability_d = (hmm_probability_t**)malloc( numClients * sizeof(hmm_probability_t) * 2 );
for(int m=0; m<numClients; m++){
printf("Malloc HMM %d\n", m);
checkCudaErrors( cudaMalloc( (void**)hmm_observations, sizeof(hmm_observation_t) * OBSERVATIONS));
checkCudaErrors( cudaMalloc( (void**)hmm_hidden_states, sizeof(hmm_hidden_state_t)* HIDDENSTATES));
checkCudaErrors( cudaMalloc( (void**)hmm_start_probabilities, sizeof(start_probabilities_t) * HIDDENSTATES));
checkCudaErrors( cudaMalloc( (void**)hmm_transition_matrix, sizeof(transition_matrix_t) * HIDDENSTATES * HIDDENSTATES));
checkCudaErrors( cudaMalloc( (void**)hmm_emission_matrix, sizeof(emission_matrix_t) * HIDDENSTATES * OBSERVATIONS ) );
hmm_observation_t observations[ OBSERVATIONS ] = {1, 2, 3, 4, 5, 6};
hmm_hidden_state_t hidden_states[ HIDDENSTATES ] = {1, 2, 3};
start_probabilities_t start_probabilities[ HIDDENSTATES ] = {0.5, 0.25, 0.25};
transition_matrix_t transition_matrix[ HIDDENSTATES * HIDDENSTATES ] = {
0.1, 0.45, 0.45,
0.45, 0.1, 0.45,
0.45, 0.45, 0.1
};
emission_matrix_t emission_matrix[ HIDDENSTATES * OBSERVATIONS ] = {
0.5, 0.1, 0.1, 0.1, 0.1, 0.1 ,
0.001, 0.2, 0.2, 0.2, 0.2, 0.19 ,
0.001, 0.2, 0.2, 0.19, 0.2, 0.2
};
checkCudaErrors(cudaMemcpyToSymbol(hmm_observations, observations,( sizeof(hmm_observation_t) * OBSERVATIONS )));
checkCudaErrors(cudaMemcpyToSymbol(hmm_hidden_states, hidden_states,( sizeof(hmm_hidden_state_t) * HIDDENSTATES )));
checkCudaErrors(cudaMemcpyToSymbol(hmm_start_probabilities, start_probabilities,( sizeof(start_probabilities_t) * HIDDENSTATES )));
checkCudaErrors(cudaMemcpyToSymbol(hmm_transition_matrix, transition_matrix,( sizeof(transition_matrix_t) * HIDDENSTATES * HIDDENSTATES )));
checkCudaErrors(cudaMemcpyToSymbol(hmm_emission_matrix, emission_matrix,( sizeof(emission_matrix_t) * HIDDENSTATES * OBSERVATIONS )));
int backt_len = sizeof(hmm_hidden_state_t) * OBSERVATIONS;
int prob_len = sizeof(hmm_probability_t) * 2;
checkCudaErrors(cudaMallocHost((void**)&gpu_ctx->backtrack_states_h[m], backt_len ));
checkCudaErrors(cudaMalloc((void**)&gpu_ctx->backtrack_states_d[m], backt_len ));
checkCudaErrors(cudaMemset(gpu_ctx->backtrack_states_h[m], 0, backt_len ));
checkCudaErrors(cudaMemset(gpu_ctx->backtrack_states_d[m], 0, backt_len ));
checkCudaErrors(cudaMallocHost((void**)&gpu_ctx->max_path_probability_h[m], prob_len ));
checkCudaErrors(cudaMalloc((void**)&gpu_ctx->max_path_probability_d[m], prob_len ));
checkCudaErrors(cudaMemset(gpu_ctx->max_path_probability_h[m], 0, prob_len ));
checkCudaErrors(cudaMemset(gpu_ctx->max_path_probability_d[m], 0, prob_len ));
}
__global__ void viterbiKernel(int clientNumber, int* cnt_d, hmm_hidden_state_t* backtrack_path_d, hmm_probability_t *max_probability_d) {
viterbiNode_t trellis[ OBSERVATIONS * HIDDENSTATES ];
*max_probability_d = 10.75;
/*
* Init first Viterbi Stage
*/
for(int _state = 0; _state < HIDDENSTATES; _state++){
cnt_d[0]++;
trellis[ _state ].probability = hmm_start_probabilities[ _state ] * hmm_emission_matrix[ _state + OBSERVATIONS ];
trellis[ _state ].previous_state = - 1;
trellis[ _state ].state = hmm_start_probabilities[ _state ];
}
/*
* Run Viterbi Forward Algorithm for t > 0
*/
for(int t = 1; t < OBSERVATIONS; t++){
cnt_d[1]++;
for(int _state = 0; _state < HIDDENSTATES; _state++){
float max_tr_probability = 0;
cnt_d[2]++;
for(int _prev_state = 0; _prev_state < HIDDENSTATES; _prev_state++){
cnt_d[3]++;
float tmp_prob = trellis[ (t - 1) * HIDDENSTATES + _prev_state ].probability * hmm_transition_matrix[ _prev_state + _state * HIDDENSTATES ];
if( tmp_prob > max_tr_probability ){
cnt_d[4]++;
max_tr_probability = tmp_prob;
break;
}
}
for(int _prev_state = 0; _prev_state < HIDDENSTATES; _prev_state++){
cnt_d[5]++;
if( max_tr_probability == trellis[ (t - 1) * HIDDENSTATES + _prev_state ].probability * hmm_transition_matrix[ _prev_state + _state * HIDDENSTATES ] ){
cnt_d[6]++;
trellis[ t * HIDDENSTATES + _state ].probability = max_tr_probability ;///* MEMORY CRASH*/ * hmm_emission_matrix[ _state + t * OBSERVATIONS ];
trellis[ t * HIDDENSTATES + _state ].previous_state = _prev_state;
trellis[ t * HIDDENSTATES + _state ].state = hmm_hidden_states[ _state ];
break;
}
}
}
}
/*
* Find the highest probability
*/
for(int _state = HIDDENSTATES; _state > 0; _state--){
cnt_d[7]++;
float tmp_prob = trellis[ OBSERVATIONS * HIDDENSTATES - _state ].probability;
if( tmp_prob > 0 ){ ///*MEMORY CRASH*/*max_probability_d ){
cnt_d[8]++;
*max_probability_d = tmp_prob;
break;
}
}
cnt_d[12]++;
}
The Viterbi algorithm is under development and not correct yet.
The actual problems are invalid address accesses by the kernel.
I have two lines, one working, one crashing:
*max_probability_d = 10.75;
works fine.
*max_probability_d = tmp_prob;
crashes.
I tried debugging it with cuda-memcheck, gdb, valgrind, but I am not able to determine the problem.
Are there any suggestions?
BR,
Christoph Kuhr