Help needed in increment of score in CUDA

nahseez · June 21, 2011, 8:59am

Hi,

I am a student and i m working on CUDA technology to match two templates from database but i cant get the ans correcty it shows 0.

Please help me i m using Gforce 9800 with windows 7 and visual studio 2008 and cuda 4.0

Here is the code

#include<stdio.h>

#include<stdlib.h>

#include<math.h>

#include <String.h>

#include <conio.h>

#include <cuda.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include "Load_one_D_Encoded_List.h"

#include "Match_transformed_List.h"

#include<iostream>

//using namespace System;

__global__ void cmatcher(const int *Input_Encoded_List,const int *Temp_Encoded_List,const int Input_no_of_min,const int Temp_no_of_min ,int *score_matrix,int *score_matrix_for_match_tranformed,int min_minutiae_in_input_temp,int tscore )

{

	

	int i, j, k, min_no_minutiae_match, score,

	no_of_neigbour,Input_Starting_Point=0,Temp_Starting_Point=0,ist;

	no_of_neigbour = 20 ;

	//score_matrix_for_match_tranformed = (int*)malloc(sizeof(int)*(no_of_neigbour*no_of_neigbour));

	//score_matrix = (int*) malloc (sizeof(int)*(Input_no_of_min*Temp_no_of_min));

	for( i = 0 ; i < Temp_no_of_min*Input_no_of_min ; i++)

		*(score_matrix+i) = 0;

	//Input_Starting_Point = 0;

	k = 0;

	i=blockIdx.x*blockDim.x+threadIdx.x;

	j=blockIdx.y*blockDim.y+threadIdx.y;

	atomicAdd(&tscore,1);

	if(j<Input_no_of_min && i<Temp_no_of_min)

	{

		ist=Input_Starting_Point+Input_no_of_min*3*j;

		score = Match_Transformed_List(Input_Encoded_List,Temp_Encoded_List,score_matrix_for_match_tranformed,ist,Temp_Starting_Point+Temp_no_of_min*3*i,no_of_neigbour);

		if( score > min_no_minutiae_match)

		{

				

				*(score_matrix+i+j) = score;

			}

				

	}

}

#include<math.h>

#include <cuda.h>

#include <sm_11_atomic_functions.h>

__device__ int Match_Transformed_List(const int* Input_List,const int* Temp_List,int *score_matrix,int Input_start_Point,int Temp_start_Point,const int no_of_neigbour)

{

	 int i,j,k,no_of_iteration,Input_Loop_ending, Temp_Loop_ending,ii,jj;

	 int score;

	score=0;

	//no_of_iteration = no_of_neigbour*no_of_neigbour;

 //   

	for( i=0 ; i < no_of_iteration ; i++)

		*(score_matrix+i) = 50;

	Input_Loop_ending = (Input_start_Point + 3*no_of_neigbour);

	Temp_Loop_ending = (Temp_start_Point + 3*no_of_neigbour);

	k=0;

	for ( i=Input_start_Point ; i < Input_Loop_ending ; i=i+3)

	{

		for( j=Temp_start_Point ; j < Temp_Loop_ending ; j=j+3 )

		{

			

			if ( ( abs( ( *(Input_List+i) )-( ( *(Temp_List+j) ) ) ) < 7 ) && 

				( abs( ( *(Input_List+i+1) )-( ( *(Temp_List+j+1) ) ) ) < 10 ||

				abs( ( *(Input_List+i+1) )-( ( *(Temp_List+j+1) ) ) ) > 350 ) &&

			    ( abs( ( *(Input_List+i+2) )-( ( *(Temp_List+j+2) ) ) ) < 13 ||

				abs( ( *(Input_List+i+2) )-( ( *(Temp_List+j+2) ) ) ) > 347  ) )

			{

				*(score_matrix+k) = 1;

				atomicAdd(&score,1);

				//nmatch+=1;

				//atomicAdd(&nmatch,1);

			}

			

			

			atomicAdd(&k,1);

		}

	}

	

	int match_row=0,match_col=0,row_index_sum=0,col_index_sum=0;

	for( ii = 0 ; ii < no_of_neigbour ; ii++)

	{

		for ( jj = 0 ; jj < no_of_neigbour ; jj++)

		{

			col_index_sum = row_index_sum+jj;

			if( *(score_matrix+col_index_sum) <50 )

			{ 

				atomicAdd(&match_row,1);

				//break;

			}

		}

		atomicAdd(&row_index_sum,no_of_neigbour);

	}

	//row_index_sum = 0;

	//col_index_sum = 0;

	for( ii = 0 ; ii < no_of_neigbour ; ii++)

	{	

		col_index_sum = ii;

		for ( jj = 0 ; jj < no_of_neigbour ; jj++)

		{

			

			if( *(score_matrix+col_index_sum) <50 )

			{ 

				atomicAdd(&match_col,1);

				//break;

			}

			atomicAdd(&col_index_sum ,no_of_neigbour);

		}

		//row_index_sum +=(i+1);

	}

	score = match_row;

	if( match_row  > match_col)

		score = match_col;

	return score;

}

int Matcher(int *Input_Encoded_List,int *Temp_Encoded_List,const int Input_no_of_min,const int Temp_no_of_min)

{

	int i, j, k, min_minutiae_in_input_temp, min_no_minutiae_match, score,

	no_of_neigbour,*score_matrix,Input_Starting_Point,Temp_Starting_Point,mmatch=0,*score_matrixfor,input,temp;

	float score_divide;

	 int no_of_element1;int no_of_element;

	int size1,size2;

	no_of_neigbour = 20 ;

	min_minutiae_in_input_temp = Input_no_of_min;

	if(Temp_no_of_min < Input_no_of_min)

		min_minutiae_in_input_temp = Temp_no_of_min;

	if(min_minutiae_in_input_temp < no_of_neigbour)

		no_of_neigbour = min_minutiae_in_input_temp;

		 

		

	int tscore;

	

	score_matrix = (int*) malloc (sizeof(int)*(Input_no_of_min*Temp_no_of_min));

	

	for( i=0 ; i < Input_no_of_min*Temp_no_of_min ; i++)

		*(score_matrix+i) = 0;

		

		input=Input_no_of_min;

		 temp=Temp_no_of_min;

		 

		int *score_matrix_for_match_tranformed;

	 (cudaMalloc((void**)&score_matrix_for_match_tranformed,sizeof(int)*(400) ));

	 (cudaMemset(score_matrix_for_match_tranformed,50,sizeof(int)*(no_of_neigbour*no_of_neigbour)));

	int *score_matrix1;

		 (cudaMalloc((void**)&score_matrix1,sizeof(int)*(input*temp) ));

	 (cudaMemset(score_matrix1,0,sizeof(int)*(input*temp)));

	min_no_minutiae_match = (int)(8.0*(1-exp(-(float)min_minutiae_in_input_temp/20.0))+0.5);

	no_of_element = (input)*(input)*3;

				 size1 = (no_of_element);

				

				 no_of_element1 = (temp)*(temp)*3;

				 

				size2 = (no_of_element1);

				int *d_input,*d_temp;

				 ( cudaMalloc((void**)&d_input,sizeof(int)*size1) );

				 ( cudaMalloc((void**)&d_temp,sizeof(int)*size2) );

				 ( cudaMemcpy(d_input, Input_Encoded_List, sizeof(int)*size1, cudaMemcpyHostToDevice) );

				 ( cudaMemcpy(d_temp, Temp_Encoded_List, sizeof(int)*size2, cudaMemcpyHostToDevice) );

				dim3 gdim;

				dim3 bdim;

				bdim.x=16;

				bdim.y=16;

				bdim.z=1;

				gdim.x=(Temp_no_of_min+bdim.x-1)/bdim.x;

				gdim.y=(Input_no_of_min+bdim.y-1)/bdim.y;

				gdim.z=1;

	cmatcher<<<gdim,bdim>>>(d_input,d_temp,input,temp,score_matrix1,score_matrix_for_match_tranformed,min_minutiae_in_input_temp,tscore );

	 ( cudaMemcpy(score_matrix,score_matrix1,sizeof(int)*(input*temp), cudaMemcpyDeviceToHost) );

	cudaFree(d_temp);

	cudaFree(d_input);

	cudaFree(score_matrix1);

	//for(i=0;i<400;i++)

	printf("\t%d\t",tscore);

    score = maximum_score_dr_imtiaz(score_matrix,input,temp);

	score_divide = ((score*100.0)/min_minutiae_in_input_temp+0.5);

	score = (int)(score_divide);

	free(score_matrix);

	return score;

}

Please help me the score is not incrementing and the input and temp are the two lists.

1_2_Encoder.txt (45.6 KB)

tera · June 21, 2011, 9:17am

Check all return codes from CUDA calls for errors.

hyqneuron · June 21, 2011, 3:12pm

Seriously I think someone should put this message as sticky on the top of the forum. External Image

nahseez · June 22, 2011, 11:05am

i used safe calls but it gives error could you tell me the header file that i have to include for this.and also i am not able to debug it the nvidia nsight profiler doesn’t connect to the visual studio.

tera · June 22, 2011, 11:13am

There is no specific include. Usually you just put some definition like

#define CUDA_CALL(x) {cudaError_t cuda_error__ = (x); if (cuda_error__) printf(#x " returned \"%s\"\n", cudaGetErrorString(cuda_error__));}

I’m not on Windows so I can’t help with the second question.

nahseez · June 23, 2011, 7:26am

There is no specific include. Usually you just put some definition like
#define CUDA_CALL(x) {cudaError_t cuda_error__ = (x); if (cuda_error__) printf(#x " returned \"%s\"\n", cudaGetErrorString(cuda_error__));}
I’m not on Windows so I can’t help with the second question.

i use the function

void checkCUDAError(const char *msg)

{

cudaError_t err = cudaGetLastError();

// get the last error that has

// been produced by any of the

// runtime API calls

if( cudaSuccess != err)

// cudaSuccess is an enum indicating no errors.

{

fprintf(stderr, “Cuda error: %s: %s.\n”, msg, cudaGetErrorString( err));

// cudaGetErrorString gives the message string from an error code.

system(“pause”);

exit(EXIT_FAILURE);

}

and call the function after any cuda call but it gives error

cuda error:memcpy:invalid device function

please Help me.