Question, Algoritm Help with kernel invocation

Hello, I have problem with my algorithms I can not work well with threads. I got a problem, publish a comparison of a sorting method in C with a sorting method in CUDA. Following is the algorithm that tried to use CUDA, but he did not compile anymore. I need help or tips on how to work with the threads inside the sorting method. This model was compiling but I returned to the numbers entered. Algorithm:

Thank you in advance.

#include<stdlib.h>

#include<stdio.h>

void __global__ insertion(int *v, int tam){

	int i = blockDim.x * blockIdx.x + threadIdx.x;

	int j = blockDim.y * blockIdx.y + threadIdx.y;

	int a;

	if(i<=tam){

	a = v[i];

	j = i;

	while(v[j-1] > a){

	v[j] = v[j-1]; 

	j--;

}

	v[j] = a;	

		

	}

}

int main(void){

	

	int tamanho;

	int *h_vetor;

	int i,j;

	int *d_vetor;

	printf("Digite o tamanho do vetor: ");

	scanf("%d",&tamanho);

	h_vetor = (int*)malloc(sizeof(int));

	

	cudaMalloc((void**)&d_vetor,sizeof(int));

	

	

	for(i=0;i<tamanho;i++){

		printf("Digite os numeros:");

		scanf("%d",&h_vetor[i]);

	}

	cudaMemcpy(d_vetor,h_vetor,tamanho,cudaMemcpyHostToDevice);

			

	dim3 dimBlock(16,16);

	dim3 dimGrid((tamanho * dimBlock.x)/dimBlock.x,(tamanho * dimBlock.y)/dimBlock.y);

	

	insertion<<<dimGrid,dimBlock>>>(d_vetor,tamanho);	

	

	cudaMemcpy(h_vetor,d_vetor,tamanho,cudaMemcpyDeviceToHost);

	

	printf("Vetor ordenado: ");

	for(j=0;j<tamanho;j++){

		

		printf(" [ %d ] ",h_vetor[j]);

	}

	

	return 0;

}

Hello, I have problem with my algorithms I can not work well with threads. I got a problem, publish a comparison of a sorting method in C with a sorting method in CUDA. Following is the algorithm that tried to use CUDA, but he did not compile anymore. I need help or tips on how to work with the threads inside the sorting method. This model was compiling but I returned to the numbers entered. Algorithm:

Thank you in advance.

#include<stdlib.h>

#include<stdio.h>

void __global__ insertion(int *v, int tam){

	int i = blockDim.x * blockIdx.x + threadIdx.x;

	int j = blockDim.y * blockIdx.y + threadIdx.y;

	int a;

	if(i<=tam){

	a = v[i];

	j = i;

	while(v[j-1] > a){

	v[j] = v[j-1]; 

	j--;

}

	v[j] = a;	

		

	}

}

int main(void){

	

	int tamanho;

	int *h_vetor;

	int i,j;

	int *d_vetor;

	printf("Digite o tamanho do vetor: ");

	scanf("%d",&tamanho);

	h_vetor = (int*)malloc(sizeof(int));

	

	cudaMalloc((void**)&d_vetor,sizeof(int));

	

	

	for(i=0;i<tamanho;i++){

		printf("Digite os numeros:");

		scanf("%d",&h_vetor[i]);

	}

	cudaMemcpy(d_vetor,h_vetor,tamanho,cudaMemcpyHostToDevice);

			

	dim3 dimBlock(16,16);

	dim3 dimGrid((tamanho * dimBlock.x)/dimBlock.x,(tamanho * dimBlock.y)/dimBlock.y);

	

	insertion<<<dimGrid,dimBlock>>>(d_vetor,tamanho);	

	

	cudaMemcpy(h_vetor,d_vetor,tamanho,cudaMemcpyDeviceToHost);

	

	printf("Vetor ordenado: ");

	for(j=0;j<tamanho;j++){

		

		printf(" [ %d ] ",h_vetor[j]);

	}

	

	return 0;

}