kernal is not called

Hi I am new to cuda programming. Currently I am working on program that has three dimensional array that should be accessed in parallel. I wrote the code but I am getting the output as zero and not showing any error. Please help me,I am out of time.

This is my code

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <conio.h>
#include <time.h>
#include <string.h>
#include <malloc.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

__global__ void similarity(int *d_sim,int *d_bsim)
{
	//Variable declarations

	unsigned int idx = 100*blockDim.x+ threadIdx.x;
	unsigned int idy = 100*blockDim.y + threadIdx.y;
	unsigned int idz = 2000*blockDim.z + threadIdx.z;
	int a = 100;
	int b = 100;
	int s = 2000;
	int score = 0;
	int finScore = 0;
	*d_bsim = 0;
	
	if (idx > (a*b))
	{
		return;
	}

	
	//BlockSimilarity Score calculating function
	
	for (int i = 0; i < 100; i++)
	{
		for (int j = i + 1; j < 100; j++)
		{
			for (int k = 0; k < s; k++)
			{
				if (k == 0)
				{
					if (d_sim[idx*i + idy*j + k] == 1)
					{
						score++;
					}
				}
				else
				{
					if (d_sim[idx*i + idy*j + (k + 1)] != NULL)
					{
						if (d_sim[idx*i + idy*j + (k - 1)] == 1 && d_sim[idx*i + idy*j + (k)] == 1 && d_sim[idx*i + idy*j + (k + 1)])
						{
							score++;
						}
						else
						{
							if (score > 5)
							{
								finScore += score;
								score = 0;
							}
							else
							{
								score = 0;
							}
						}
					}
				}

			}
			*(d_bsim+idx*i + idy) = finScore;
			printf("%d\n", finScore);
		}
	}	
	
}

int main()
{
	FILE *fp;
	fp = fopen("C:\Users\Thiyagarajan R\Documents\NetBeansProjects\ProteinSequences\output.txt", "r");
	char str[2000];
	char str1[100][2000];
	int simf[100][100];
	int *sim = (int*)malloc(100 * 900 * 1500 * sizeof(int));
	int *bsim = (int*)malloc(100 * 100 * sizeof(int));
	//int *line[5000][2000];
	int line_number = 0;
	int x, y, s;
	x = 100;
	y = 100;
	s = 2000;
	int i = 0;

	//Kernal Memory giving
	int *d_sim, *d_bsim;

	//Allocate Memory to GPU
	cudaMalloc((	void	**) &d_sim, 100 * 900 *1500*	sizeof(int));
	cudaMalloc((void**)&d_bsim, 100 * 900 * sizeof(int));

	/* opening file for reading */
	
	if (fp == NULL)
	{
		perror("Error opening file");
		return(-1);
	}
	clock_t begin = clock();
	
	while (NULL != fgets(str, 2000, fp))
	{
		strcpy(str1[i], str);
		printf("%d: %s\n", ++line_number, str);
		i=++line_number;
		if (i==100)
		break;
	}
	for (int i = 0; str1[0][i] != '

#include “cuda_runtime.h”
#include “device_launch_parameters.h”

#include <stdio.h>
#include <conio.h>
#include <time.h>
#include <string.h>
#include <malloc.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

global void addKernel(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}

global void similarity(int *d_sim,int *d_bsim)
{
//Variable declarations

unsigned int idx = 100*blockDim.x+ threadIdx.x;
unsigned int idy = 100*blockDim.y + threadIdx.y;
unsigned int idz = 2000*blockDim.z + threadIdx.z;
int a = 100;
int b = 100;
int s = 2000;
int score = 0;
int finScore = 0;
*d_bsim = 0;

if (idx > (a*b))
{
	return;
}


//BlockSimilarity Score calculating function

for (int i = 0; i < 100; i++)
{
	for (int j = i + 1; j < 100; j++)
	{
		for (int k = 0; k < s; k++)
		{
			if (k == 0)
			{
				if (d_sim[idx*i + idy*j + k] == 1)
				{
					score++;
				}
			}
			else
			{
				if (d_sim[idx*i + idy*j + (k + 1)] != NULL)
				{
					if (d_sim[idx*i + idy*j + (k - 1)] == 1 && d_sim[idx*i + idy*j + (k)] == 1 && d_sim[idx*i + idy*j + (k + 1)])
					{
						score++;
					}
					else
					{
						if (score > 5)
						{
							finScore += score;
							score = 0;
						}
						else
						{
							score = 0;
						}
					}
				}
			}

		}
		*(d_bsim+idx*i + idy) = finScore;
		printf("%d\n", finScore);
	}
}	

}

int main()
{
FILE *fp;
fp = fopen(“C:\Users\Thiyagarajan R\Documents\NetBeansProjects\ProteinSequences\output.txt”, “r”);
char str[2000];
char str1[100][2000];
int simf[100][100];
int sim = (int)malloc(100 * 900 * 1500 * sizeof(int));
int bsim = (int)malloc(100 * 100 * sizeof(int));
//int *line[5000][2000];
int line_number = 0;
int x, y, s;
x = 100;
y = 100;
s = 2000;
int i = 0;

//Kernal Memory giving
int *d_sim, *d_bsim;

//Allocate Memory to GPU
cudaMalloc((	void	**) &d_sim, 100 * 900 *1500*	sizeof(int));
cudaMalloc((void**)&d_bsim, 100 * 900 * sizeof(int));

/* opening file for reading */

if (fp == NULL)
{
	perror("Error opening file");
	return(-1);
}
clock_t begin = clock();

while (NULL != fgets(str, 2000, fp))
{
	strcpy(str1[i], str);
	printf("%d: %s\n", ++line_number, str);
	i=++line_number;
	if (i==100)
	break;
}
for (int i = 0; str1[0][i] != '\0'; i++)

{
	int x = (int)str1[0][i];
	printf("%d", x);
}

for (i = 0; i<100; i++)
{
	for (int k = i + 1; k<100; k++)
	{
		for (int s = 0; s<2000; s++)
		{
			if (str1[i][s] == str1[k][s])
			{
				*(sim + i*x *x+ y*k + s) = 1;
			}
			else
			{
				*(sim + i*x *x+ y*k + s) = 0;
			}
		//	printf("%d",*(sim+i*x*x+y*k+s));
		}
		printf("k\n");
	}
	printf("\n");
}

//CudaMemory copy

cudaMemcpy(d_sim, sim, 100 * 100 * 2000 * sizeof(int), cudaMemcpyHostToDevice);

//call the kernel

similarity << <1024, 1024 >> >(d_sim, d_bsim);

//CudaMemory copy 

cudaMemcpy(bsim, d_bsim, 100 * 100 * sizeof(int), cudaMemcpyDeviceToHost);

printf("\n%s\n", cudaGetErrorString(cudaGetLastError()));
for (int i = 0; i < 100; i++)
{
	for (int j = i + 1; j < 100; j++)
	{
		simf[i][j] = *(bsim + i * 100 + j);
		//printf("\n%d\t", simf[i][j]);
	}
	//printf("\n");
}

cudaFree(d_sim);
cudaFree(d_bsim);
fclose(fp);
//free(str1);
return 0;

}

'; i++)

	{
		int x = (int)str1[0][i];
		printf("%d", x);
	}

	for (i = 0; i<100; i++)
	{
		for (int k = i + 1; k<100; k++)
		{
			for (int s = 0; s<2000; s++)
			{
				if (str1[i][s] == str1[k][s])
				{
					*(sim + i*x *x+ y*k + s) = 1;
				}
				else
				{
					*(sim + i*x *x+ y*k + s) = 0;
				}
			//	printf("%d",*(sim+i*x*x+y*k+s));
			}
			printf("k\n");
		}
		printf("\n");
	}

	//CudaMemory copy

	cudaMemcpy(d_sim, sim, 100 * 100 * 2000 * sizeof(int), cudaMemcpyHostToDevice);

	//call the kernel

	similarity << <1024, 1024 >> >(d_sim, d_bsim);

	//CudaMemory copy 

	cudaMemcpy(bsim, d_bsim, 100 * 100 * sizeof(int), cudaMemcpyDeviceToHost);

	printf("\n%s\n", cudaGetErrorString(cudaGetLastError()));
	for (int i = 0; i < 100; i++)
	{
		for (int j = i + 1; j < 100; j++)
		{
			simf[i][j] = *(bsim + i * 100 + j);
			//printf("\n%d\t", simf[i][j]);
		}
		//printf("\n");
	}

	cudaFree(d_sim);
	cudaFree(d_bsim);
	fclose(fp);
	//free(str1);
	return 0;
   }

It may not be perfect code but I has to improve it a lot when the function works correctly. Please find the error. Thanks in advance!!