The code doesn't work for big arrays. The kernel returns with unassigned array values.

It works (returns values) when the cin input is smaller than xa=176, ya=176.
The second kernel doesn't work when the cin input is xa=200, ya=200.
Both kernels don't work when the cin input is xa=300, ya=300.

I need this code to work for all values up to xa=10000, ya=10000.

This is the beginning of my convolutional neural network.
The code should apply a 3x3 filter convolution to the flattened matrix h_a and return the result in the flattened matrix h_b, which is smaller by 2 in each dimension (x-2, y-2).

Then it should perform a pooling operation that slides a 2x2 window over the flattened matrix h_a, takes the maximum value in each window, and again produces a flattened matrix h_b that is smaller by 2 in each dimension (x-2, y-2). An illustrative sketch of the indexing follows below.
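
As an illustration of the intended index mapping (a single-channel CPU sketch for the convolution, not part of the original code), one output value of the valid 3x3 convolution is:

float ConvolvePixel(const float *a, const float *f, int xa, int x, int y)
{
	// a is the flattened xa*ya input, f the flattened 3x3 filter;
	// the output has size (xa - 2) * (ya - 2), so 0 <= x < xa - 2
	// and 0 <= y < ya - 2.
	float sum = 0.0f;

	for (int i = 0; i < 3; i++)
		for (int j = 0; j < 3; j++)
			sum += a[(y + i) * xa + (x + j)] * f[i * 3 + j];

	return sum;
}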

Main CPU function:

int main() 
{
	srand(time(NULL));

	int xa = 1000;
	int ya = 1000;

	cin >> xa;
	cin >> ya;

	int xb = xa - 2;
	int yb = ya - 2;

	int na = xa * ya;
	int nb = xb * yb;

	int sizeA = sizeof(float3) * na;
	int sizeB = sizeof(float3) * nb;

	int fx = 3;
	int fy = 3;
	int fn = fx * fy;

	int sizeF = sizeof(float3) * fn;
	int sizeI = sizeof(int);

	float3 *h_a;
	float3 *h_b;

	float3 *h_f0 = Create3x3Filter(0, -1, 0, -1, 4, -1, 0, -1, 0);
	float3 *h_f1 = Create3x3Filter(-1, -1, -1, -1, 8, -1, -1, -1, -1);

	float3 *d_a, *d_b;
	float3 *d_f0, *d_f1;

	int *d_xa, *d_ya, *d_nb;

	cudaMalloc((void **)&d_a, sizeA);
	cudaMalloc((void **)&d_b, sizeB);

	cudaMalloc((void **)&d_f0, sizeF);

	cudaMalloc((void **)&d_xa, sizeI);
	cudaMalloc((void **)&d_ya, sizeI);
	cudaMalloc((void **)&d_nb, sizeI);

	h_a = (float3 *)malloc(sizeA);
	h_b = (float3 *)malloc(sizeB);

	for (int i = 0; i < na; i++)
	{
		h_a[i].x = rand() % 256;
		h_a[i].y = rand() % 256;
		h_a[i].z = rand() % 256;
	}

	cudaMemcpy(d_a, h_a, sizeA, cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, h_b, sizeB, cudaMemcpyHostToDevice);
	cudaMemcpy(d_f0, h_f0, sizeF, cudaMemcpyHostToDevice);

	cudaMemcpy(d_xa, &xa, sizeI, cudaMemcpyHostToDevice);
	cudaMemcpy(d_ya, &ya, sizeI, cudaMemcpyHostToDevice);
	cudaMemcpy(d_nb, &nb, sizeI, cudaMemcpyHostToDevice);

	printf("Convolution\n");
	
	Convolution<<<nb / 512 + 1, 512>>>(d_a, d_xa, d_ya, d_f0, d_b, d_nb);

	cudaMemcpy(h_b, d_b, sizeB, cudaMemcpyDeviceToHost);

	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_f0);

	cudaFree(d_xa);
	cudaFree(d_ya);
	cudaFree(d_nb);

	printf("A:\n");
	for (int i = 0; i < 9; i++)
	{
		cout << h_a[i].x << "\n";
	}

	printf("B:\n");
	for (int i = 0; i < 9; i++)
	{
		cout << h_b[i].x << "\n";
	}

	xa = xb;
	ya = yb;

	xb = xa - 2;
	yb = ya - 2;

	na = xa * ya;
	nb = xb * yb;

	sizeA = sizeof(float3) * na;
	sizeB = sizeof(float3) * nb;

	h_a = (float3 *)malloc(sizeA);

	h_a = h_b;

	h_b = (float3 *)malloc(sizeB);

	printf("A:\n");
	for (int i = 0; i < 9; i++)
	{
		cout << h_a[i].x << "\n";
	}

	cudaMalloc((void **)&d_a, sizeA);
	cudaMalloc((void **)&d_b, sizeB);

	cudaMalloc((void **)&d_xa, sizeI);
	cudaMalloc((void **)&d_ya, sizeI);
	cudaMalloc((void **)&d_nb, sizeI);

	cudaMemcpy(d_a, h_a, sizeA, cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, h_b, sizeB, cudaMemcpyHostToDevice);

	cudaMemcpy(d_xa, &xa, sizeI, cudaMemcpyHostToDevice);
	cudaMemcpy(d_ya, &ya, sizeI, cudaMemcpyHostToDevice);
	cudaMemcpy(d_nb, &nb, sizeI, cudaMemcpyHostToDevice);

	printf("Pooling\n");

	MaxPooling<<<nb / 512 + 1, 512 >>>(d_a, d_xa, d_ya, d_b, d_nb);

	cudaMemcpy(h_b, d_b, sizeB, cudaMemcpyDeviceToHost);

	cudaFree(d_a);
	cudaFree(d_b);

	cudaFree(d_xa);
	cudaFree(d_ya);
	cudaFree(d_nb);

	printf("B:\n");
	for (int i = 0; i < 9; i++)
	{
		cout << h_b[i].x << "\n";
	}
	
	free(h_a);
	free(h_b);

	return 0;
}

Convolution kernel:

__global__ void Convolution(float3 *a, int *ax, int *ay, float3 *f, float3 *b, int *count)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;

	if (id < *count)
	{
		float3 *aPart = new float3[9];

		int idxb = id % (*ax - 2);
		int idyb = id / (*ay - 2);

		for (int i = 0; i < 3; i++)
		{
			for (int j = 0; j < 3; j++)
			{
				aPart[i * 3 + j] = a[*ax * (idyb + i) + j + idxb];
			}
		}

		Float3Multiply<<<1, 9>>>(aPart, f);

		b[id] = Float3FilterSum(aPart, 9);
	}
}

Sum and multiply functions:

__global__ void Float3Multiply(float3 *a, float3 *b)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;

	if (id < 9)
	{
		a[id].x *= b[id].x;
		a[id].y *= b[id].y;
		a[id].z *= b[id].z;
	}
}

__device__ float3 Float3FilterSum(float3 *a, int n)
{
	float3 sum;

	for (int i = 0; i < n; i++)
	{
		sum.x += a[i].x;
		sum.y += a[i].y;
		sum.z += a[i].z;
	}

	return sum;
}

Pooling kernel:

__global__ void MaxPooling(float3 *a, int *ax, int *ay, float3 *b, int *count)
{
	int id = blockIdx.x * blockDim.x + threadIdx.x;

	if (id < *count)
	{
		float3 *aPart = new float3[4];

		int idxb = id % (*ax - 2);
		int idyb = id / (*ay - 2);

		for (int i = 0; i < 2; i++)
		{
			for (int j = 0; j < 2; j++)
			{
				aPart[i * 2 + j] = a[*ax * (idyb + i) + j + idxb];
			}
		}		

		float3 high = aPart[0];

		for (int i = 1; i < 4; i++)
		{
			if (aPart[i].x > high.x)
			{
				high.x = aPart[i].x;
			}

			if (aPart[i].y > high.y)
			{
				high.y = aPart[i].y;
			}

			if (aPart[i].z > high.z)
			{
				high.z = aPart[i].z;
			}
		}

		b[id] = high;
	}
}

Helper that assigns the same float to all three components of a float3:

float3 Samefloat3(float a)
{
	float3 Sf3;
	Sf3.x = a;
	Sf3.y = a;
	Sf3.z = a;

	return Sf3;
}
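
Create3x3Filter is called in main but was not included in the post; a plausible definition (an assumption about the missing helper, reusing Samefloat3) would be:

float3 *Create3x3Filter(float a, float b, float c,
                        float d, float e, float f,
                        float g, float h, float i)
{
	// Hypothetical reconstruction: build a 3x3 host-side filter with
	// the same weight in all three colour channels of each cell.
	float3 *filter = (float3 *)malloc(sizeof(float3) * 9);
	float v[9] = { a, b, c, d, e, f, g, h, i };

	for (int k = 0; k < 9; k++)
		filter[k] = Samefloat3(v[k]);

	return filter;
}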

You have to do error checking by testing the return code of the CUDA API calls. Do a search for “how to do CUDA error checking”.
Then use cuda-memcheck to see if any operation goes out of an array's bounds.
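
A common pattern for the error checking (one possible sketch, not the only way) is a macro that wraps every runtime API call, plus an explicit check after each kernel launch:

#define CUDA_CHECK(call) \
	do { \
		cudaError_t err = (call); \
		if (err != cudaSuccess) { \
			fprintf(stderr, "CUDA error %s at %s:%d\n", \
				cudaGetErrorString(err), __FILE__, __LINE__); \
			exit(EXIT_FAILURE); \
		} \
	} while (0)

// Usage on the existing calls:
CUDA_CHECK(cudaMalloc((void **)&d_a, sizeA));

Convolution<<<nb / 512 + 1, 512>>>(d_a, d_xa, d_ya, d_f0, d_b, d_nb);
CUDA_CHECK(cudaGetLastError());       // catches bad launch configurations
CUDA_CHECK(cudaDeviceSynchronize());  // surfaces errors raised while the kernel runs

Then run the built executable (the name here is only a placeholder) under cuda-memcheck:

cuda-memcheck ./your_program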

I found the solution!

The wrong code: both kernels allocated their local array with new:

float3 *aPart = new float3[9];

float3 *aPart = new float3[4];

The solution: the arrays are declared as fixed-size local arrays at the start of each kernel:

float3 aPart[9];

float3 aPart[4];
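
The most likely reason the new[] version breaks for larger inputs is that in-kernel new allocates from the device heap, which defaults to only 8 MB (cudaLimitMallocHeapSize); with tens of thousands of threads each allocating (and never calling delete[]), the heap runs out, new returns nullptr, and the kernel faults, leaving b unassigned. A fixed-size local array avoids the device heap entirely. For reference, here is a sketch of the Convolution kernel after the change, with the per-element multiply and the sum folded directly into the kernel instead of the nested Float3Multiply launch (a simplification of the original structure; it also derives idyb from the output width *ax - 2, which only matters for non-square inputs):

__global__ void Convolution(float3 *a, int *ax, int *ay, float3 *f, float3 *b, int *count)
{
	// ay is unused in this sketch; it is kept only to match the original signature.
	int id = blockIdx.x * blockDim.x + threadIdx.x;

	if (id < *count)
	{
		// Fixed-size local array: no device-heap allocation involved.
		float3 aPart[9];

		int idxb = id % (*ax - 2);
		int idyb = id / (*ax - 2);

		for (int i = 0; i < 3; i++)
			for (int j = 0; j < 3; j++)
				aPart[i * 3 + j] = a[*ax * (idyb + i) + j + idxb];

		// Multiply by the filter and accumulate in one pass.
		float3 sum = make_float3(0.0f, 0.0f, 0.0f);

		for (int k = 0; k < 9; k++)
		{
			sum.x += aPart[k].x * f[k].x;
			sum.y += aPart[k].y * f[k].y;
			sum.z += aPart[k].z * f[k].z;
		}

		b[id] = sum;
	}
}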