Code don’t work for big arrays. Kernel return with unasigned array values.
It work (return values) when cin input is smaller then: xa=176, ya=176
Second kernel don’t work when cin input is: xa=200, ya=200
Both kernels don’t work when cin input is: xa=300, ya=300
I need this code work for all values up to xa=10000, ya=10000
This is begin of my Convolution Neural Net.
This code should make filter 3x3 convolution of flaten matrix h_a and return result h_b flaten matrix smaller with x-2,y-2
Then make Polling operation choose max value from 2x2 matrix sliding flaten matrix h_a and creating a flaten matrix h_b smaller with x-2,y-2
Main cpu function:
int main()
{
srand(time(NULL));
int xa = 1000;
int ya = 1000;
cin >> xa;
cin >> ya;
int xb = xa - 2;
int yb = ya - 2;
int na = xa * ya;
int nb = xb * yb;
int sizeA = sizeof(float3) * na;
int sizeB = sizeof(float3) * nb;
int fx = 3;
int fy = 3;
int fn = fx * fy;
int sizeF = sizeof(float3) * fn;
int sizeI = sizeof(int);
float3 *h_a;
float3 *h_b;
float3 *h_f0 = Create3x3Filter(0, -1, 0, -1, 4, -1, 0, -1, 0);
float3 *h_f1 = Create3x3Filter(-1, -1, -1, -1, 8, -1, -1, -1, -1);
float3 *d_a, *d_b;
float3 *d_f0, *d_f1;
int *d_xa, *d_ya, *d_nb;
cudaMalloc((void **)&d_a, sizeA);
cudaMalloc((void **)&d_b, sizeB);
cudaMalloc((void **)&d_f0, sizeF);
cudaMalloc((void **)&d_xa, sizeI);
cudaMalloc((void **)&d_ya, sizeI);
cudaMalloc((void **)&d_nb, sizeI);
h_a = (float3 *)malloc(sizeA);
h_b = (float3 *)malloc(sizeB);
for (int i = 0; i < na; i++)
{
h_a[i].x = rand() % 256;
h_a[i].y = rand() % 256;
h_a[i].z = rand() % 256;
}
cudaMemcpy(d_a, h_a, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, sizeB, cudaMemcpyHostToDevice);
cudaMemcpy(d_f0, h_f0, sizeF, cudaMemcpyHostToDevice);
cudaMemcpy(d_xa, &xa, sizeI, cudaMemcpyHostToDevice);
cudaMemcpy(d_ya, &ya, sizeI, cudaMemcpyHostToDevice);
cudaMemcpy(d_nb, &nb, sizeI, cudaMemcpyHostToDevice);
printf("Convolution\n");
Convolution<<<nb / 512 + 1, 512>>>(d_a, d_xa, d_ya, d_f0, d_b, d_nb);
cudaMemcpy(h_b, d_b, sizeB, cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_f0);
cudaFree(d_xa);
cudaFree(d_ya);
cudaFree(d_nb);
printf("A:\n");
for (int i = 0; i < 9; i++)
{
cout << h_a[i].x << "\n";
}
printf("B:\n");
for (int i = 0; i < 9; i++)
{
cout << h_b[i].x << "\n";
}
xa = xb;
ya = yb;
xb = xa - 2;
yb = ya - 2;
na = xa * ya;
nb = xb * yb;
sizeA = sizeof(float3) * na;
sizeB = sizeof(float3) * nb;
h_a = (float3 *)malloc(sizeA);
h_a = h_b;
h_b = (float3 *)malloc(sizeB);
printf("A:\n");
for (int i = 0; i < 9; i++)
{
cout << h_a[i].x << "\n";
}
cudaMalloc((void **)&d_a, sizeA);
cudaMalloc((void **)&d_b, sizeB);
cudaMalloc((void **)&d_xa, sizeI);
cudaMalloc((void **)&d_ya, sizeI);
cudaMalloc((void **)&d_nb, sizeI);
cudaMemcpy(d_a, h_a, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, sizeB, cudaMemcpyHostToDevice);
cudaMemcpy(d_xa, &xa, sizeI, cudaMemcpyHostToDevice);
cudaMemcpy(d_ya, &ya, sizeI, cudaMemcpyHostToDevice);
cudaMemcpy(d_nb, &nb, sizeI, cudaMemcpyHostToDevice);
printf("Pooling\n");
MaxPooling<<<nb / 512 + 1, 512 >>>(d_a, d_xa, d_ya, d_b, d_nb);
cudaMemcpy(h_b, d_b, sizeB, cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_xa);
cudaFree(d_ya);
cudaFree(d_nb);
printf("B:\n");
for (int i = 0; i < 9; i++)
{
cout << h_b[i].x << "\n";
}
free(h_a);
free(h_b);
return 0;
}
Convolution kernel:
__global__ void Convolution(float3 *a, int *ax, int *ay, float3 *f, float3 *b, int *count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < *count)
{
float3 *aPart = new float3[9];
int idxb = id % (*ax - 2);
int idyb = id / (*ay - 2);
for (int i = 0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
aPart[i * 3 + j] = a[*ax * (idyb + i) + j + idxb];
}
}
Float3Multiply<<<1, 9>>>(aPart, f);
b[id] = Float3FilterSum(aPart, 9);
}
}
Sum and multiply functions:
__global__ void Float3Multiply(float3 *a, float3 *b)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < 9)
{
a[id].x *= b[id].x;
a[id].y *= b[id].y;
a[id].z *= b[id].z;
}
}
__device__ float3 Float3FilterSum(float3 *a, int n)
{
float3 sum;
for (int i = 0; i < n; i++)
{
sum.x += a[i].x;
sum.y += a[i].y;
sum.z += a[i].z;
}
return sum;
}
Pooling kernel:
__global__ void MaxPooling(float3 *a, int *ax, int *ay, float3 *b, int *count)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id < *count)
{
float3 *aPart = new float3[4];
int idxb = id % (*ax - 2);
int idyb = id / (*ay - 2);
for (int i = 0; i < 2; i++)
{
for (int j = 0; j < 2; j++)
{
aPart[i * 2 + j] = a[*ax * (idyb + i) + j + idxb];
}
}
float3 high = aPart[0];
for (int i = 1; i < 4; i++)
{
if (aPart[i].x > high.x)
{
high.x = aPart[i].x;
}
if (aPart[i].y > high.y)
{
high.y = aPart[i].y;
}
if (aPart[i].z > high.z)
{
high.z = aPart[i].z;
}
}
b[id] = high;
}
}
Assign float3 same floats:
float3 Samefloat3(float a)
{
float3 Sf3;
Sf3.x = a;
Sf3.y = a;
Sf3.z = a;
return Sf3;
}