You need a basic computation that looks like this:
/* Naive 3x3 convolution over the interior of the image; the one-pixel
 * border is skipped so every kernel tap stays inside the picture. */
for (int x = 1; x < width - 1; x++) {
    for (int y = 1; y < height - 1; y++) {
        int acc = 0;
        for (int cx = 0; cx < 3; cx++) {
            for (int cy = 0; cy < 3; cy++) {
                /* Kernel tap (cx, cy) weights the neighbour at offset
                 * (cx - 1, cy - 1) from the centre pixel. */
                acc += inpic[x + cx - 1][y + cy - 1] * matrix[cx][cy];
            }
        }
        outpic[x][y] = acc / normalise;
    }
}
This is very inefficient, though, because it recomputes the array offsets for every pixel; it is much better to manipulate pointers directly and unroll
the inner loop. Also watch out for overflow corrupting the arithmetic: make sure that intermediate computations use more bits than the input and output data.
Overflow is probably why you’re getting a black picture.
It is not too hard. Here is a 5 x 5 sample with a CUDA-friendly interface, though it is not yet set up to run on the GPU.
It takes an 8-bit RGB picture and generates either a 1-byte-per-pixel or an RGB grayscale transformed image. The inner loop is already unrolled
for speed, so the code is tedious but fast.
/* Weighted sum for one kernel row: five consecutive 3-byte pixels starting
 * at px, each pixel's three channel bytes added together and multiplied by
 * the matching coefficient in coef[0..4]. All arithmetic is done in int. */
static int convolve5x_row(const guint8 *px, const gint16 *coef)
{
    return ((int)px[ 0] + (int)px[ 1] + (int)px[ 2]) * (int)coef[0]
         + ((int)px[ 3] + (int)px[ 4] + (int)px[ 5]) * (int)coef[1]
         + ((int)px[ 6] + (int)px[ 7] + (int)px[ 8]) * (int)coef[2]
         + ((int)px[ 9] + (int)px[10] + (int)px[11]) * (int)coef[3]
         + ((int)px[12] + (int)px[13] + (int)px[14]) * (int)coef[4];
}

/*
 * Apply a 5x5 convolution kernel to an interleaved 3-bytes-per-pixel image
 * and write a grayscale result.
 *
 * inpic   input samples, read as rows of width*3 bytes.
 * outpic  output samples; (height-4) rows of (width-4) pixels are written,
 *         1 byte per pixel when sout is true, otherwise 3 identical bytes.
 * mat     25 kernel coefficients, row-major (mat[0..4] weight the top row).
 * width   input width in pixels.
 * height  input height in pixels.
 * sout    true selects single-byte output, false selects 3-byte output.
 *
 * The divisor is 3 * sum(mat) (the 3 accounts for adding the three channel
 * bytes of every pixel), or 1 when that sum is zero. Results are clamped to
 * 0..255 before being stored so out-of-range values do not wrap in the
 * 8-bit output. Two diagnostic lines are printed to stdout; the reported
 * min/max are the unclamped results and stay at their initial sentinels
 * (32000 / -32000) when no pixel is processed.
 */
void convolve5x(const guint8 *inpic, guint8 *outpic, const gint16 *mat,
                const int width, const int height, const bool sout) {
    const int outwidth = sout ? 1 : 3;
    const int loopwidth = width - 4;
    const int stride = width * 3;
    /* Images narrower than the kernel produce no output rows; this also
     * avoids forming a negative offset into outpic. */
    const int outrows = (loopwidth > 0) ? height - 4 : 0;
    int norm = 0;
    int maxval = -32000, minval = 32000;

    printf("outwidth is %i in convolver\n",outwidth);

    for (int i = 0; i < 25; i++) {
        norm += mat[i] * 3;
    }
    if (norm == 0) {
        norm = 1; /* zero-sum kernels: avoid dividing by zero */
    }

    for (int y = 0; y < outrows; y++) {
        const guint8 *src = &inpic[y * stride];
        guint8 *outr = &outpic[y * loopwidth * outwidth];

        for (int x = 0; x < loopwidth; x++) {
            const int acc = (convolve5x_row(src,              mat)
                           + convolve5x_row(src + stride,     mat + 5)
                           + convolve5x_row(src + 2 * stride, mat + 10)
                           + convolve5x_row(src + 3 * stride, mat + 15)
                           + convolve5x_row(src + 4 * stride, mat + 20)) / norm;

            /* Saturate instead of letting the 8-bit store wrap around. */
            int pixel = acc;
            if (pixel < 0) {
                pixel = 0;
            } else if (pixel > 255) {
                pixel = 255;
            }

            if (sout) {
                outr[0] = (guint8)pixel;
                outr++;
            } else {
                /* norm already contains the factor of 3, so the same gray
                 * value goes to every channel without dividing again. */
                outr[0] = outr[1] = outr[2] = (guint8)pixel;
                outr += 3;
            }

            if (acc < minval) {
                minval = acc;
            }
            if (acc > maxval) {
                maxval = acc;
            }

            src += 3;
        }
    }

    printf("min found: %i, max found: %i, norm is %i\n",minval,maxval,norm);
}