can somebody test this

its OK code, but i need to see performance on different devices…

its few (integer) matrix multiplication algorithms.

you can change VELMAT in mm.cu, but (for this test) must be multiple of 480.

just copy/paste results (like:

dimenzije matrice su 2400*2400
algoritam 1 (bez dijeljene memorije)=10266.582031 ms=100%
algoritam 11 (podjela 1*1)=2133.398682 ms=20%
algoritam 16 (podjela 1*6)=2292.128662 ms=22%
algoritam 15 (podjela 1*5)=2290.939209 ms=22%
algoritam 22 (podjela 2*2)=1807.912109 ms=17%
algoritam 26 (podjela 2*6)=1736.550293 ms=16%
algoritam 33 (podjela 3*3)=1444.341064 ms=14%

) and what device is used.

for very big VELMAT you may need to comment line 121 ( vremena[0]=pomnozi (A,B,velmat,C); ) in mm.cu

VELMAT is the size of the matrix (it is VELMAT*VELMAT).
algoritam 1 is algorithm without shared memory.
algoritam xy uses x submatrices from A and 1 submatrix from B to calculate x*y submatrices of the result, so it uses x+1+x*y submatrices in total (each of size 16*16*4 bytes = 1K) of shared memory (except algoritam 11, which uses only 2 submatrices to calculate 1 submatrix of the result)

mm_O.rar (158 KB)

Hello db,

I would like to help you, but I principally can’t download a rar archive because of security issues.
Please attach the plain source code.

Best regards

OK thanks.

here is (i removed some comments)

in original, there are some other files (.bmp pixels for html page but never mind)

whats problem with rar?

should i attach as .zip or something else?

mm.cu>

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>

#include <cutil.h>

#include <mm_kernel.cu>

//#define VELMAT 480

//#define VELMAT 960

//#define VELMAT 1920

#define VELMAT 2400

//#define VELMAT 4800

void  test(int argc, char** argv, int velmat, float *vremena);

float pomnoziCPU(int *A,int *B, int velmat, int *C);

float pomnozi33 (int *A,int *B, int velmat, int *C);

float pomnozi26 (int *A,int *B, int velmat, int *C);

float pomnozi22 (int *A,int *B, int velmat, int *C);

float pomnozi15 (int *A,int *B, int velmat, int *C);

float pomnozi16 (int *A,int *B, int velmat, int *C);

float pomnozi11 (int *A,int *B, int velmat, int *C);

float pomnozi   (int *A,int *B, int velmat, int *C);

/*
 * Program entry point.
 *
 * Picks the matrix size (VELMAT by default, or one of "480", "960", "1920",
 * "2400" given as the first command-line argument), runs every GPU variant
 * through test(), and writes the measured times to "izlaz.html" as a table
 * with one text row and one bar image per algorithm. Each bar is scaled
 * against the slowest measured variant. Finally the report is opened via
 * system("izlaz.html").
 *
 * Returns 0 on success and 1 when the report file cannot be created.
 */
int main( int argc, char** argv){

	// Row captions and bar images, in the same order in which test() fills vremena[].
	static const char *const nazivi[]={
		"algoritam 1 (bez dijeljene memorije)",
		"algoritam 11 (podjela 1*1)",
		"algoritam 16 (podjela 1*6)",
		"algoritam 15 (podjela 1*5)",
		"algoritam 22 (podjela 2*2)",
		"algoritam 26 (podjela 2*6)",
		"algoritam 33 (podjela 3*3)"
	};
	static const char *const slike[]={
		"GPU1.bmp","GPU11.bmp","GPU16.bmp","GPU15.bmp","GPU22.bmp","GPU26.bmp","GPU33.bmp"
	};
	const int brojAlgoritama=(int)(sizeof(nazivi)/sizeof(nazivi[0]));

	// Slot 7 is reserved for the CPU reference, which is currently not measured.
	float vremena[8]={0.0f};
	int velmat=VELMAT;

	srand((unsigned int)time(NULL));

	// Optional first argument overrides the compiled-in matrix size;
	// anything other than the four known sizes is ignored.
	if(argc>1 && argv[1]){
		const char *tmpUlaz=argv[1];
		if(!strcmp(tmpUlaz,"480"))
			velmat=480;
		else if(!strcmp(tmpUlaz,"960"))
			velmat=960;
		else if(!strcmp(tmpUlaz,"1920"))
			velmat=1920;
		else if(!strcmp(tmpUlaz,"2400"))
			velmat=2400;
	}

	test( argc, argv, velmat, vremena);

	FILE *fhtml=fopen("izlaz.html","w");
	if(!fhtml){
		printf("\nGRESKA-nije otvoren fajl za izlaz\n");
		getchar();	// keep the console open so the message can be read
		return 1;
	}

	// The slowest variant defines the 100% / 600px reference.
	float MaX=vremena[0];
	for(int oo=0;oo<brojAlgoritama;oo++)
		if(vremena[oo]>MaX)
			MaX=vremena[oo];

	fprintf(fhtml,"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\">\n<html><head><title>rezultati</title></head>\n");
	fprintf(fhtml,"<body>\n");
	fprintf(fhtml,"<table width=\"400px\">\n");
	fprintf(fhtml,"<tr><td> dimenzije matrice su %d*%d</td></tr>\n",velmat,velmat);

	for(int oo=0;oo<brojAlgoritama;oo++){
		// Guard the division: if nothing was measured, every bar has zero width.
		const int posto =MaX>0.0f ? (int)(100.0f*vremena[oo]/MaX) : 0;
		const int sirina=MaX>0.0f ? (int)(600.0f*vremena[oo]/MaX) : 0;
		fprintf(fhtml,"<tr><td>%s=%f ms=%d%%</td></tr>\n",nazivi[oo],vremena[oo],posto);
		fprintf(fhtml,"<tr><td><img width=\"%dpx\" height=\"16px\" src=\"%s\"></td></tr>\n",sirina,slike[oo]);
	}

	// The CPU row stays disabled: vremena[7] is not measured while the
	// pomnoziCPU call in test() is commented out.
	//	fprintf(fhtml,"<tr><td>algoritam CPU=%f ms=%d%% prevelik je pa nije prikazan graficki</td></tr>\n",vremena[7],(int)(100.0f*vremena[7]/MaX));

	fprintf(fhtml,"</table>\n\n");
	fprintf(fhtml,"</html>\n\n");
	fclose(fhtml);

	system("izlaz.html");

	return 0;
}

/*
 * Allocates three velmat*velmat host matrices with cudaMallocHost, fills A
 * and B with pseudo-random values in [0, 1023] and C with zeros, then runs
 * every GPU multiplication variant and stores its time in milliseconds:
 *
 *   vremena[0] pomnozi     vremena[4] pomnozi22
 *   vremena[1] pomnozi11   vremena[5] pomnozi26
 *   vremena[2] pomnozi16   vremena[6] pomnozi33
 *   vremena[3] pomnozi15   (vremena[7], the CPU reference, is not written)
 *
 * argc and argv are accepted for interface compatibility and are not used.
 */
void test( int argc, char** argv, int velmat, float *vremena) {

	int *A, *B, *C;

	// Compute the byte count in size_t so it cannot overflow int for large matrices.
	const size_t mem=(size_t)velmat*velmat*sizeof(int);

	(void)argc;
	(void)argv;

	CUDA_SAFE_CALL(cudaMallocHost((void**)&A ,mem));
	CUDA_SAFE_CALL(cudaMallocHost((void**)&B ,mem));
	CUDA_SAFE_CALL(cudaMallocHost((void**)&C ,mem));
	CUT_CHECK_ERROR("greska tokom izvrsavanja");

	for(int i=0;i<velmat;++i)
		for(int j=0;j<velmat;j++){
			A[i*velmat+j]=rand()%1024;
			B[i*velmat+j]=rand()%1024;
			C[i*velmat+j]=0;
		}

	vremena[0]=pomnozi   (A,B,velmat,C);
	vremena[1]=pomnozi11 (A,B,velmat,C);
	vremena[2]=pomnozi16 (A,B,velmat,C);
	vremena[3]=pomnozi15 (A,B,velmat,C);
	vremena[4]=pomnozi22 (A,B,velmat,C);
	vremena[5]=pomnozi26 (A,B,velmat,C);
	vremena[6]=pomnozi33 (A,B,velmat,C);
//	vremena[7]=pomnoziCPU(A,B,velmat,C);

	// Memory obtained from cudaMallocHost must be released with cudaFreeHost;
	// cudaFree is only valid for device allocations.
	CUDA_SAFE_CALL(cudaFreeHost(A));
	CUDA_SAFE_CALL(cudaFreeHost(B));
	CUDA_SAFE_CALL(cudaFreeHost(C));
}

/**///GPU1

/*
 * Variant 1: multiplication without shared memory (kernel mnozenjeMatrica1).
 *
 * Copies A and B to the device, launches a (velmat/16) x (velmat/16) grid of
 * 16x16 thread blocks, copies the product back into C and releases the
 * device buffers. velmat must be a multiple of 16, because the grid size is
 * computed with velmat>>4 and the kernel has no bounds check.
 *
 * Returns the elapsed time in milliseconds. The measured interval covers
 * device allocation, both transfers, the kernel and the frees.
 */
float pomnozi(int *A,int *B, int velmat, int *C){

	int *d_A,*d_B,*d_C;
	unsigned int vrijeme1=0;
	dim3 mreza, blok;
	const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
	float ms;

	CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
	CUT_SAFE_CALL(cutStartTimer(vrijeme1));

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
	CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
	CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

	mreza.x=velmat>>4;	mreza.y=velmat>>4;
	blok.x =16;			blok.y =16;

	mnozenjeMatrica1<<<mreza,blok>>>(d_A,d_B,d_C, velmat,velmat,velmat);
	CUT_CHECK_ERROR("greska tokom izvrsavanja");

	// The blocking copy also waits for the kernel to finish.
	CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaFree(d_A));
	CUDA_SAFE_CALL(cudaFree(d_B));
	CUDA_SAFE_CALL(cudaFree(d_C));

	CUT_SAFE_CALL(cutStopTimer(vrijeme1));
	ms=cutGetTimerValue(vrijeme1);
	CUT_SAFE_CALL(cutDeleteTimer(vrijeme1));	// do not leak one timer per call

	return ms;
}

/**///GPU2

/*
 * Variant 11: one 16x16 result tile per thread block (kernel mnozenjeMatrica11).
 *
 * Copies A and B to the device, launches a (velmat/16) x (velmat/16) grid of
 * 16x16 thread blocks, copies the product back into C and releases the
 * device buffers. velmat must be a multiple of 16.
 *
 * Returns the elapsed time in milliseconds. The measured interval covers
 * device allocation, both transfers, the kernel and the frees.
 */
float pomnozi11(int *A,int *B, int velmat, int *C){

	int *d_A,*d_B,*d_C;
	unsigned int vrijeme1=0;
	dim3 mreza, blok;
	const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
	const int kol=velmat>>4;	// number of 16-wide tiles per matrix row
	float ms;

	CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
	CUT_SAFE_CALL(cutStartTimer(vrijeme1));

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
	CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
	CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

	mreza.x=velmat>>4;	mreza.y=velmat>>4;
	blok.x =16;			blok.y =16;

	mnozenjeMatrica11<<<mreza,blok>>>(d_A,d_B,d_C, kol);
	CUT_CHECK_ERROR("greska tokom izvrsavanja");

	// The blocking copy also waits for the kernel to finish.
	CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaFree(d_A));
	CUDA_SAFE_CALL(cudaFree(d_B));
	CUDA_SAFE_CALL(cudaFree(d_C));

	CUT_SAFE_CALL(cutStopTimer(vrijeme1));
	ms=cutGetTimerValue(vrijeme1);
	CUT_SAFE_CALL(cutDeleteTimer(vrijeme1));	// do not leak one timer per call

	return ms;
}

/**///GPU3

/*
 * Variant 16: each thread block produces a 1x6 strip of 16x16 result tiles
 * (kernel mnozenjeMatrica1_6).
 *
 * Launches a (velmat/96) x (velmat/16) grid of 16x16 thread blocks, so
 * velmat must be a multiple of 96 (96 = 6*16, six tiles wide).
 *
 * Returns the elapsed time in milliseconds. The measured interval covers
 * device allocation, both transfers, the kernel and the frees.
 */
float pomnozi16(int *A,int *B, int velmat, int *C){

	int *d_A,*d_B,*d_C;
	unsigned int vrijeme1=0;
	dim3 mreza, blok;
	const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
	const int kol=velmat>>4;	// number of 16-wide tiles per matrix row
	float ms;

	CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
	CUT_SAFE_CALL(cutStartTimer(vrijeme1));

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
	CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
	CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

	mreza.y=velmat>>4;	mreza.x=velmat/96;	// 96=6*16 (6 tiles wide)
	blok.x =16;			blok.y =16;

	mnozenjeMatrica1_6<<<mreza,blok>>>(d_A,d_B,d_C, kol);
	CUT_CHECK_ERROR("greska tokom izvrsavanja");

	// The blocking copy also waits for the kernel to finish.
	CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaFree(d_A));
	CUDA_SAFE_CALL(cudaFree(d_B));
	CUDA_SAFE_CALL(cudaFree(d_C));

	CUT_SAFE_CALL(cutStopTimer(vrijeme1));
	ms=cutGetTimerValue(vrijeme1);
	CUT_SAFE_CALL(cutDeleteTimer(vrijeme1));	// do not leak one timer per call

	return ms;
}

/**///GPU4

/*
 * Variant 15: each thread block produces a 1x5 strip of 16x16 result tiles
 * (kernel mnozenjeMatrica1_5).
 *
 * Launches a (velmat/80) x (velmat/16) grid of 16x16 thread blocks, so
 * velmat must be a multiple of 80 (80 = 5*16, five tiles wide).
 *
 * Returns the elapsed time in milliseconds. The measured interval covers
 * device allocation, both transfers, the kernel and the frees.
 */
float pomnozi15(int *A,int *B, int velmat, int *C){

	int *d_A,*d_B,*d_C;
	unsigned int vrijeme1=0;
	dim3 mreza, blok;
	const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
	const int kol=velmat>>4;	// number of 16-wide tiles per matrix row
	float ms;

	CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
	CUT_SAFE_CALL(cutStartTimer(vrijeme1));

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
	CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
	CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

	mreza.y=velmat>>4;	mreza.x=velmat/80;	// 80=5*16 (5 tiles wide)
	blok.x =16;			blok.y =16;

	mnozenjeMatrica1_5<<<mreza,blok>>>(d_A,d_B,d_C, kol);
	CUT_CHECK_ERROR("greska tokom izvrsavanja");

	// The blocking copy also waits for the kernel to finish.
	CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaFree(d_A));
	CUDA_SAFE_CALL(cudaFree(d_B));
	CUDA_SAFE_CALL(cudaFree(d_C));

	CUT_SAFE_CALL(cutStopTimer(vrijeme1));
	ms=cutGetTimerValue(vrijeme1);
	CUT_SAFE_CALL(cutDeleteTimer(vrijeme1));	// do not leak one timer per call

	return ms;
}

/**///GPU5

/*
 * Variant 22: each thread block produces a 2x2 square of 16x16 result tiles
 * (kernel mnozenjeMatrica2_2).
 *
 * Launches a (velmat/32) x (velmat/32) grid of 16x16 thread blocks, so
 * velmat must be a multiple of 32 (32 = 2*16, two tiles per side).
 *
 * Returns the elapsed time in milliseconds. The measured interval covers
 * device allocation, both transfers, the kernel and the frees.
 */
float pomnozi22(int *A,int *B, int velmat, int *C){

	int *d_A,*d_B,*d_C;
	unsigned int vrijeme1=0;
	dim3 mreza, blok;
	const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
	const int kol=velmat>>4;	// number of 16-wide tiles per matrix row
	float ms;

	CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
	CUT_SAFE_CALL(cutStartTimer(vrijeme1));

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
	CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
	CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

	mreza.x=velmat>>5;	mreza.y=velmat>>5;	// 32=2*16 (2 tiles per side)
	blok.x =16;			blok.y =16;

	mnozenjeMatrica2_2<<<mreza,blok>>>(d_A,d_B,d_C, kol);
	CUT_CHECK_ERROR("greska tokom izvrsavanja");

	// The blocking copy also waits for the kernel to finish.
	CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaFree(d_A));
	CUDA_SAFE_CALL(cudaFree(d_B));
	CUDA_SAFE_CALL(cudaFree(d_C));

	CUT_SAFE_CALL(cutStopTimer(vrijeme1));
	ms=cutGetTimerValue(vrijeme1);
	CUT_SAFE_CALL(cutDeleteTimer(vrijeme1));	// do not leak one timer per call

	return ms;
}

/**///GPU6

/*
 * Variant 26: each thread block produces a 2x6 rectangle of 16x16 result
 * tiles (kernel mnozenjeMatrica2_6).
 *
 * Launches a (velmat/96) x (velmat/32) grid of 16x16 thread blocks, so
 * velmat must be a multiple of 96 (six tiles wide, two tiles high).
 *
 * Returns the elapsed time in milliseconds. The measured interval covers
 * device allocation, both transfers, the kernel and the frees. The timer is
 * stopped before it is read, in the same way as in the other variants.
 */
float pomnozi26(int *A,int *B, int velmat, int *C){

	int *d_A,*d_B,*d_C;
	unsigned int vrijeme1=0;
	dim3 mreza, blok;
	const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
	const int kol=velmat>>4;	// number of 16-wide tiles per matrix row
	float ms;

	CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
	CUT_SAFE_CALL(cutStartTimer(vrijeme1));

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
	CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
	CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

	mreza.x=velmat/96;	mreza.y=velmat>>5;	// 96=6*16 wide, 32=2*16 high
	blok.x =16;			blok.y =16;

	mnozenjeMatrica2_6<<<mreza,blok>>>(d_A,d_B,d_C, kol);
	CUT_CHECK_ERROR("greska tokom izvrsavanja");

	// The blocking copy also waits for the kernel to finish.
	CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaFree(d_A));
	CUDA_SAFE_CALL(cudaFree(d_B));
	CUDA_SAFE_CALL(cudaFree(d_C));

	// The original never stopped this timer before reading it.
	CUT_SAFE_CALL(cutStopTimer(vrijeme1));
	ms=cutGetTimerValue(vrijeme1);
	CUT_SAFE_CALL(cutDeleteTimer(vrijeme1));	// do not leak one timer per call

	return ms;
}

/**///GPU7

/*
 * Variant 33: each thread block produces a 3x3 square of 16x16 result tiles
 * (kernel mnozenjeMatrica3_3).
 *
 * Launches a (velmat/48) x (velmat/48) grid of 16x16 thread blocks, so
 * velmat must be a multiple of 48 (48 = 3*16, three tiles per side).
 *
 * Returns the elapsed time in milliseconds. The measured interval covers
 * device allocation, both transfers, the kernel and the frees. The timer is
 * stopped before it is read, in the same way as in the other variants.
 */
float pomnozi33(int *A,int *B, int velmat, int *C){

	int *d_A,*d_B,*d_C;
	unsigned int vrijeme1=0;
	dim3 mreza, blok;
	const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
	const int kol=velmat>>4;	// number of 16-wide tiles per matrix row
	float ms;

	CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
	CUT_SAFE_CALL(cutStartTimer(vrijeme1));

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
	CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
	CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
	CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

	mreza.x=velmat/48;	mreza.y=velmat/48;	// 48=3*16 (3 tiles per side)
	blok.x =16;			blok.y =16;

	mnozenjeMatrica3_3<<<mreza,blok>>>(d_A,d_B,d_C, kol);
	CUT_CHECK_ERROR("greska tokom izvrsavanja");

	// The blocking copy also waits for the kernel to finish.
	CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaFree(d_A));
	CUDA_SAFE_CALL(cudaFree(d_B));
	CUDA_SAFE_CALL(cudaFree(d_C));

	// The original never stopped this timer before reading it.
	CUT_SAFE_CALL(cutStopTimer(vrijeme1));
	ms=cutGetTimerValue(vrijeme1);
	CUT_SAFE_CALL(cutDeleteTimer(vrijeme1));	// do not leak one timer per call

	return ms;
}

//CPU

/*
 * CPU reference: C = A * B for row-major velmat x velmat int matrices,
 * computed with the plain triple loop (column index j outermost).
 *
 * Returns the elapsed time in milliseconds.
 */
float pomnoziCPU(int *A,int *B, int velmat, int *C){

	unsigned int vrijeme=0;
	float ms;

	CUT_SAFE_CALL(cutCreateTimer(&vrijeme));
	CUT_SAFE_CALL(cutStartTimer(vrijeme));

	for(int j=0;j<velmat;j++){
		for(int i=0;i<velmat;i++){
			int rez=0;
			for(int tt=0;tt<velmat;tt++)
				rez+=A[i*velmat+tt]*B[tt*velmat+j];
			// Stored once per (i,j), after the dot product is complete.
			C[i*velmat+j]=rez;
		}
	}

	CUT_CHECK_ERROR("greska tokom izvrsavanja");

	CUT_SAFE_CALL(cutStopTimer(vrijeme));
	ms=cutGetTimerValue(vrijeme);
	CUT_SAFE_CALL(cutDeleteTimer(vrijeme));	// do not leak one timer per call

	return ms;
}

/**/

mm_kernel.cu>

#ifndef _MM_KERNEL_H_

#define _MM_KERNEL_H_

#include <stdio.h>

/*
 * Matrix product without shared memory: every thread computes one element
 * of C by reading a full row of A and a full column of B from global memory.
 *
 * The indexing assumes 16x16 thread blocks (block indices are scaled by 16).
 * pitchA / pitchB are the row strides of A and B in elements; C is written
 * with the row stride pitchA. velicina is the length of the dot product.
 * There is no bounds check, so the grid must cover the matrix exactly.
 */
__global__ void mnozenjeMatrica1(int *A, int *B, int *C, int pitchA, int pitchB, int velicina){

	const int red   =(blockIdx.y<<4)+threadIdx.y;	// row of the result element
	const int kolona=(blockIdx.x<<4)+threadIdx.x;	// column of the result element

	const int *redA   =A+red*pitchA;	// start of this thread's row in A
	const int *kolonaB=B+kolona;		// top of this thread's column in B

	int zbir=0;
	for(int k=0;k<velicina;++k)
		zbir+=redA[k]*kolonaB[k*pitchB];

	C[kolona+red*pitchA]=zbir;	// store the result
}

/*
 * Tiled matrix product, one 16x16 result tile per thread block.
 *
 * kolicina is the number of 16-wide tiles per matrix row, so the row length
 * is kolicina*16. The kernel assumes 16x16 thread blocks and a grid that
 * covers the matrix exactly (no bounds check). In every step each thread
 * loads one element of the current A tile and one of the current B tile
 * into shared memory; both tiles are stored with threadIdx.x as the first
 * index.
 */
__global__ void mnozenjeMatrica11(int *A, int *B, int *C, int kolicina){

	__shared__ int podA[16][16], podB[16][16];

	const int tx=threadIdx.x;
	const int ty=threadIdx.y;
	const int sirina=kolicina<<4;			// row length in elements
	const int red   =(blockIdx.y<<4)+ty;	// row of the result element
	const int kolona=(blockIdx.x<<4)+tx;	// column of the result element

	// Global offsets of the elements this thread loads into shared memory.
	int idxA=red*sirina+tx;
	int idxB=kolona+ty*sirina;

	int zbir=0;

	// A advances one tile to the right, B one tile (16 rows) down per step.
	for(int korak=0;korak<kolicina;++korak, idxA+=16, idxB+=(sirina<<4)){

		podA[tx][ty]=A[idxA];
		podB[tx][ty]=B[idxB];

		__syncthreads();	// both tiles are now complete in shared memory

		for(int k=0;k<16;++k)
			zbir+=podA[k][ty]*podB[tx][k];

		__syncthreads();	// nobody may overwrite the tiles while they are still being read
	}

	__syncthreads();

	C[kolona+red*sirina]=zbir;
}

/*
 * Tiled matrix product, 1x6 result tiles per thread block: one A tile is
 * loaded once per step and multiplied by six neighbouring B tiles.
 *
 * kolicina is the number of 16-wide tiles per matrix row (row length is
 * kolicina*16). The kernel assumes 16x16 thread blocks and a grid of
 * (row length / 96) x (row length / 16) blocks; there is no bounds check.
 * Shared memory: 1 A tile + 1 B tile + 6 accumulator tiles of 16*16 ints.
 *
 * Correctness fix: podB is reused for all six B tiles, so a barrier is
 * required after every product loop, before the next tile is written into
 * podB. Without it a fast warp overwrites elements that a slower warp is
 * still reading.
 */
__global__ void mnozenjeMatrica1_6(int *A, int *B, int *C, int kolicina){

	__shared__ int podA[16][16], podB[16][16];
	__shared__ int rez[6][16][16];	// per-thread accumulators, one per result tile

	const int tx=threadIdx.x;
	const int ty=threadIdx.y;
	const int velicinaReda=kolicina<<4;

	int rezultat,v,s,tv;

	// Global offsets of the elements this thread loads into shared memory.
	int Ai=((blockIdx.y<<4) + ty)*velicinaReda + tx;
	int Bi=blockIdx.x*96 + tx + ty*velicinaReda;

	// Each thread only ever touches its own accumulator slot, so no barrier is needed here.
	for(s=0;s<6;++s)
		rez[s][tx][ty]=0;

	for(v=0;v<kolicina;++v){

		podA[tx][ty]=A[ Ai ];	// the A tile is loaded once for all six B tiles

#pragma unroll
		for(s=0;s<6;++s){

			podB[tx][ty]=B[ Bi+(s<<4) ];	// s-th B tile to the right

			__syncthreads();	// podA and podB are complete

			rezultat=rez[s][tx][ty];
			for(tv=0;tv<16;++tv)
				rezultat+=podA[tv][ty] * podB[tx][tv];
			rez[s][tx][ty]=rezultat;

			__syncthreads();	// all reads are done before podB (or podA) is overwritten
		}

		Ai+=16;					// next A tile to the right
		Bi+=velicinaReda<<4;	// next row of B tiles (16 rows down)
	}

	// Offset of this thread's element in the leftmost result tile.
	v=((blockIdx.x*96) + tx) + ((blockIdx.y<<4) + ty)*velicinaReda;

	for(s=0;s<6;++s)
		C[v+(s<<4)]=rez[s][tx][ty];
}

/*
 * Tiled matrix product, 1x5 result tiles per thread block: one A tile is
 * loaded once per step and multiplied by five neighbouring B tiles.
 *
 * kolicina is the number of 16-wide tiles per matrix row (row length is
 * kolicina*16). The kernel assumes 16x16 thread blocks and a grid of
 * (row length / 80) x (row length / 16) blocks; there is no bounds check.
 * Shared memory: 1 A tile + 1 B tile + 5 accumulator tiles of 16*16 ints.
 *
 * Correctness fix: podB is reused for all five B tiles, so a barrier is
 * required after every product loop, before the next tile is written into
 * podB. Without it a fast warp overwrites elements that a slower warp is
 * still reading.
 */
__global__ void mnozenjeMatrica1_5(int *A, int *B, int *C, int kolicina){

	__shared__ int podA[16][16], podB[16][16];
	__shared__ int rez[5][16][16];	// per-thread accumulators, one per result tile

	const int tx=threadIdx.x;
	const int ty=threadIdx.y;
	const int velicinaReda=kolicina<<4;

	int rezultat,v,s,tv;

	// Global offsets of the elements this thread loads into shared memory.
	int Ai=((blockIdx.y<<4) + ty)*velicinaReda + tx;
	int Bi=blockIdx.x*80 + tx + ty*velicinaReda;

	// Each thread only ever touches its own accumulator slot, so no barrier is needed here.
	for(s=0;s<5;++s)
		rez[s][tx][ty]=0;

	for(v=0;v<kolicina;++v){

		podA[tx][ty]=A[ Ai ];	// the A tile is loaded once for all five B tiles

#pragma unroll
		for(s=0;s<5;++s){

			podB[tx][ty]=B[ Bi+(s<<4) ];	// s-th B tile to the right

			__syncthreads();	// podA and podB are complete

			rezultat=rez[s][tx][ty];
			for(tv=0;tv<16;++tv)
				rezultat+=podA[tv][ty] * podB[tx][tv];
			rez[s][tx][ty]=rezultat;

			__syncthreads();	// all reads are done before podB (or podA) is overwritten
		}

		Ai+=16;					// next A tile to the right
		Bi+=velicinaReda<<4;	// next row of B tiles (16 rows down)
	}

	// Offset of this thread's element in the leftmost result tile.
	v=((blockIdx.x*80) + tx) + ((blockIdx.y<<4) + ty)*velicinaReda;

	for(s=0;s<5;++s)
		C[v+(s<<4)]=rez[s][tx][ty];
}

/*
 * Tiled matrix product, 2x2 result tiles per thread block: two vertically
 * adjacent A tiles are loaded once per step and multiplied by two
 * neighbouring B tiles.
 *
 * kolicina is the number of 16-wide tiles per matrix row (row length is
 * kolicina*16). The kernel assumes 16x16 thread blocks and a grid of
 * (row length / 32) x (row length / 32) blocks; there is no bounds check.
 * Shared memory: 2 A tiles + 1 B tile + 4 accumulator tiles of 16*16 ints.
 *
 * Correctness fix: the barriers after the product loops were commented out
 * in the original, so podB (and, at the end of a step, the A tiles) could be
 * overwritten while other warps were still reading them. They are restored.
 */
__global__ void mnozenjeMatrica2_2(int *A, int *B, int *C, int kolicina){

	__shared__ int podA[2][16][16], podB[16][16];
	__shared__ int rez[2][2][16][16];	// rez[a][s]: A tile row a times B tile column s

	const int tx=threadIdx.x;
	const int ty=threadIdx.y;
	const int velicinaReda=kolicina<<4;

	int zbir,v,a,s,tv;

	// Global offsets of the elements this thread loads into shared memory.
	int Ai=((blockIdx.y<<5) + ty)*velicinaReda + tx;	// 32=2*16 (2 tiles high)
	int Bi= (blockIdx.x<<5) + tx + ty*velicinaReda;		// 32=2*16 (2 tiles wide)

	// Each thread only ever touches its own accumulator slots, so no barrier is needed here.
	for(a=0;a<2;++a)
		for(s=0;s<2;++s)
			rez[a][s][tx][ty]=0;

	for(v=0;v<kolicina;++v){

		// Both A tiles are loaded once for all B tiles of this step.
		for(a=0;a<2;++a)
			podA[a][tx][ty]=A[ Ai+(a<<4)*velicinaReda ];

#pragma unroll
		for(s=0;s<2;++s){

			podB[tx][ty]=B[ Bi+(s<<4) ];	// s-th B tile to the right

			__syncthreads();	// the A tiles and podB are complete

			for(a=0;a<2;++a){
				zbir=rez[a][s][tx][ty];
				for(tv=0;tv<16;++tv)
					zbir+=podA[a][tv][ty] * podB[tx][tv];
				rez[a][s][tx][ty]=zbir;
			}

			__syncthreads();	// all reads are done before podB (or podA) is overwritten
		}

		Ai+=16;					// next column of A tiles
		Bi+=velicinaReda<<4;	// next row of B tiles (16 rows down)
	}

	// Offset of this thread's element in the top-left result tile.
	v=(blockIdx.x*32 + tx) + (blockIdx.y*32 + ty)*velicinaReda;

	for(a=0;a<2;++a)
		for(s=0;s<2;++s)
			C[v + a*(velicinaReda<<4) + (s<<4)]=rez[a][s][tx][ty];
}

/*
 * Tiled matrix product, 3x3 result tiles per thread block: three vertically
 * adjacent A tiles are loaded once per step and multiplied by three
 * neighbouring B tiles.
 *
 * kolicina is the number of 16-wide tiles per matrix row (row length is
 * kolicina*16). The kernel assumes 16x16 thread blocks and a grid of
 * (row length / 48) x (row length / 48) blocks; there is no bounds check.
 * Shared memory: 3 A tiles + 1 B tile + 9 accumulator tiles of 16*16 ints.
 *
 * Correctness fix: the barriers after the product loops were commented out
 * in the original, so podB (and, at the end of a step, the A tiles) could be
 * overwritten while other warps were still reading them. They are restored.
 */
__global__ void mnozenjeMatrica3_3(int *A, int *B, int *C, int kolicina){

	__shared__ int podA[3][16][16], podB[16][16];
	__shared__ int rez[3][3][16][16];	// rez[a][s]: A tile row a times B tile column s

	const int tx=threadIdx.x;
	const int ty=threadIdx.y;
	const int velicinaReda=kolicina<<4;

	int zbir,v,a,s,tv;

	// Global offsets of the elements this thread loads into shared memory.
	int Ai=(blockIdx.y*48 + ty)*velicinaReda + tx;	// 48=3*16 (3 tiles high)
	int Bi= blockIdx.x*48 + tx + ty*velicinaReda;	// 48=3*16 (3 tiles wide)

	// Each thread only ever touches its own accumulator slots, so no barrier is needed here.
	for(a=0;a<3;++a)
		for(s=0;s<3;++s)
			rez[a][s][tx][ty]=0;

	for(v=0;v<kolicina;++v){

		// All three A tiles are loaded once for all B tiles of this step.
		for(a=0;a<3;++a)
			podA[a][tx][ty]=A[ Ai+(a<<4)*velicinaReda ];

#pragma unroll
		for(s=0;s<3;++s){

			podB[tx][ty]=B[ Bi+(s<<4) ];	// s-th B tile to the right

			__syncthreads();	// the A tiles and podB are complete

			for(a=0;a<3;++a){
				zbir=rez[a][s][tx][ty];
				for(tv=0;tv<16;++tv)
					zbir+=podA[a][tv][ty] * podB[tx][tv];
				rez[a][s][tx][ty]=zbir;
			}

			__syncthreads();	// all reads are done before podB (or podA) is overwritten
		}

		Ai+=16;					// next column of A tiles
		Bi+=velicinaReda<<4;	// next row of B tiles (16 rows down)
	}

	// Offset of this thread's element in the top-left result tile.
	v=(blockIdx.x*48 + tx) + (blockIdx.y*48 + ty)*velicinaReda;

	for(a=0;a<3;++a)
		for(s=0;s<3;++s)
			C[v + a*(velicinaReda<<4) + (s<<4)]=rez[a][s][tx][ty];
}

/*
 * Tiled matrix product, 2x6 result tiles per thread block: two vertically
 * adjacent A tiles are loaded once per step and multiplied by six
 * neighbouring B tiles.
 *
 * kolicina is the number of 16-wide tiles per matrix row (row length is
 * kolicina*16). The kernel assumes 16x16 thread blocks and a grid of
 * (row length / 96) x (row length / 32) blocks; there is no bounds check.
 * Shared memory: 2 A tiles + 1 B tile + 12 accumulator tiles of 16*16 ints
 * (15 KB in total).
 *
 * Correctness fix: the barriers after the product loops were commented out
 * in the original, so podB (and, at the end of a step, the A tiles) could be
 * overwritten while other warps were still reading them. They are restored.
 */
__global__ void mnozenjeMatrica2_6(int *A, int *B, int *C, int kolicina){

	__shared__ int podA[2][16][16], podB[16][16];
	__shared__ int rez[2][6][16][16];	// rez[a][s]: A tile row a times B tile column s

	const int tx=threadIdx.x;
	const int ty=threadIdx.y;
	const int velicinaReda=kolicina<<4;

	int zbir,v,a,s,tv;

	// Global offsets of the elements this thread loads into shared memory.
	int Ai=((blockIdx.y<<5) + ty)*velicinaReda + tx;	// 32=2*16 (2 tiles high)
	int Bi=  blockIdx.x*96  + tx + ty*velicinaReda;		// 96=6*16 (6 tiles wide)

	// Each thread only ever touches its own accumulator slots, so no barrier is needed here.
	for(a=0;a<2;++a)
		for(s=0;s<6;++s)
			rez[a][s][tx][ty]=0;

	for(v=0;v<kolicina;++v){

		// Both A tiles are loaded once for all B tiles of this step.
		for(a=0;a<2;++a)
			podA[a][tx][ty]=A[ Ai+(a<<4)*velicinaReda ];

#pragma unroll
		for(s=0;s<6;++s){

			podB[tx][ty]=B[ Bi+(s<<4) ];	// s-th B tile to the right

			__syncthreads();	// the A tiles and podB are complete

			for(a=0;a<2;++a){
				zbir=rez[a][s][tx][ty];
				for(tv=0;tv<16;++tv)
					zbir+=podA[a][tv][ty] * podB[tx][tv];
				rez[a][s][tx][ty]=zbir;
			}

			__syncthreads();	// all reads are done before podB (or podA) is overwritten
		}

		Ai+=16;					// next column of A tiles
		Bi+=velicinaReda<<4;	// next row of B tiles (16 rows down)
	}

	// Offset of this thread's element in the top-left result tile.
	v=(blockIdx.x*96 + tx) + ((blockIdx.y<<5) + ty)*velicinaReda;

	for(a=0;a<2;++a)
		for(s=0;s<6;++s)
			C[v + a*(velicinaReda<<4) + (s<<4)]=rez[a][s][tx][ty];
}

#endif

GPU26.bmp (58 Bytes)
GPU33.bmp (58 Bytes)
GPU22.bmp (58 Bytes)
GPU16.bmp (58 Bytes)
GPU15.bmp (58 Bytes)
GPU11.bmp (58 Bytes)
GPU1.bmp (58 Bytes)

I made four runs and got one segmentation fault. Here are the three successful results:

  1. ++++++++++++++++++++++++++++++++++++++++++++++

[codebox]dimenzije matrice su 2400*2400

algoritam 1 (bez dijeljene memorije)=10201.549805 ms=100%

algoritam 11 (podjela 1*1)=2109.927002 ms=20%

algoritam 16 (podjela 1*6)=2252.961914 ms=22%

algoritam 15 (podjela 1*5)=2274.898926 ms=22%

algoritam 22 (podjela 2*2)=1831.567993 ms=17%

algoritam 26 (podjela 2*6)=1714.729980 ms=16%

algoritam 33 (podjela 3*3)=1405.598022 ms=13%[/codebox]

  1. ++++++++++++++++++++++++++++++++++++++++++++++

[codebox]dimenzije matrice su 2400*2400

algoritam 1 (bez dijeljene memorije)=10209.481445 ms=100%

algoritam 11 (podjela 1*1)=2113.551025 ms=20%

algoritam 16 (podjela 1*6)=2256.562988 ms=22%

algoritam 15 (podjela 1*5)=2275.031982 ms=22%

algoritam 22 (podjela 2*2)=1833.338989 ms=17%

algoritam 26 (podjela 2*6)=1718.305054 ms=16%

algoritam 33 (podjela 3*3)=1408.024048 ms=13%[/codebox]

  1. ++++++++++++++++++++++++++++++++++++++++++++++

[codebox]dimenzije matrice su 2400*2400

algoritam 1 (bez dijeljene memorije)=10214.548828 ms=100%

algoritam 11 (podjela 1*1)=2112.477051 ms=20%

algoritam 16 (podjela 1*6)=2253.239014 ms=22%

algoritam 15 (podjela 1*5)=2275.447998 ms=22%

algoritam 22 (podjela 2*2)=1831.712036 ms=17%

algoritam 26 (podjela 2*6)=1715.071045 ms=16%

algoritam 33 (podjela 3*3)=1408.667969 ms=13%

[/codebox]

…$> ./deviceQuery

[codebox]Device 0: “GeForce 9800 GT”

Major revision number: 1

Minor revision number: 1

Total amount of global memory: 1073414144 bytes

Number of multiprocessors: 14

Number of cores: 112

Total amount of constant memory: 65536 bytes

Total amount of shared memory per block: 16384 bytes

Total number of registers available per block: 8192

Warp size: 32

Maximum number of threads per block: 512

Maximum sizes of each dimension of a block: 512 x 512 x 64

Maximum sizes of each dimension of a grid: 65535 x 65535 x 1

Maximum memory pitch: 262144 bytes

Texture alignment: 256 bytes

Clock rate: 1.51 GHz

Concurrent copy and execution: Yes[/codebox]

Good Luck :)