OK thanks.
Here it is (I removed some comments).
The original also contains some other files (.bmp pixels for the HTML page, but never mind those).
What's the problem with .rar?
Should I attach it as a .zip or something else?
mm.cu>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <cutil.h>
#include <mm_kernel.cu>
// Default matrix dimension (matrices are VELMAT x VELMAT). main() starts from
// this value and overrides it when argv[1] names one of the sizes it recognises.
// Exactly one of the alternatives below is meant to be active.
//#define VELMAT 480
//#define VELMAT 960
//#define VELMAT 1920
#define VELMAT 2400
//#define VELMAT 4800
// Allocates and fills the velmat x velmat input matrices, runs the GPU variants
// and stores the value each one returns into vremena[0..6].
void test(int argc, char** argv, int velmat, float *vremena);
// Multiplication routines. Each takes row-major velmat x velmat matrices A and B,
// writes the product into C and returns the reading of its own cutil timer
// (main() prints that value as milliseconds).
float pomnoziCPU(int *A,int *B, int velmat, int *C);
float pomnozi33 (int *A,int *B, int velmat, int *C);
float pomnozi26 (int *A,int *B, int velmat, int *C);
float pomnozi22 (int *A,int *B, int velmat, int *C);
float pomnozi15 (int *A,int *B, int velmat, int *C);
float pomnozi16 (int *A,int *B, int velmat, int *C);
float pomnozi11 (int *A,int *B, int velmat, int *C);
float pomnozi (int *A,int *B, int velmat, int *C);
/**
 * Program entry point.
 *
 * Picks the matrix size (VELMAT by default; "480", "960", "1920" or "2400" may
 * be given as the first command-line argument, anything else is ignored), runs
 * the benchmark through test() and writes the timings to "izlaz.html" as a
 * table with one text row and one bar image per GPU variant. Bar widths and
 * percentages are relative to the slowest variant. Finally the report is
 * opened through system().
 *
 * Returns 0 in all cases (also when the report file cannot be created).
 */
int main( int argc, char** argv){
    // Row labels and bar images, in the order in which test() fills vremena[0..6].
    static const char *const opisi[7]={
        "1 (bez dijeljene memorije)",
        "11 (podjela 1*1)",
        "16 (podjela 1*6)",
        "15 (podjela 1*5)",
        "22 (podjela 2*2)",
        "26 (podjela 2*6)",
        "33 (podjela 3*3)"
    };
    static const char *const slike[7]={
        "GPU1.bmp","GPU11.bmp","GPU16.bmp","GPU15.bmp","GPU22.bmp","GPU26.bmp","GPU33.bmp"
    };
    float vremena[8];
    int velmat=VELMAT;

    srand(time(NULL));

    // Optional size override; only the sizes listed here are accepted.
    if(argc>1 && argv[1]){
        const char *tmpUlaz=argv[1];
        if(!strcmp(tmpUlaz,"480"))
            velmat=480;
        else if(!strcmp(tmpUlaz,"960"))
            velmat=960;
        else if(!strcmp(tmpUlaz,"1920"))
            velmat=1920;
        else if(!strcmp(tmpUlaz,"2400"))
            velmat=2400;
    }

    test( argc, argv, velmat, vremena);

    FILE *fhtml=fopen("izlaz.html","w");
    if(!fhtml){
        printf("\nGRESKA-nije otvoren fajl za izlaz\n");
        // fflush(stdin) is undefined behaviour in standard C/C++, so just wait for a key.
        getchar();
        return 0;
    }

    // Slowest GPU variant defines the 100% mark.
    float MaX=vremena[0];
    for(int oo=1;oo<7;oo++)
        if(vremena[oo]>MaX)
            MaX=vremena[oo];
    // Guard the divisions below: a zero (or NaN) maximum would otherwise make
    // the float -> int conversions undefined.
    if(!(MaX>0.0f))
        MaX=1.0f;

    fprintf(fhtml,"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\">\n<html><head><title>rezultati</title></head>\n");
    fprintf(fhtml,"<body>\n");
    fprintf(fhtml,"<table width=\"400px\">\n");
    fprintf(fhtml,"<tr><td> dimenzije matrice su %d*%d</td></tr>\n",velmat,velmat);
    for(int oo=0;oo<7;oo++){
        fprintf(fhtml,"<tr><td>algoritam %s=%f ms=%d%%</td></tr>\n",opisi[oo],vremena[oo],(int)(100.0f*vremena[oo]/MaX));
        fprintf(fhtml,"<tr><td><img width=\"%dpx\" height=\"16px\" src=\"%s\"></td></tr>\n",(int)(600.0f*vremena[oo]/MaX),slike[oo]);
    }
    // The CPU row stays disabled (test() does not fill vremena[7]); the original
    // comment had been wrapped onto a second, uncommented line that broke the build.
    // fprintf(fhtml,"<tr><td>algoritam CPU=%f ms=%d%% prevelik je pa nije prikazan graficki</td></tr>\n",vremena[7],(int)(100.0f*vremena[7]/MaX));
    fprintf(fhtml,"</table>\n\n");
    fprintf(fhtml,"</body>\n");   // <body> was opened above but never closed
    fprintf(fhtml,"</html>\n\n");
    fclose(fhtml);

    // Hands the file name to the command interpreter to open the report.
    // NOTE(review): this presumably relies on Windows file associations - confirm.
    system("izlaz.html");
    return 0;
}
/**
 * Runs the whole GPU benchmark for one matrix size.
 *
 * Allocates three velmat x velmat int matrices in page-locked host memory,
 * fills A and B with pseudo-random values in [0, 1023], clears C, then calls
 * every GPU multiplication variant in turn.
 *
 * @param argc     unused
 * @param argv     unused
 * @param velmat   matrix dimension (rows == columns)
 * @param vremena  output; elements [0..6] receive the value returned by
 *                 pomnozi, pomnozi11, pomnozi16, pomnozi15, pomnozi22,
 *                 pomnozi26 and pomnozi33 respectively. Element [7] is left
 *                 untouched because the CPU run is disabled.
 */
void test( int argc, char** argv, int velmat, float *vremena) {
    int *A, *B, *C;
    // Size in bytes, computed in size_t so the product cannot overflow int.
    const size_t mem=(size_t)velmat*velmat*sizeof(int);

    CUDA_SAFE_CALL(cudaMallocHost((void**)&A ,mem));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&B ,mem));
    CUDA_SAFE_CALL(cudaMallocHost((void**)&C ,mem));
    CUT_CHECK_ERROR("greska tokom izvrsavanja");

    // Row-major fill; A and B draw from rand() alternately, element by element.
    for(int i=0;i<velmat;++i)
        for(int j=0;j<velmat;j++){
            A[i*velmat+j]=rand()%1024;
            B[i*velmat+j]=rand()%1024;
            C[i*velmat+j]=0;
        }

    vremena[0]=pomnozi (A,B,velmat,C);
    vremena[1]=pomnozi11 (A,B,velmat,C);
    vremena[2]=pomnozi16 (A,B,velmat,C);
    vremena[3]=pomnozi15 (A,B,velmat,C);
    vremena[4]=pomnozi22 (A,B,velmat,C);
    vremena[5]=pomnozi26 (A,B,velmat,C);
    vremena[6]=pomnozi33 (A,B,velmat,C);
    // vremena[7]=pomnoziCPU(A,B,velmat,C);

    // Memory obtained from cudaMallocHost must be released with cudaFreeHost;
    // cudaFree is only valid for device allocations.
    CUDA_SAFE_CALL(cudaFreeHost(A));
    CUDA_SAFE_CALL(cudaFreeHost(B));
    CUDA_SAFE_CALL(cudaFreeHost(C));
}
/**
 * GPU variant 1: launches mnozenjeMatrica1 (no shared memory).
 *
 * The timed region covers device allocation, both host-to-device copies, the
 * kernel, the device-to-host copy and the device frees.
 *
 * The grid is (velmat>>4) x (velmat>>4) blocks of 16x16 threads, so velmat
 * must be a multiple of 16 for the launch to cover the whole matrix.
 *
 * @param A, B    host input matrices, velmat x velmat, row-major
 * @param velmat  matrix dimension
 * @param C       host output matrix, velmat x velmat
 * @return        reading of the cutil timer for the region described above
 */
float pomnozi(int *A,int *B, int velmat, int *C){
    int *d_A,*d_B,*d_C;
    unsigned int vrijeme1=0;
    dim3 mreza, blok;
    // Byte size of one matrix; size_t keeps the product out of int range issues.
    const size_t bajtova=(size_t)velmat*velmat*sizeof(int);

    CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
    CUT_SAFE_CALL(cutStartTimer(vrijeme1));

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
    CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
    CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

    mreza.x=velmat>>4; mreza.y=velmat>>4;
    blok.x =16; blok.y =16;
    // All three matrices are dense, so both pitches and the inner dimension equal velmat.
    mnozenjeMatrica1<<<mreza,blok>>>(d_A,d_B,d_C, velmat,velmat,velmat);
    CUT_CHECK_ERROR("greska tokom izvrsavanja");   // report launch failures right away

    // The blocking copy also waits for the kernel; its status is now checked as well.
    CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));

    CUT_SAFE_CALL(cutStopTimer(vrijeme1));
    return cutGetTimerValue(vrijeme1);
}
/**
 * GPU variant 2: launches mnozenjeMatrica11 (one 16x16 output tile per block).
 *
 * The timed region covers device allocation, both host-to-device copies, the
 * kernel, the device-to-host copy and the device frees.
 *
 * The grid is (velmat>>4) x (velmat>>4) blocks of 16x16 threads, so velmat
 * must be a multiple of 16 for the launch to cover the whole matrix.
 *
 * @param A, B    host input matrices, velmat x velmat, row-major
 * @param velmat  matrix dimension
 * @param C       host output matrix, velmat x velmat
 * @return        reading of the cutil timer for the region described above
 */
float pomnozi11(int *A,int *B, int velmat, int *C){
    int *d_A,*d_B,*d_C;
    unsigned int vrijeme1=0;
    dim3 mreza, blok;
    // Byte size of one matrix; size_t keeps the product out of int range issues.
    const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
    // Number of 16-wide tiles along one matrix dimension (the kernel's only size argument).
    const int kol=velmat>>4;

    CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
    CUT_SAFE_CALL(cutStartTimer(vrijeme1));

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
    CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
    CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

    mreza.x=velmat>>4; mreza.y=velmat>>4;
    blok.x =16; blok.y =16;
    mnozenjeMatrica11<<<mreza,blok>>>(d_A,d_B,d_C, kol);
    CUT_CHECK_ERROR("greska tokom izvrsavanja");   // report launch failures right away

    // The blocking copy also waits for the kernel; its status is now checked as well.
    CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));

    CUT_SAFE_CALL(cutStopTimer(vrijeme1));
    return cutGetTimerValue(vrijeme1);
}
/**
 * GPU variant 3: launches mnozenjeMatrica1_6 (each block produces a strip of
 * 1 x 6 output tiles, i.e. 16 rows by 96 columns).
 *
 * The timed region covers device allocation, both host-to-device copies, the
 * kernel, the device-to-host copy and the device frees.
 *
 * The grid is (velmat/96) x (velmat>>4) blocks of 16x16 threads, so velmat
 * must be a multiple of both 96 and 16 for the launch to cover the whole matrix.
 *
 * @param A, B    host input matrices, velmat x velmat, row-major
 * @param velmat  matrix dimension
 * @param C       host output matrix, velmat x velmat
 * @return        reading of the cutil timer for the region described above
 */
float pomnozi16(int *A,int *B, int velmat, int *C){
    int *d_A,*d_B,*d_C;
    unsigned int vrijeme1=0;
    dim3 mreza, blok;
    // Byte size of one matrix; size_t keeps the product out of int range issues.
    const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
    // Number of 16-wide tiles along one matrix dimension (the kernel's only size argument).
    const int kol=velmat>>4;

    CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
    CUT_SAFE_CALL(cutStartTimer(vrijeme1));

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
    CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
    CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

    mreza.y=velmat>>4; mreza.x=velmat/96; // 96 = 6*16 (six tiles side by side)
    blok.x =16; blok.y =16;
    mnozenjeMatrica1_6<<<mreza,blok>>>(d_A,d_B,d_C, kol);
    CUT_CHECK_ERROR("greska tokom izvrsavanja");   // report launch failures right away

    // The blocking copy also waits for the kernel; its status is now checked as well.
    CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));

    CUT_SAFE_CALL(cutStopTimer(vrijeme1));
    return cutGetTimerValue(vrijeme1);
}
/**
 * GPU variant 4: launches mnozenjeMatrica1_5 (each block produces a strip of
 * 1 x 5 output tiles, i.e. 16 rows by 80 columns).
 *
 * The timed region covers device allocation, both host-to-device copies, the
 * kernel, the device-to-host copy and the device frees.
 *
 * The grid is (velmat/80) x (velmat>>4) blocks of 16x16 threads, so velmat
 * must be a multiple of both 80 and 16 for the launch to cover the whole matrix.
 *
 * @param A, B    host input matrices, velmat x velmat, row-major
 * @param velmat  matrix dimension
 * @param C       host output matrix, velmat x velmat
 * @return        reading of the cutil timer for the region described above
 */
float pomnozi15(int *A,int *B, int velmat, int *C){
    int *d_A,*d_B,*d_C;
    unsigned int vrijeme1=0;
    dim3 mreza, blok;
    // Byte size of one matrix; size_t keeps the product out of int range issues.
    const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
    // Number of 16-wide tiles along one matrix dimension (the kernel's only size argument).
    const int kol=velmat>>4;

    CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
    CUT_SAFE_CALL(cutStartTimer(vrijeme1));

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
    CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
    CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

    mreza.y=velmat>>4; mreza.x=velmat/80; // 80 = 5*16 (five tiles side by side)
    blok.x =16; blok.y =16;
    mnozenjeMatrica1_5<<<mreza,blok>>>(d_A,d_B,d_C, kol);
    CUT_CHECK_ERROR("greska tokom izvrsavanja");   // report launch failures right away

    // The blocking copy also waits for the kernel; its status is now checked as well.
    CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));

    CUT_SAFE_CALL(cutStopTimer(vrijeme1));
    return cutGetTimerValue(vrijeme1);
}
/**
 * GPU variant 5: launches mnozenjeMatrica2_2 (each block produces 2 x 2
 * output tiles, i.e. a 32 x 32 patch of C).
 *
 * The timed region covers device allocation, both host-to-device copies, the
 * kernel, the device-to-host copy and the device frees.
 *
 * The grid is (velmat>>5) x (velmat>>5) blocks of 16x16 threads, so velmat
 * must be a multiple of 32 for the launch to cover the whole matrix.
 *
 * @param A, B    host input matrices, velmat x velmat, row-major
 * @param velmat  matrix dimension
 * @param C       host output matrix, velmat x velmat
 * @return        reading of the cutil timer for the region described above
 */
float pomnozi22(int *A,int *B, int velmat, int *C){
    int *d_A,*d_B,*d_C;
    unsigned int vrijeme1=0;
    dim3 mreza, blok;
    // Byte size of one matrix; size_t keeps the product out of int range issues.
    const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
    // Number of 16-wide tiles along one matrix dimension (the kernel's only size argument).
    const int kol=velmat>>4;

    CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
    CUT_SAFE_CALL(cutStartTimer(vrijeme1));

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
    CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
    CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

    mreza.x=velmat>>5; mreza.y=velmat>>5; // 32 = 2*16 (two tiles per block in each direction)
    blok.x =16; blok.y =16;
    mnozenjeMatrica2_2<<<mreza,blok>>>(d_A,d_B,d_C, kol);
    CUT_CHECK_ERROR("greska tokom izvrsavanja");   // report launch failures right away

    // The blocking copy also waits for the kernel; its status is now checked as well.
    CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));

    CUT_SAFE_CALL(cutStopTimer(vrijeme1));
    return cutGetTimerValue(vrijeme1);
}
/**
 * GPU variant 6: launches mnozenjeMatrica2_6 (each block produces 2 x 6
 * output tiles, i.e. 32 rows by 96 columns of C).
 *
 * The timed region covers device allocation, both host-to-device copies, the
 * kernel, the device-to-host copy and the device frees.
 *
 * The grid is (velmat/96) x (velmat>>5) blocks of 16x16 threads, so velmat
 * must be a multiple of both 96 and 32 for the launch to cover the whole matrix.
 *
 * @param A, B    host input matrices, velmat x velmat, row-major
 * @param velmat  matrix dimension
 * @param C       host output matrix, velmat x velmat
 * @return        reading of the cutil timer for the region described above
 */
float pomnozi26(int *A,int *B, int velmat, int *C){
    int *d_A,*d_B,*d_C;
    unsigned int vrijeme1=0;
    dim3 mreza, blok;
    // Byte size of one matrix; size_t keeps the product out of int range issues.
    const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
    // Number of 16-wide tiles along one matrix dimension (the kernel's only size argument).
    const int kol=velmat>>4;

    CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
    CUT_SAFE_CALL(cutStartTimer(vrijeme1));

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
    CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
    CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

    mreza.x=velmat/96; mreza.y=velmat>>5; // 6 tiles wide, 2 tiles high per block
    blok.x =16; blok.y =16;
    mnozenjeMatrica2_6<<<mreza,blok>>>(d_A,d_B,d_C, kol);
    CUT_CHECK_ERROR("greska tokom izvrsavanja");   // report launch failures right away

    // The blocking copy also waits for the kernel; its status is now checked as well.
    CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));

    // The timer was previously read while still running; stop it first so this
    // variant is measured the same way as the other GPU wrappers.
    CUT_SAFE_CALL(cutStopTimer(vrijeme1));
    return cutGetTimerValue(vrijeme1);
}
/**
 * GPU variant 7: launches mnozenjeMatrica3_3 (each block produces 3 x 3
 * output tiles, i.e. a 48 x 48 patch of C).
 *
 * The timed region covers device allocation, both host-to-device copies, the
 * kernel, the device-to-host copy and the device frees.
 *
 * The grid is (velmat/48) x (velmat/48) blocks of 16x16 threads, so velmat
 * must be a multiple of 48 for the launch to cover the whole matrix.
 *
 * @param A, B    host input matrices, velmat x velmat, row-major
 * @param velmat  matrix dimension
 * @param C       host output matrix, velmat x velmat
 * @return        reading of the cutil timer for the region described above
 */
float pomnozi33(int *A,int *B, int velmat, int *C){
    int *d_A,*d_B,*d_C;
    unsigned int vrijeme1=0;
    dim3 mreza, blok;
    // Byte size of one matrix; size_t keeps the product out of int range issues.
    const size_t bajtova=(size_t)velmat*velmat*sizeof(int);
    // Number of 16-wide tiles along one matrix dimension (the kernel's only size argument).
    const int kol=velmat>>4;

    CUT_SAFE_CALL(cutCreateTimer(&vrijeme1));
    CUT_SAFE_CALL(cutStartTimer(vrijeme1));

    CUDA_SAFE_CALL(cudaMalloc((void**)&d_A,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_B,bajtova ));
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_C,bajtova ));
    CUDA_SAFE_CALL(cudaMemcpy(d_A,A,bajtova,cudaMemcpyHostToDevice ));
    CUDA_SAFE_CALL(cudaMemcpy(d_B,B,bajtova,cudaMemcpyHostToDevice ));

    mreza.x=velmat/48; mreza.y=velmat/48; // 48 = 3*16 (three tiles per block in each direction)
    blok.x =16; blok.y =16;
    mnozenjeMatrica3_3<<<mreza,blok>>>(d_A,d_B,d_C, kol);
    CUT_CHECK_ERROR("greska tokom izvrsavanja");   // report launch failures right away

    // The blocking copy also waits for the kernel; its status is now checked as well.
    CUDA_SAFE_CALL(cudaMemcpy(C,d_C,bajtova, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_A));
    CUDA_SAFE_CALL(cudaFree(d_B));
    CUDA_SAFE_CALL(cudaFree(d_C));

    // The timer was previously read while still running; stop it first so this
    // variant is measured the same way as the other GPU wrappers.
    CUT_SAFE_CALL(cutStopTimer(vrijeme1));
    return cutGetTimerValue(vrijeme1);
}
/**
 * CPU reference multiplication: C = A * B for row-major velmat x velmat
 * int matrices, computed with the plain triple loop.
 *
 * The outer loop walks the columns of C and the middle loop its rows, so
 * consecutive writes to C are velmat elements apart.
 *
 * @param A, B    input matrices
 * @param velmat  matrix dimension
 * @param C       output matrix
 * @return        reading of the cutil timer that brackets the computation
 */
float pomnoziCPU(int *A,int *B, int velmat, int *C){
    unsigned int stoperica=0;

    CUT_SAFE_CALL(cutCreateTimer(&stoperica));
    CUT_SAFE_CALL(cutStartTimer(stoperica));

    for(int kolona=0;kolona<velmat;kolona++){
        for(int red=0;red<velmat;red++){
            const int pocetakReda=red*velmat;   // offset of row `red` in A and C
            int zbir=0;
            // Dot product of row `red` of A with column `kolona` of B.
            for(int k=0;k<velmat;k++)
                zbir+=A[pocetakReda+k]*B[k*velmat+kolona];
            C[pocetakReda+kolona]=zbir;
        }
    }

    CUT_CHECK_ERROR("greska tokom izvrsavanja");
    CUT_SAFE_CALL(cutStopTimer(stoperica));
    return cutGetTimerValue(stoperica);
}
/**/
mm_kernel.cu>
#ifndef _MM_KERNEL_H_
#define _MM_KERNEL_H_
#include <stdio.h>
/**
 * Matrix product read straight from the A/B pointers, no shared memory:
 * every thread computes one element of C as a dot product of length `velicina`.
 *
 * Launch layout: the <<4 shifts hard-code 16 threads per block in x and y;
 * blockIdx.x/threadIdx.x select the column, blockIdx.y/threadIdx.y the row.
 * There is no bounds check, so the grid must match the matrix exactly.
 *
 * @param A         left operand; row r starts at A + r*pitchA
 * @param B         right operand; row k starts at B + k*pitchB
 * @param C         result; rows are addressed with pitchA as well
 *                  NOTE(review): that is only right when C has the same row
 *                  stride as A (true for the square call in this file) - confirm
 *                  before using with non-square data.
 * @param pitchA    row stride of A (and C), in elements
 * @param pitchB    row stride of B, in elements
 * @param velicina  length of the dot product (columns of A / rows of B)
 */
__global__ void mnozenjeMatrica1(int *A, int *B, int *C, int pitchA, int pitchB, int velicina){
    const int red   =(blockIdx.y<<4)+threadIdx.y;   // row of C handled by this thread
    const int kolona=(blockIdx.x<<4)+threadIdx.x;   // column of C handled by this thread
    const int pocetakRedaA=red*pitchA;

    int zbir=0;
    for(int k=0;k<velicina;++k)
        zbir+=A[pocetakRedaA+k]*B[k*pitchB+kolona];

    C[kolona+red*pitchA]=zbir; // store the result
}
/**
 * Tiled matrix product: each 16x16 block computes one 16x16 tile of C,
 * staging the matching tiles of A and B in shared memory.
 *
 * Launch layout: 16x16 threads per block, grid of kolicina x kolicina blocks
 * (no bounds checks). Matrices are square, dense, row-major with a row length
 * of kolicina*16 elements.
 *
 * Compared with the previous version the tiles are stored as [row][column]
 * ([threadIdx.y][threadIdx.x]) instead of [threadIdx.x][threadIdx.y]. Threads
 * that differ only in threadIdx.x now touch consecutive shared-memory words
 * rather than words 16 apart, which avoids shared-memory bank conflicts. The
 * values read in the inner product are exactly the same. The barrier that sat
 * in front of the final store was dropped because no shared memory is touched
 * after the loop.
 *
 * @param A, B      input matrices in global memory
 * @param C         output matrix in global memory
 * @param kolicina  number of 16-wide tiles per matrix row/column
 */
__global__ void mnozenjeMatrica11(int *A, int *B, int *C, int kolicina){
    __shared__ int podA[16][16], podB[16][16];
    const int tx=threadIdx.x;
    const int ty=threadIdx.y;
    const int velicinaReda=kolicina<<4;   // elements per matrix row

    // Global addresses of the element this thread loads into each tile.
    int Ai=((blockIdx.y<<4) + ty)*velicinaReda + tx;
    int Bi=(blockIdx.x<<4) + tx + (ty*velicinaReda);

    int rezultat=0;
    for(int v=0;v<kolicina;++v){
        podA[ty][tx]=A[ Ai ];
        podB[ty][tx]=B[ Bi ];
        __syncthreads(); // both tiles are now complete in shared memory

        for(int tv=0;tv<16;++tv)
            rezultat+=podA[ty][tv] * podB[tv][tx];

        Ai+=16;                 // next tile of A: 16 columns to the right
        Bi+=(velicinaReda<<4);  // next tile of B: 16 rows down
        __syncthreads(); // nobody may overwrite the tiles until every thread has finished with them
    }

    // Column is (blockIdx.x<<4)+tx, row is (blockIdx.y<<4)+ty.
    C[ ((blockIdx.x<<4) + tx) + ((blockIdx.y<<4) + ty)*velicinaReda ]=rezultat;
}
/**
 * Tiled matrix product, 1 x 6 tiles per block: each 16x16 block computes a
 * strip of C that is 16 rows high and 96 columns wide. One tile of A is loaded
 * per outer step and reused against six neighbouring tiles of B.
 *
 * Launch layout: 16x16 threads per block; blockIdx.x selects the 96-column
 * strip, blockIdx.y the 16-row band (no bounds checks). Matrices are square,
 * dense, row-major with a row length of kolicina*16 elements.
 *
 * Fixes relative to the previous version:
 *  - Data race: podB was overwritten with the next B tile without a barrier
 *    after the compute loop, so slower warps could read a half-replaced tile.
 *    Every compute stage is now followed by __syncthreads().
 *  - Tiles and accumulators are indexed [threadIdx.y][threadIdx.x], so threads
 *    that differ only in threadIdx.x touch consecutive shared-memory words
 *    (the old [x][y] layout made them 16 words apart -> bank conflicts).
 *  - The six copy-pasted stages are folded into one loop; the partial sums
 *    live in rez[6][16][16], which has the same footprint as rez1..rez6.
 * Each thread only ever touches its own accumulator slots, so the barriers
 * that used to follow the zero-initialisation and precede the final stores
 * are not needed.
 *
 * @param A, B      input matrices in global memory
 * @param C         output matrix in global memory
 * @param kolicina  number of 16-wide tiles per matrix row/column
 */
__global__ void mnozenjeMatrica1_6(int *A, int *B, int *C, int kolicina){
    __shared__ int podA[16][16], podB[16][16];
    __shared__ int rez[6][16][16];   // one partial-sum tile per B tile of the strip
    const int tx=threadIdx.x;
    const int ty=threadIdx.y;
    const int velicinaReda=kolicina<<4;   // elements per matrix row
    int t,tv,v,rezultat;

    for(t=0;t<6;++t)
        rez[t][ty][tx]=0;

    // Global addresses of the element this thread loads into each tile.
    int Ai=((blockIdx.y<<4) + ty)*velicinaReda + tx;
    int Bi=blockIdx.x*96 + tx + (ty*velicinaReda);

    for(v=0;v<kolicina;++v){
        podA[ty][tx]=A[ Ai ]; // A is loaded once for all six B tiles
        for(t=0;t<6;++t){
            podB[ty][tx]=B[ Bi+(t<<4) ]; // t-th tile to the right
            __syncthreads(); // tiles of A and B are complete in shared memory

            rezultat=rez[t][ty][tx];
            for(tv=0;tv<16;++tv)
                rezultat+=podA[ty][tv] * podB[tv][tx];
            rez[t][ty][tx]=rezultat;

            __syncthreads(); // everyone is done reading before podB (or podA) is replaced
        }
        Ai+=16;                // next tile of A: 16 columns to the right
        Bi+=velicinaReda<<4;   // next row of B tiles: 16 rows down
    }

    // First output element of this thread; the other five are 16 columns apart.
    const int prvi=((blockIdx.x*96) + tx) + ((blockIdx.y<<4) + ty)*velicinaReda;
    for(t=0;t<6;++t)
        C[prvi+(t<<4)]=rez[t][ty][tx];
}
/**
 * Tiled matrix product, 1 x 5 tiles per block: each 16x16 block computes a
 * strip of C that is 16 rows high and 80 columns wide. One tile of A is loaded
 * per outer step and reused against five neighbouring tiles of B.
 *
 * Launch layout: 16x16 threads per block; blockIdx.x selects the 80-column
 * strip, blockIdx.y the 16-row band (no bounds checks). Matrices are square,
 * dense, row-major with a row length of kolicina*16 elements.
 *
 * Fixes relative to the previous version:
 *  - Data race: podB was overwritten with the next B tile without a barrier
 *    after the compute loop, so slower warps could read a half-replaced tile.
 *    Every compute stage is now followed by __syncthreads().
 *  - Tiles and accumulators are indexed [threadIdx.y][threadIdx.x], so threads
 *    that differ only in threadIdx.x touch consecutive shared-memory words
 *    (the old [x][y] layout made them 16 words apart -> bank conflicts).
 *  - The five copy-pasted stages are folded into one loop; the partial sums
 *    live in rez[5][16][16], which has the same footprint as rez1..rez5.
 * Each thread only ever touches its own accumulator slots, so the barriers
 * that used to follow the zero-initialisation and precede the final stores
 * are not needed.
 *
 * @param A, B      input matrices in global memory
 * @param C         output matrix in global memory
 * @param kolicina  number of 16-wide tiles per matrix row/column
 */
__global__ void mnozenjeMatrica1_5(int *A, int *B, int *C, int kolicina){
    __shared__ int podA[16][16], podB[16][16];
    __shared__ int rez[5][16][16];   // one partial-sum tile per B tile of the strip
    const int tx=threadIdx.x;
    const int ty=threadIdx.y;
    const int velicinaReda=kolicina<<4;   // elements per matrix row
    int t,tv,v,rezultat;

    for(t=0;t<5;++t)
        rez[t][ty][tx]=0;

    // Global addresses of the element this thread loads into each tile.
    int Ai=((blockIdx.y<<4) + ty)*velicinaReda + tx;
    int Bi=blockIdx.x*80 + tx + (ty*velicinaReda);

    for(v=0;v<kolicina;++v){
        podA[ty][tx]=A[ Ai ]; // A is loaded once for all five B tiles
        for(t=0;t<5;++t){
            podB[ty][tx]=B[ Bi+(t<<4) ]; // t-th tile to the right
            __syncthreads(); // tiles of A and B are complete in shared memory

            rezultat=rez[t][ty][tx];
            for(tv=0;tv<16;++tv)
                rezultat+=podA[ty][tv] * podB[tv][tx];
            rez[t][ty][tx]=rezultat;

            __syncthreads(); // everyone is done reading before podB (or podA) is replaced
        }
        Ai+=16;                // next tile of A: 16 columns to the right
        Bi+=velicinaReda<<4;   // next row of B tiles: 16 rows down
    }

    // First output element of this thread; the other four are 16 columns apart.
    const int prvi=((blockIdx.x*80) + tx) + ((blockIdx.y<<4) + ty)*velicinaReda;
    for(t=0;t<5;++t)
        C[prvi+(t<<4)]=rez[t][ty][tx];
}
/**
 * Tiled matrix product, 2 x 2 tiles per block: each 16x16 block computes a
 * 32 x 32 patch of C. Two vertically adjacent tiles of A are loaded per outer
 * step and multiplied with two horizontally adjacent tiles of B.
 *
 * Launch layout: 16x16 threads per block; blockIdx.x selects the 32-column
 * strip, blockIdx.y the 32-row band (no bounds checks). Matrices are square,
 * dense, row-major with a row length of kolicina*16 elements.
 *
 * Fixes relative to the previous version:
 *  - Data race: the barriers after the compute loops had been commented out,
 *    so podB (and, on the next outer iteration, podA1/podA2) could be
 *    overwritten while other warps were still reading them. Every compute
 *    stage is now followed by __syncthreads().
 *  - Tiles and accumulators are indexed [threadIdx.y][threadIdx.x], so threads
 *    that differ only in threadIdx.x touch consecutive shared-memory words
 *    (the old [x][y] layout made them 16 words apart -> bank conflicts).
 *  - The two copy-pasted stages are folded into one loop; the partial sums
 *    live in rez[2][2][16][16] (same footprint as rezA1B1..rezA2B2), indexed
 *    [A tile][B tile].
 * Each thread only ever touches its own accumulator slots, so the barriers
 * that used to follow the zero-initialisation and precede the final stores
 * are not needed.
 *
 * @param A, B      input matrices in global memory
 * @param C         output matrix in global memory
 * @param kolicina  number of 16-wide tiles per matrix row/column
 */
__global__ void mnozenjeMatrica2_2(int *A, int *B, int *C, int kolicina){
    __shared__ int podA[2][16][16], podB[16][16];
    __shared__ int rez[2][2][16][16];
    const int tx=threadIdx.x;
    const int ty=threadIdx.y;
    const int velicinaReda=kolicina<<4;          // elements per matrix row
    const int korakTile=velicinaReda<<4;         // distance of 16 rows, in elements
    int a,b,tv,v,r1,r2;

    for(a=0;a<2;++a)
        for(b=0;b<2;++b)
            rez[a][b][ty][tx]=0;

    // Global addresses of the element this thread loads into each tile.
    int Ai=((blockIdx.y<<5) + ty)*velicinaReda + tx; // 32 = 2*16 (two tile heights)
    int Bi= (blockIdx.x<<5) + tx + ty*velicinaReda;  // 32 = 2*16 (two tile widths)

    for(v=0;v<kolicina;++v){
        podA[0][ty][tx]=A[ Ai ];
        podA[1][ty][tx]=A[ Ai+korakTile ];
        for(b=0;b<2;++b){
            podB[ty][tx]=B[ Bi+(b<<4) ]; // b-th tile to the right
            __syncthreads(); // tiles of A and B are complete in shared memory

            r1=rez[0][b][ty][tx];
            r2=rez[1][b][ty][tx];
            for(tv=0;tv<16;++tv){
                r1+=podA[0][ty][tv] * podB[tv][tx];
                r2+=podA[1][ty][tv] * podB[tv][tx];
            }
            rez[0][b][ty][tx]=r1;
            rez[1][b][ty][tx]=r2;

            __syncthreads(); // everyone is done reading before podB (or podA) is replaced
        }
        Ai+=16;          // next tiles of A: 16 columns to the right
        Bi+=korakTile;   // next row of B tiles: 16 rows down
    }

    // Top-left output element of this thread within the 32 x 32 patch.
    const int prvi=(blockIdx.x*32 + tx) + (blockIdx.y*32 + ty)*velicinaReda;
    for(a=0;a<2;++a)
        for(b=0;b<2;++b)
            C[prvi + a*korakTile + (b<<4)]=rez[a][b][ty][tx];
}
/**
 * Tiled matrix product, 3 x 3 tiles per block: each 16x16 block computes a
 * 48 x 48 patch of C. Three vertically adjacent tiles of A are loaded per
 * outer step and multiplied with three horizontally adjacent tiles of B.
 *
 * Launch layout: 16x16 threads per block; blockIdx.x selects the 48-column
 * strip, blockIdx.y the 48-row band (no bounds checks). Matrices are square,
 * dense, row-major with a row length of kolicina*16 elements.
 *
 * Fixes relative to the previous version:
 *  - Data race: the barriers after the compute loops had been commented out,
 *    so podB (and, on the next outer iteration, podA1..podA3) could be
 *    overwritten while other warps were still reading them. Every compute
 *    stage is now followed by __syncthreads().
 *  - Tiles and accumulators are indexed [threadIdx.y][threadIdx.x], so threads
 *    that differ only in threadIdx.x touch consecutive shared-memory words
 *    (the old [x][y] layout made them 16 words apart -> bank conflicts).
 *  - The three copy-pasted stages are folded into one loop; the partial sums
 *    live in rez[3][3][16][16] (same footprint as rezA1B1..rezA3B3), indexed
 *    [A tile][B tile].
 * Each thread only ever touches its own accumulator slots, so the barriers
 * that used to follow the zero-initialisation and precede the final stores
 * are not needed.
 *
 * @param A, B      input matrices in global memory
 * @param C         output matrix in global memory
 * @param kolicina  number of 16-wide tiles per matrix row/column
 */
__global__ void mnozenjeMatrica3_3(int *A, int *B, int *C, int kolicina){
    __shared__ int podA[3][16][16], podB[16][16];
    __shared__ int rez[3][3][16][16];
    const int tx=threadIdx.x;
    const int ty=threadIdx.y;
    const int velicinaReda=kolicina<<4;          // elements per matrix row
    const int korakTile=velicinaReda<<4;         // distance of 16 rows, in elements
    int a,b,tv,v,r1,r2,r3;

    for(a=0;a<3;++a)
        for(b=0;b<3;++b)
            rez[a][b][ty][tx]=0;

    // Global addresses of the element this thread loads into each tile.
    int Ai=(blockIdx.y*48 + ty)*velicinaReda + tx; // 48 = 3*16 (three tile heights)
    int Bi=blockIdx.x*48 + tx + ty*velicinaReda;   // 48 = 3*16 (three tile widths)

    for(v=0;v<kolicina;++v){
        podA[0][ty][tx]=A[ Ai ];
        podA[1][ty][tx]=A[ Ai+korakTile ];
        podA[2][ty][tx]=A[ Ai+2*korakTile ];
        for(b=0;b<3;++b){
            podB[ty][tx]=B[ Bi+(b<<4) ]; // b-th tile to the right
            __syncthreads(); // tiles of A and B are complete in shared memory

            r1=rez[0][b][ty][tx];
            r2=rez[1][b][ty][tx];
            r3=rez[2][b][ty][tx];
            for(tv=0;tv<16;++tv){
                r1+=podA[0][ty][tv] * podB[tv][tx];
                r2+=podA[1][ty][tv] * podB[tv][tx];
                r3+=podA[2][ty][tv] * podB[tv][tx];
            }
            rez[0][b][ty][tx]=r1;
            rez[1][b][ty][tx]=r2;
            rez[2][b][ty][tx]=r3;

            __syncthreads(); // everyone is done reading before podB (or podA) is replaced
        }
        Ai+=16;          // next tiles of A: 16 columns to the right
        Bi+=korakTile;   // next row of B tiles: 16 rows down
    }

    // Top-left output element of this thread within the 48 x 48 patch.
    const int prvi=(blockIdx.x*48 + tx) + (blockIdx.y*48 + ty)*velicinaReda;
    for(a=0;a<3;++a)
        for(b=0;b<3;++b)
            C[prvi + a*korakTile + (b<<4)]=rez[a][b][ty][tx];
}
/**
 * Tiled matrix product, 2 x 6 tiles per block: each 16x16 block computes a
 * patch of C that is 32 rows high and 96 columns wide. Two vertically adjacent
 * tiles of A are loaded per outer step and multiplied with six horizontally
 * adjacent tiles of B.
 *
 * Launch layout: 16x16 threads per block; blockIdx.x selects the 96-column
 * strip, blockIdx.y the 32-row band (no bounds checks). Matrices are square,
 * dense, row-major with a row length of kolicina*16 elements.
 *
 * Fixes relative to the previous version:
 *  - Data race: the barriers after the compute loops had been commented out,
 *    so podB (and, on the next outer iteration, podA1/podA2) could be
 *    overwritten while other warps were still reading them. Every compute
 *    stage is now followed by __syncthreads().
 *  - Tiles and accumulators are indexed [threadIdx.y][threadIdx.x], so threads
 *    that differ only in threadIdx.x touch consecutive shared-memory words
 *    (the old [x][y] layout made them 16 words apart -> bank conflicts).
 *  - The six copy-pasted stages are folded into one loop; the partial sums
 *    live in rez[2][6][16][16] (same footprint as rezA1B1..rezA2B6), indexed
 *    [A tile][B tile].
 * Each thread only ever touches its own accumulator slots, so the barriers
 * that used to follow the zero-initialisation and precede the final stores
 * are not needed.
 *
 * @param A, B      input matrices in global memory
 * @param C         output matrix in global memory
 * @param kolicina  number of 16-wide tiles per matrix row/column
 */
__global__ void mnozenjeMatrica2_6(int *A, int *B, int *C, int kolicina){
    __shared__ int podA[2][16][16], podB[16][16];
    __shared__ int rez[2][6][16][16];
    const int tx=threadIdx.x;
    const int ty=threadIdx.y;
    const int velicinaReda=kolicina<<4;          // elements per matrix row
    const int korakTile=velicinaReda<<4;         // distance of 16 rows, in elements
    int a,b,tv,v,r1,r2;

    for(a=0;a<2;++a)
        for(b=0;b<6;++b)
            rez[a][b][ty][tx]=0;

    // Global addresses of the element this thread loads into each tile.
    int Ai=((blockIdx.y<<5) + ty)*velicinaReda + tx; // 32 = 2*16 (two tile heights)
    int Bi=blockIdx.x*96 + tx + ty*velicinaReda;     // 96 = 6*16 (six tile widths)

    for(v=0;v<kolicina;++v){
        podA[0][ty][tx]=A[ Ai ];
        podA[1][ty][tx]=A[ Ai+korakTile ];
        for(b=0;b<6;++b){
            podB[ty][tx]=B[ Bi+(b<<4) ]; // b-th tile to the right
            __syncthreads(); // tiles of A and B are complete in shared memory

            r1=rez[0][b][ty][tx];
            r2=rez[1][b][ty][tx];
            for(tv=0;tv<16;++tv){
                r1+=podA[0][ty][tv] * podB[tv][tx];
                r2+=podA[1][ty][tv] * podB[tv][tx];
            }
            rez[0][b][ty][tx]=r1;
            rez[1][b][ty][tx]=r2;

            __syncthreads(); // everyone is done reading before podB (or podA) is replaced
        }
        Ai+=16;          // next tiles of A: 16 columns to the right
        Bi+=korakTile;   // next row of B tiles: 16 rows down
    }

    // Top-left output element of this thread within the 32 x 96 patch.
    const int prvi=(blockIdx.x*96 + tx) + ((blockIdx.y<<5) + ty)*velicinaReda;
    for(a=0;a<2;++a)
        for(b=0;b<6;++b)
            C[prvi + a*korakTile + (b<<4)]=rez[a][b][ty][tx];
}
#endif
GPU26.bmp (58 Bytes)
GPU33.bmp (58 Bytes)
GPU22.bmp (58 Bytes)
GPU16.bmp (58 Bytes)
GPU15.bmp (58 Bytes)
GPU11.bmp (58 Bytes)
GPU1.bmp (58 Bytes)