some pb with N=40000 i think it s ok but with N=50000 N2=50000*49999/2 is wrong i am not good on c so i dont find the good type
and the float NBbou= sizereel/sizemax; dont give a float i need a float External Image
the idea is to do always the same number of addition (x time for do all)
a way to see if it s good for N=10 you must have 9 “0” 8 “1” 7 “2” 6 “3” 5 “4” 4 “5” 3 “6” 2 “7” 1 “8”
for N=40000 must have 39999 “0” 39998 “1” … 1 “39998”
#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <math.h>
#include "cutil_inline.h"
__global__ void square_array(int *a,int *b, int *c, int N,int N2,int N3,int bou)
{
int sh= threadIdx.x+32*threadIdx.y;
int tid = 512 * blockIdx.x + 1048576 * blockIdx.y + sh;
if ( tid<N3)
{
int linIdx=N2 - tid-bou;
int i = int(N - 0.5 - sqrt(0.25 - 2 * (1 - linIdx)));
int z =(N+N-1-i)*i;
int j= tid+bou - z/2 + 1 + i;
if (i==j)
{
i=i-1;
j=N-1;
}
c[tid]=a[i]+b[j];
}
}
// main routine that executes on the host
int main(void)
{
int *memoiregraphique1; // Pointer to host & device arrays
int *memoiregraphique2; // Pointer to host & device arrays
int *memoiregraphique3; // Pointer to host & device arrays
int *memoirecpu1; // Pointer to host & device arrays
int *memoirecpu2; // Pointer to host & device arrays
int *memoirecpu3; // Pointer to host & device arrays
long N;
long N2;
long N3;
N=40000; // max 40000 because 50000 * 49999/2 is false
N2=N*(N-1)/2;
size_t size = N * sizeof(int);
size_t sizereel = N2 * sizeof(int);
N3 =33554432; //number of result who can go in memory 33554432 or less i think for have more
size_t sizemax =N3*sizeof(int);
memoirecpu1 = (int *)malloc(size); // Allocate array on host
memoirecpu2 = (int *)malloc(size); // Allocate array on host
memoirecpu3 = (int *)malloc(sizemax); // Allocate array on host
for (int i=0; i<N; i=i+1)
{
memoirecpu1[i]=i;
memoirecpu2[i]=0;
}
for (int i=0; i<N3; i=i+1)
{
memoirecpu3[i]=0;
}
cutilSafeCall( cudaThreadSynchronize() );
cudaMalloc((void **) &memoiregraphique1, size); // Allocate array on device
cudaMalloc((void **) &memoiregraphique2, size); // Allocate array on device
cudaMalloc((void **) &memoiregraphique3, sizemax ); // Allocate array on device
cudaMemcpy(memoiregraphique1, memoirecpu1, size, cudaMemcpyHostToDevice);
cudaMemcpy(memoiregraphique2, memoirecpu2, size, cudaMemcpyHostToDevice);
int bou=0;
int N4=N3;
float NBbou= sizereel/sizemax; //must cast in float i dont know do 760/376 must give2.02 and not 2 :(
if (NBbou==0 )
{
NBbou=1;
}
//
for ( bou=0 ; bou< NBbou ;bou=bou+1)
{
//number of thread never change =33554432
int threadnumber =33554432;
if (bou+1 >NBbou)
{
N4=N2-bou*N3;
}
square_array <<< dim3(2048,32,1),dim3(32,16,1) >>> (memoiregraphique1, memoiregraphique2, memoiregraphique3,N,N2,N3,bou*N3);
cudaMemcpy(memoirecpu3, memoiregraphique3, sizemax, cudaMemcpyDeviceToHost);
for (int i=0; i<N4; i=i+1)
{
//-----to do save memoirecpu3 on hard drive-------------------------
}
}
free(memoirecpu1); cudaFree(memoiregraphique1);
free(memoirecpu2); cudaFree(memoiregraphique2);
free(memoirecpu3); cudaFree(memoiregraphique3);
}