Sum vectors

some pb with N=40000 i think it s ok but with N=50000 N2=50000*49999/2 is wrong i am not good on c so i dont find the good type

and the float NBbou= sizereel/sizemax; dont give a float i need a float External Image

the idea is to do always the same number of addition (x time for do all)

a way to see if it s good for N=10 you must have 9 “0” 8 “1” 7 “2” 6 “3” 5 “4” 4 “5” 3 “6” 2 “7” 1 “8”

for N=40000 must have 39999 “0” 39998 “1” … 1 “39998”

#include <stdio.h>  

 #include <cuda.h>  

 #include <time.h>

 #include <math.h>

 #include "cutil_inline.h"

__global__ void square_array(int *a,int *b, int *c, int N,int N2,int N3,int bou)  

 {  

      int sh=  threadIdx.x+32*threadIdx.y;

	   int tid = 512 * blockIdx.x +  1048576 * blockIdx.y + sh;

	

if ( tid<N3)

	{

	int linIdx=N2 - tid-bou;

int i = int(N - 0.5 - sqrt(0.25 - 2 * (1 - linIdx)));

int z =(N+N-1-i)*i;

   int j= tid+bou - z/2 + 1 + i;

if (i==j)

     {

i=i-1;

       j=N-1;

     }

c[tid]=a[i]+b[j];	

}  

}

// main routine that executes on the host  

 int main(void)  

 {  

   int  *memoiregraphique1;  // Pointer to host & device arrays  

   int  *memoiregraphique2;  // Pointer to host & device arrays  

   int  *memoiregraphique3;  // Pointer to host & device arrays  

   int  *memoirecpu1;  // Pointer to host & device arrays  

   int  *memoirecpu2;  // Pointer to host & device arrays  

   int  *memoirecpu3;  // Pointer to host & device arrays  

long N;

long N2;

long N3;

N=40000;  // max 40000 because 50000 * 49999/2 is false

N2=N*(N-1)/2;  

size_t size = N *  sizeof(int);  

   size_t sizereel = N2 *  sizeof(int);  

N3 =33554432;  //number of result who can go in memory 33554432 or less i think for have more

   size_t sizemax =N3*sizeof(int); 

memoirecpu1 = (int *)malloc(size);        // Allocate array on host  

   memoirecpu2 = (int *)malloc(size);        // Allocate array on host  

   memoirecpu3 = (int *)malloc(sizemax);        // Allocate array on host  

for (int i=0; i<N; i=i+1) 

 {

   memoirecpu1[i]=i;

   memoirecpu2[i]=0;

}

   for (int i=0; i<N3; i=i+1) 

{

   memoirecpu3[i]=0;

}

    cutilSafeCall( cudaThreadSynchronize() );

    cudaMalloc((void **) &memoiregraphique1, size);   // Allocate array on device  

    cudaMalloc((void **) &memoiregraphique2, size);   // Allocate array on device  

    cudaMalloc((void **) &memoiregraphique3, sizemax );   // Allocate array on device  

   cudaMemcpy(memoiregraphique1, memoirecpu1, size, cudaMemcpyHostToDevice);

   cudaMemcpy(memoiregraphique2, memoirecpu2, size, cudaMemcpyHostToDevice);

int bou=0;

int N4=N3;

float NBbou= sizereel/sizemax;  //must cast in float i dont know do 760/376 must give2.02 and not 2 :(

if (NBbou==0 )

  {

  NBbou=1;

  }

// 

for ( bou=0 ; bou< NBbou ;bou=bou+1)

 {

//number of thread never change =33554432

    int threadnumber =33554432;

    if (bou+1 >NBbou)

    {

     N4=N2-bou*N3;

    }

    square_array <<< dim3(2048,32,1),dim3(32,16,1) >>> (memoiregraphique1, memoiregraphique2, memoiregraphique3,N,N2,N3,bou*N3);  

    cudaMemcpy(memoirecpu3, memoiregraphique3, sizemax, cudaMemcpyDeviceToHost);  

   for (int i=0; i<N4; i=i+1) 

 {

//-----to do save memoirecpu3 on hard drive-------------------------

}

}

free(memoirecpu1); cudaFree(memoiregraphique1);  

   free(memoirecpu2); cudaFree(memoiregraphique2);  

   free(memoirecpu3); cudaFree(memoiregraphique3);  

}

[font=“Courier New”]long[/font] is a 32 bit type on 32 bit systems. Use [font=“Courier New”]long long[/font] instead.

long long ok for c

but in cuda i must have interger like 1234567890 or more so what type ?
and sqrt(1234567890) is good on cuda ?

Sorry, I don’t understand what you are trying to say. Why is [font=“Courier New”]long long[/font] not good for CUDA?

The algorithm you want is almost the easiest thing to do with cuda, so i think you should read more about cuda before you start.

check out this post how to achieve a unique index using 1D-3D grid dimensions.

@pasoleatis

With Cuda 2.0 you can have up to: 65.535³ * 1.024 = 288.217.182.213.504.000 Threads

(64 limits only the z-Dimension of a Block but not of a Grid, which is irrelevant here)