new in cuda some question

i found a program and try to go to the max of graphic card
so i can do 33 550 000 multiplication = 134 GO of memory but my card have 512 mo can i do more using the blockIdx.Y and how ?
all the 33 550 000 are do in same time ? or 512 are do and after 512 more … ?

#include <stdio.h>
#include <cuda.h>
#include <TIME.H>

#include “cutil_inline.h”
// Kernel that executes on the CUDA device.
// Squares the first N elements of `a` in place, one element per thread.
// Expects a 1D launch (blocks and threads laid out along x only) with at
// least N threads in total; surplus threads in the last block do nothing.
//   a : device pointer to at least N floats
//   N : number of elements to square
__global__ void square_array(float *a, int N)
{
    // Flat global index of this thread across the whole 1D grid.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid rarely divides N evenly, so guard the tail.
    if (idx < N) {
        a[idx] = a[idx] * a[idx];
    }
}

// main routine that executes on the host.
// Fills a host array with 0..N-1, squares every element on the GPU with
// square_array, prints the last four results and then the number of clock()
// ticks spent launching and running the kernel.
// Returns 0 on success, 1 if a host allocation or any CUDA call fails.
int main(void)
{
    const int N = 33550000;                  // Number of elements in arrays
    const size_t size = N * sizeof(float);   // Bytes in each array
    float *memoiregraphique = NULL;          // Device array
    clock_t t1 = 0;
    clock_t t2 = 0;

    float *memoirecpu = (float *)malloc(size);   // Host array
    if (memoirecpu == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
        return 1;
    }

    // Initialize host array
    for (int i = 0; i < N; i++) memoirecpu[i] = (float)i;

    // Allocate the device array and copy the input to it. Every later step
    // only runs while `err` is still cudaSuccess, so the first failure is
    // the one reported below.
    cudaError_t err = cudaMalloc((void **)&memoiregraphique, size);
    if (err == cudaSuccess) {
        err = cudaMemcpy(memoiregraphique, memoirecpu, size, cudaMemcpyHostToDevice);
    }

    // Do calculation on device:
    if (err == cudaSuccess) {
        const int block_size = 512;
        // Ceil-division so the last, partially filled block is included.
        const int n_blocks = (N + block_size - 1) / block_size;

        t1 = clock();
        square_array<<<n_blocks, block_size>>>(memoiregraphique, N);
        err = cudaGetLastError();            // Reports an invalid launch configuration
        if (err == cudaSuccess) {
            // The launch returns immediately; wait for the kernel to finish
            // so the timing covers the computation and not only the launch.
            err = cudaDeviceSynchronize();
        }
        t2 = clock();
    }

    // Retrieve result from device and store it in host array
    if (err == cudaSuccess) {
        err = cudaMemcpy(memoirecpu, memoiregraphique, size, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        // Print results
        for (int i = N - 4; i < N; i++) printf("%d %f\n", i, memoirecpu[i]);
        printf("%ld", (long)(t2 - t1));
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Cleanup (cudaFree accepts a null pointer when cudaMalloc did not succeed)
    free(memoirecpu);
    cudaFree(memoiregraphique);

    return err == cudaSuccess ? 0 : 1;
}

Hello,

I guess this program does not run the way you’d like…

the threads are organized in a grid of 65536x65536x64 so you have 3 components on which you can address your memory informations and in this case it’s enough an X x Y grid:

If you want to perform operations on an element you should do this:

// Kernel that executes on the CUDA device.
// Squares the first N elements of `a` in place, one element per thread,
// for a 2D launch. The (x, y) thread coordinates are flattened row by row,
// where one row holds gridDim.x * blockDim.x threads, so the launch must
// provide at least N threads in total. Threads whose flat index is >= N
// do nothing.
//   a : device pointer to at least N floats
//   N : number of elements to square
__global__ void square_array(float *a, int N)
{
    // Global coordinates of this thread inside the 2D launch.
    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const int iy = blockIdx.y * blockDim.y + threadIdx.y;

    // Row width comes from the launch itself instead of a hard-coded 65536,
    // so every (ix, iy) pair maps to a distinct element for any grid shape.
    // 64-bit arithmetic keeps the flat index from overflowing on large grids.
    const long long row_width = (long long)gridDim.x * blockDim.x;
    const long long idx = ix + row_width * iy;

    if (idx < N) {
        a[idx] = a[idx] * a[idx];
    }
}

then you have to call this kernel telling CUDA you need to reference to a 2d grid:

Ok you have 33550000 elements. That is 65536 x 511.932etc (round it to 512)

your call will be:

square_array<<<dim3(65536/16,512/16),dim3(16,16)>>>(memoiregraphique, N);

                *1       *2           *A *B

*1 is 65536/16=4096 blocks on X coord

*2 is 512/16 =32 blocks on y coord

*A is 16 threads per block on x coord

*B is 16 threads per block on y coord

so in a block there will be 256 threads running

That’s all :-)

question all 30 000 000 square are do realy on the same time ??

thanks works but i still cant go more than 65536 block even i do 809632
for number 33 618 432 he return x and not x
x

#include <stdio.h>
#include <cuda.h>
#include <TIME.H>

#include “cutil_inline.h”
// Kernel that executes on the CUDA device.
// Squares the first N elements of `a` in place, one element per thread,
// for a 2D launch. The (x, y) thread coordinates are flattened row by row,
// where one row holds gridDim.x * blockDim.x threads, so the launch must
// provide at least N threads in total. Threads whose flat index is >= N
// do nothing.
//   a : device pointer to at least N floats
//   N : number of elements to square
__global__ void square_array(float *a, int N)
{
    // Global coordinates of this thread inside the 2D launch.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int idy = blockIdx.y * blockDim.y + threadIdx.y;

    // Row width is taken from the launch. With a fixed 65536 and a launch
    // that is wider than 65536 threads in x, several threads map to the same
    // element and the tail of the array is never reached.
    // 64-bit arithmetic keeps the flat index from overflowing on large grids.
    const long long row_width = (long long)gridDim.x * blockDim.x;
    const long long id = idx + row_width * idy;

    if (id < N) {
        a[id] = a[id] * a[id];
    }
}

// main routine that executes on the host.
// Fills a host array with 0..N-1, squares every element on the GPU with
// square_array (2D launch), then prints every element whose result is below
// 98% of the expected square, followed by the number of clock() ticks spent
// launching and running the kernel.
// Returns 0 on success, 1 if a host allocation or any CUDA call fails.
int main(void)
{
    const int N = 34550000;                  // Number of elements in arrays
    const size_t size = N * sizeof(float);   // Bytes in each array
    float *memoiregraphique = NULL;          // Device array
    clock_t t1 = 0;
    clock_t t2 = 0;

    float *memoirecpu = (float *)malloc(size);   // Host array
    if (memoirecpu == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
        return 1;
    }

    // Initialize host array
    for (int i = 0; i < N; i++) memoirecpu[i] = (float)i;

    // Allocate the device array and copy the input to it. Every later step
    // only runs while `err` is still cudaSuccess, so the first failure is
    // the one reported below.
    cudaError_t err = cudaMalloc((void **)&memoiregraphique, size);
    if (err == cudaSuccess) {
        err = cudaMemcpy(memoiregraphique, memoirecpu, size, cudaMemcpyHostToDevice);
    }

    // Do calculation on device:
    if (err == cudaSuccess) {
        // One row of the launch is 65536 threads wide in x (4096 blocks of
        // 16 threads); the y extent is derived from N so that
        // row_width * rows >= N and every element gets exactly one thread.
        const int row_width = 65536;
        const dim3 block(16, 16);
        const int rows = (N + row_width - 1) / row_width;
        const dim3 grid(row_width / block.x, (rows + block.y - 1) / block.y);

        t1 = clock();
        square_array<<<grid, block>>>(memoiregraphique, N);
        err = cudaGetLastError();            // Reports an invalid launch configuration
        if (err == cudaSuccess) {
            // The launch returns immediately; wait for the kernel to finish
            // so the timing covers the computation and not only the launch.
            err = cudaDeviceSynchronize();
        }
        t2 = clock();
    }

    // Retrieve result from device and store it in host array
    if (err == cudaSuccess) {
        err = cudaMemcpy(memoirecpu, memoiregraphique, sizeof(float) * N, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        // Print only the suspicious results: elements that are clearly
        // smaller than i*i (2% tolerance) were not squared by the kernel.
        for (int i = 1; i < N; i++) {
            const float j = (float)i;
            if (memoirecpu[i] < (j * j * .98)) {
                printf("%d %f\n", i, memoirecpu[i]);
            }
        }
        printf("%ld", (long)(t2 - t1));
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Cleanup (cudaFree accepts a null pointer when cudaMalloc did not succeed)
    free(memoirecpu);
    cudaFree(memoiregraphique);

    return err == cudaSuccess ? 0 : 1;
}

square_array <<< dim3(4096,50,1),dim3(16,32,1) >>> (memoiregraphique, N);
i understand that 409616 must be <=65536 5032 <=65536
now i do 104 857 600 square for 419 Mo more crash
it s works thanks thanks

#include <stdio.h>
#include <cuda.h>
#include <TIME.H>

#include “cutil_inline.h”
// Kernel that executes on the CUDA device.
// Squares the first N elements of `a` in place, one element per thread,
// for a 2D launch. The (x, y) thread coordinates are flattened row by row,
// where one row holds gridDim.x * blockDim.x threads, so the launch must
// provide at least N threads in total. Threads whose flat index is >= N
// do nothing.
//   a : device pointer to at least N floats
//   N : number of elements to square
__global__ void square_array(float *a, int N)
{
    // Global coordinates of this thread inside the 2D launch.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int idy = blockIdx.y * blockDim.y + threadIdx.y;

    // Row width is taken from the launch instead of a hard-coded 65536, so
    // the mapping stays one thread per element for any grid shape.
    // 64-bit arithmetic keeps the flat index from overflowing on large grids.
    const long long row_width = (long long)gridDim.x * blockDim.x;
    const long long id = idx + row_width * idy;

    if (id < N) {
        a[id] = a[id] * a[id];
    }
}

// main routine that executes on the host.
// Fills a host array with 0..N-1, squares every element on the GPU with
// square_array (2D launch), then prints every element whose result is below
// 98% of the expected square, followed by the number of clock() ticks spent
// launching and running the kernel.
// Returns 0 on success, 1 if a host allocation or any CUDA call fails.
int main(void)
{
    const int N = 104857600;                 // Number of elements in arrays
    const size_t size = N * sizeof(float);   // Bytes in each array
    float *memoiregraphique = NULL;          // Device array
    clock_t t1 = 0;
    clock_t t2 = 0;

    float *memoirecpu = (float *)malloc(size);   // Host array
    if (memoirecpu == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
        return 1;
    }

    // Initialize host array
    for (int i = 0; i < N; i++) memoirecpu[i] = (float)i;

    // Allocate the device array and copy the input to it. An array this
    // large may not fit on the card, so the result of cudaMalloc matters:
    // every later step only runs while `err` is still cudaSuccess.
    cudaError_t err = cudaMalloc((void **)&memoiregraphique, size);
    if (err == cudaSuccess) {
        err = cudaMemcpy(memoiregraphique, memoirecpu, size, cudaMemcpyHostToDevice);
    }

    // Do calculation on device:
    if (err == cudaSuccess) {
        // One row of the launch is 65536 threads wide in x (4096 blocks of
        // 16 threads); the y extent is derived from N so that
        // row_width * rows >= N. For this N that gives a 4096 x 50 grid of
        // 16 x 32 blocks.
        const int row_width = 65536;
        const dim3 block(16, 32, 1);
        const int rows = (N + row_width - 1) / row_width;
        const dim3 grid(row_width / block.x, (rows + block.y - 1) / block.y, 1);

        t1 = clock();
        square_array<<<grid, block>>>(memoiregraphique, N);
        err = cudaGetLastError();            // Reports an invalid launch configuration
        if (err == cudaSuccess) {
            // The launch returns immediately; wait for the kernel to finish
            // so the timing covers the computation and not only the launch.
            err = cudaDeviceSynchronize();
        }
        t2 = clock();
    }

    // Retrieve result from device and store it in host array
    if (err == cudaSuccess) {
        err = cudaMemcpy(memoirecpu, memoiregraphique, sizeof(float) * N, cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        // Print only the suspicious results: elements that are clearly
        // smaller than i*i (2% tolerance) were not squared by the kernel.
        for (int i = 1; i < N; i++) {
            const float j = (float)i;
            if (memoirecpu[i] < (j * j * .98)) {
                printf("%d %f\n", i, memoirecpu[i]);
            }
        }
        printf("%ld", (long)(t2 - t1));
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Cleanup (cudaFree accepts a null pointer when cudaMalloc did not succeed)
    free(memoirecpu);
    cudaFree(memoiregraphique);

    return err == cudaSuccess ? 0 : 1;
}