I found a program and am trying to push my graphics card to its maximum.
So far I can do 33,550,000 multiplications = 134 MB of memory (33,550,000 floats × 4 bytes), but my card has 512 MB. Can I do more using blockIdx.y, and if so, how?
Are all 33,550,000 multiplications done at the same time, or are 512 done first, then 512 more, and so on?
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include "cutil_inline.h"
// Kernel that executes on the CUDA device: squares a[0..N-1] in place.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Each thread starts at its flat global index and then advances by the total
// number of launched threads, so every element is processed exactly once even
// when the grid contains fewer threads than N (e.g. when the grid size is
// capped by the device's gridDim.x limit). When the grid covers N, each
// thread handles at most one element, exactly as a plain bounds-checked
// kernel would.
//
// Parameters:
//   a  pointer to at least N floats that the device can read and write
//   N  number of elements to square; values <= 0 make the kernel a no-op
//
// Uses no shared memory and performs no inter-thread synchronization.
__global__ void square_array(float *a, int N)
{
    // A negative N must not turn into a huge unsigned element count.
    const size_t count = (N > 0) ? (size_t)N : 0;

    // Total number of threads in the launch = distance between the elements
    // visited by one thread on successive iterations.
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < count;
         idx += stride) {
        a[idx] = a[idx] * a[idx];
    }
}
// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the operation so the failing step can be identified.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// main routine that executes on the host.
//
// Fills a host array with 0, 1, 2, ..., copies it to the device, squares every
// element with square_array, copies the result back, prints the last few
// elements and the kernel execution time, then releases all resources.
// Returns 0 on success; exits with EXIT_FAILURE if any allocation, copy or
// the kernel launch fails.
int main(void)
{
    float *memoirecpu, *memoiregraphique; // Pointer to host & device arrays
    const int N = 33550000; // Number of elements in arrays
    size_t size = N * sizeof(float);

    memoirecpu = (float *)malloc(size); // Allocate array on host
    if (memoirecpu == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        return EXIT_FAILURE;
    }
    check_cuda(cudaMalloc((void **) &memoiregraphique, size), "cudaMalloc"); // Allocate array on device

    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) memoirecpu[i] = (float)i;
    check_cuda(cudaMemcpy(memoiregraphique, memoirecpu, size, cudaMemcpyHostToDevice),
               "host-to-device copy");

    // Do calculation on device: one thread per element, rounding the block
    // count up so a partial final block covers the tail of the array.
    int block_size = 512;
    int n_blocks = (N + block_size - 1) / block_size;

    // A kernel launch returns to the host before the kernel has finished, so
    // host clock readings taken around the launch would not include the
    // kernel's execution. Events recorded in the same stream as the kernel
    // bracket the work as the device executes it.
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    square_array <<< n_blocks, block_size >>> (memoiregraphique, N);
    check_cuda(cudaGetLastError(), "square_array launch"); // invalid launch configurations are reported here
    check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)"); // wait until the kernel is done

    float elapsed_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&elapsed_ms, start, stop), "cudaEventElapsedTime");

    // Retrieve result from device and store it in host array
    check_cuda(cudaMemcpy(memoirecpu, memoiregraphique, size, cudaMemcpyDeviceToHost),
               "device-to-host copy");

    // Print results (the last four elements, or fewer if N is smaller)
    for (int i = (N > 4 ? N - 4 : 0); i < N; i++) printf("%d %f\n", i, memoirecpu[i]);
    printf("Kernel time: %.3f ms\n", elapsed_ms);

    // Cleanup
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    free(memoirecpu);
    check_cuda(cudaFree(memoiregraphique), "cudaFree");
    return 0;
}