New to CUDA: some questions

cricri1 · February 3, 2011, 1:27pm

I found a program and am trying to push it to the maximum my graphics card can handle.
So far I can do 33,550,000 multiplications, which uses about 134 MB of memory (33,550,000 floats × 4 bytes), but my card has 512 MB. Can I do more by using blockIdx.y, and if so, how?
Are all 33,550,000 multiplications done at the same time, or are 512 done first, then 512 more, and so on?

#include <stdio.h>
#include <cuda.h>
#include <TIME.H>

#include “cutil_inline.h”
// Kernel that executes on the CUDA device.
//
// Squares each of the first N elements of `a` in place.
//   a: device pointer to at least N floats.
//   N: number of elements to process; nothing is touched when N <= 0.
//
// Expected launch: a 1D grid of 1D blocks. Each thread starts at its global
// index and then advances by the total number of launched threads, so every
// element is covered even when the grid holds fewer threads than N.
__global__ void square_array(float *a, int N)
{
    // 64-bit arithmetic so the index cannot wrap for large grids.
    const long long stride = (long long)gridDim.x * blockDim.x;
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (; idx < N; idx += stride)
        a[idx] = a[idx] * a[idx];
}

// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// main routine that executes on the host.
//
// Fills a host array with 0..N-1, squares it on the device with
// square_array, prints the last four results and then the number of clock()
// ticks measured around the kernel.
// Returns 0 on success, 1 if an allocation or a CUDA call failed.
int main(void)
{
    const int N = 33550000;                     // Number of elements in arrays
    const size_t size = N * sizeof(float);
    float *memoirecpu = (float *)malloc(size);  // Array on host
    float *memoiregraphique = NULL;             // Array on device
    int status = 1;

    if (memoirecpu == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
        return 1;
    }

    do {
        if (!cuda_ok(cudaMalloc((void **)&memoiregraphique, size), "cudaMalloc")) break;

        // Initialize host array and copy it to CUDA device
        for (int i = 0; i < N; i++) memoirecpu[i] = (float)i;
        if (!cuda_ok(cudaMemcpy(memoiregraphique, memoirecpu, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)")) break;

        // Do calculation on device: one thread per element, rounded up to
        // a whole number of blocks.
        const int block_size = 512;
        const int n_blocks = (N + block_size - 1) / block_size;

        const clock_t t1 = clock();
        square_array<<<n_blocks, block_size>>>(memoiregraphique, N);
        // An invalid launch configuration is only reported here.
        if (!cuda_ok(cudaGetLastError(), "kernel launch")) break;
        // The launch returns before the kernel has finished; wait for it so
        // the interval covers the execution and not only the launch call.
        if (!cuda_ok(cudaDeviceSynchronize(), "kernel execution")) break;
        const clock_t t2 = clock();
        const long t3 = (long)(t2 - t1);

        // Retrieve result from device and store it in host array
        if (!cuda_ok(cudaMemcpy(memoirecpu, memoiregraphique, size, cudaMemcpyDeviceToHost),
                     "cudaMemcpy (device to host)")) break;

        // Print results
        for (int i = N - 4; i < N; i++) printf("%d %f\n", i, memoirecpu[i]);
        printf("%ld", t3);  // t3 is a long, so %ld rather than %d

        status = 0;
    } while (0);

    // Cleanup (cudaFree accepts a null pointer)
    free(memoirecpu);
    cudaFree(memoiregraphique);
    return status;
}

spiker · February 3, 2011, 1:51pm

I found a program and am trying to push it to the maximum my graphics card can handle.

So far I can do 33,550,000 multiplications, which uses about 134 MB of memory (33,550,000 floats × 4 bytes), but my card has 512 MB. Can I do more by using blockIdx.y, and if so, how?

Are all 33,550,000 multiplications done at the same time, or are 512 done first, then 512 more, and so on?

include <stdio.h>

include <cuda.h>

include <TIME.H>

include “cutil_inline.h”

// Kernel that executes on the CUDA device.
//
// Squares each of the first N elements of `a` in place.
//   a: device pointer to at least N floats.
//   N: number of elements to process; nothing is touched when N <= 0.
//
// Expected launch: a 1D grid of 1D blocks. Each thread starts at its global
// index and then advances by the total number of launched threads, so every
// element is covered even when the grid holds fewer threads than N.
__global__ void square_array(float *a, int N)
{
    // 64-bit arithmetic so the index cannot wrap for large grids.
    const long long stride = (long long)gridDim.x * blockDim.x;
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for (; idx < N; idx += stride)
        a[idx] = a[idx] * a[idx];
}

// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// main routine that executes on the host.
//
// Fills a host array with 0..N-1, squares it on the device with
// square_array, prints the last four results and then the number of clock()
// ticks measured around the kernel.
// Returns 0 on success, 1 if an allocation or a CUDA call failed.
int main(void)
{
    const int N = 33550000;                     // Number of elements in arrays
    const size_t size = N * sizeof(float);
    float *memoirecpu = (float *)malloc(size);  // Array on host
    float *memoiregraphique = NULL;             // Array on device
    int status = 1;

    if (memoirecpu == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
        return 1;
    }

    do {
        if (!cuda_ok(cudaMalloc((void **)&memoiregraphique, size), "cudaMalloc")) break;

        // Initialize host array and copy it to CUDA device
        for (int i = 0; i < N; i++) memoirecpu[i] = (float)i;
        if (!cuda_ok(cudaMemcpy(memoiregraphique, memoirecpu, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)")) break;

        // Do calculation on device: one thread per element, rounded up to
        // a whole number of blocks.
        const int block_size = 512;
        const int n_blocks = (N + block_size - 1) / block_size;

        const clock_t t1 = clock();
        square_array<<<n_blocks, block_size>>>(memoiregraphique, N);
        // An invalid launch configuration is only reported here.
        if (!cuda_ok(cudaGetLastError(), "kernel launch")) break;
        // The launch returns before the kernel has finished; wait for it so
        // the interval covers the execution and not only the launch call.
        if (!cuda_ok(cudaDeviceSynchronize(), "kernel execution")) break;
        const clock_t t2 = clock();
        const long t3 = (long)(t2 - t1);

        // Retrieve result from device and store it in host array
        if (!cuda_ok(cudaMemcpy(memoirecpu, memoiregraphique, size, cudaMemcpyDeviceToHost),
                     "cudaMemcpy (device to host)")) break;

        // Print results
        for (int i = N - 4; i < N; i++) printf("%d %f\n", i, memoirecpu[i]);
        printf("%ld", t3);  // t3 is a long, so %ld rather than %d

        status = 0;
    } while (0);

    // Cleanup (cudaFree accepts a null pointer)
    free(memoirecpu);
    cudaFree(memoiregraphique);
    return status;
}

Hello,

I guess this program does not run the way you’d like…

Threads are organized in a grid of blocks. On GPUs of this generation the grid can be up to 65535 × 65535 blocks, and each block can itself have up to three dimensions, so you have several index components with which to address your data. In this case a 2D (X × Y) layout is enough.

If you want to perform an operation on each element, you should do this:

// Kernel that executes on the CUDA device.
//
// Squares each of the first N elements of `a` in place.
//   a: device pointer to at least N floats.
//   N: number of elements to process; nothing is touched when N <= 0.
//
// Expected launch: a 2D grid of 2D blocks. The (x, y) thread coordinates are
// flattened row by row, using the launched width gridDim.x * blockDim.x as
// the row length, so the flattening always matches the launch configuration.
// Each thread then advances by the total number of launched threads, so
// every element is covered even when the grid holds fewer threads than N.
__global__ void square_array(float *a, int N)
{
    // 64-bit arithmetic so the flattened index cannot wrap.
    const long long row_width = (long long)gridDim.x * blockDim.x;
    const long long total = row_width * gridDim.y * blockDim.y;
    const long long ix = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long iy = (long long)blockIdx.y * blockDim.y + threadIdx.y;

    for (long long idx = ix + row_width * iy; idx < N; idx += total)
        a[idx] = a[idx] * a[idx];
}

Then you have to launch this kernel with a 2D grid.

You have 33,550,000 elements. That is 65536 × 511.93… rows (round it up to 512).

Your call will be:

square_array<<<dim3(65536/16,512/16),dim3(16,16)>>>(memoiregraphique, N);

                *1       *2           *A *B

*1 is 65536/16 = 4096 blocks along the x coordinate

*2 is 512/16 = 32 blocks along the y coordinate

*A is 16 threads per block along the x coordinate

*B is 16 threads per block along the y coordinate

So each block runs 16 × 16 = 256 threads.

That’s all :-)

cricri1 · February 3, 2011, 7:41pm

Question: are all 30,000,000 squares really done at the same time?

Thanks, it works, but I still can't go beyond 65536 blocks, even when I use 8096*32.
For element number 33,618,432 it returns x and not x*x.

#include <stdio.h>
#include <cuda.h>
#include <TIME.H>

#include “cutil_inline.h”
// Kernel that executes on the CUDA device.
//
// Squares each of the first N elements of `a` in place.
//   a: device pointer to at least N floats.
//   N: number of elements to process; nothing is touched when N <= 0.
//
// Expected launch: a 2D grid of 2D blocks. The (x, y) thread coordinates are
// flattened row by row, using the launched width gridDim.x * blockDim.x as
// the row length. A fixed row length of 65536 is only correct when the
// launch is exactly 65536 threads wide; with any other width some elements
// are visited twice and others never.
// Each thread then advances by the total number of launched threads, so
// every element is covered even when the grid holds fewer threads than N.
__global__ void square_array(float *a, int N)
{
    // 64-bit arithmetic so the flattened index cannot wrap.
    const long long row_width = (long long)gridDim.x * blockDim.x;
    const long long total = row_width * gridDim.y * blockDim.y;
    const long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long idy = (long long)blockIdx.y * blockDim.y + threadIdx.y;

    for (long long id = idx + row_width * idy; id < N; id += total)
        a[id] = a[id] * a[id];
}

// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// main routine that executes on the host.
//
// Fills a host array with 0..N-1, squares it on the device with
// square_array using a 2D launch, prints every element whose result is more
// than 2% away from i*i, and then prints the number of clock() ticks
// measured around the kernel.
// Returns 0 on success, 1 if an allocation or a CUDA call failed.
int main(void)
{
    const int N = 34550000;                     // Number of elements in arrays
    const size_t size = N * sizeof(float);
    float *memoirecpu = (float *)malloc(size);  // Array on host
    float *memoiregraphique = NULL;             // Array on device
    int status = 1;

    if (memoirecpu == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
        return 1;
    }

    do {
        if (!cuda_ok(cudaMalloc((void **)&memoiregraphique, size), "cudaMalloc")) break;

        // Initialize host array and copy it to CUDA device
        for (int i = 0; i < N; i++) memoirecpu[i] = (float)i;
        if (!cuda_ok(cudaMemcpy(memoiregraphique, memoirecpu, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)")) break;

        // Do calculation on device.
        // The launch is exactly 65536 threads wide so that it agrees with a
        // kernel that flattens (x, y) as x + 65536*y; the number of rows is
        // derived from N so that every element is covered.
        const int row_width = 65536;
        const dim3 block(16, 16);
        const int rows = (N + row_width - 1) / row_width;
        const dim3 grid(row_width / block.x, (rows + block.y - 1) / block.y);

        const clock_t t1 = clock();
        square_array<<<grid, block>>>(memoiregraphique, N);
        // An invalid launch configuration is only reported here.
        if (!cuda_ok(cudaGetLastError(), "kernel launch")) break;
        // The launch returns before the kernel has finished; wait for it so
        // the interval covers the execution and not only the launch call.
        if (!cuda_ok(cudaDeviceSynchronize(), "kernel execution")) break;
        const clock_t t2 = clock();
        const long t3 = (long)(t2 - t1);

        // Retrieve result from device and store it in host array
        if (!cuda_ok(cudaMemcpy(memoirecpu, memoiregraphique, sizeof(float) * N,
                                cudaMemcpyDeviceToHost),
                     "cudaMemcpy (device to host)")) break;

        // Print every element that is not within 2% of i*i. Checking both
        // sides also catches elements that were squared more than once.
        for (int i = 0; i < N; i++) {
            const float j = (float)i;
            const float expected = j * j;
            if (memoirecpu[i] < expected * 0.98f || memoirecpu[i] > expected * 1.02f) {
                printf("%d %f\n", i, memoirecpu[i]);
            }
        }
        printf("%ld", t3);  // t3 is a long, so %ld rather than %d

        status = 0;
    } while (0);

    // Cleanup (cudaFree accepts a null pointer)
    free(memoirecpu);
    cudaFree(memoiregraphique);
    return status;
}

cricri1 · February 3, 2011, 9:48pm

square_array <<< dim3(4096,50,1),dim3(16,32,1) >>> (memoiregraphique, N);
I understand now that 4096*16 must be <= 65536 and 50*32 must be <= 65536.
Now I can do 104,857,600 squares using 419 MB; any more and it crashes.
It works, thanks, thanks!

#include <stdio.h>
#include <cuda.h>
#include <TIME.H>

#include “cutil_inline.h”
// Kernel that executes on the CUDA device.
//
// Squares each of the first N elements of `a` in place.
//   a: device pointer to at least N floats.
//   N: number of elements to process; nothing is touched when N <= 0.
//
// Expected launch: a 2D grid of 2D blocks. The (x, y) thread coordinates are
// flattened row by row, using the launched width gridDim.x * blockDim.x as
// the row length. A fixed row length of 65536 is only correct when the
// launch is exactly 65536 threads wide; with any other width some elements
// are visited twice and others never.
// Each thread then advances by the total number of launched threads, so
// every element is covered even when the grid holds fewer threads than N.
__global__ void square_array(float *a, int N)
{
    // 64-bit arithmetic so the flattened index cannot wrap.
    const long long row_width = (long long)gridDim.x * blockDim.x;
    const long long total = row_width * gridDim.y * blockDim.y;
    const long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long idy = (long long)blockIdx.y * blockDim.y + threadIdx.y;

    for (long long id = idx + row_width * idy; id < N; id += total)
        a[id] = a[id] * a[id];
}

// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// main routine that executes on the host.
//
// Fills a host array with 0..N-1, squares it on the device with
// square_array using a 2D launch, prints every element whose result is more
// than 2% away from i*i, and then prints the number of clock() ticks
// measured around the kernel.
// Returns 0 on success, 1 if an allocation or a CUDA call failed.
int main(void)
{
    const int N = 104857600;                    // Number of elements in arrays
    const size_t size = N * sizeof(float);
    float *memoirecpu = (float *)malloc(size);  // Array on host
    float *memoiregraphique = NULL;             // Array on device
    int status = 1;

    if (memoirecpu == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)size);
        return 1;
    }

    do {
        // Initialize host array and copy it to CUDA device
        for (int i = 0; i < N; i++) memoirecpu[i] = (float)i;

        // The device allocation is large; report the failure instead of
        // carrying on with an invalid pointer.
        if (!cuda_ok(cudaMalloc((void **)&memoiregraphique, size), "cudaMalloc")) break;
        if (!cuda_ok(cudaMemcpy(memoiregraphique, memoirecpu, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)")) break;

        // Do calculation on device.
        // The launch is exactly 65536 threads wide so that it agrees with a
        // kernel that flattens (x, y) as x + 65536*y; the number of rows is
        // derived from N so that every element is covered.
        const int row_width = 65536;
        const dim3 block(16, 32, 1);
        const int rows = (N + row_width - 1) / row_width;
        const dim3 grid(row_width / block.x, (rows + block.y - 1) / block.y, 1);

        const clock_t t1 = clock();
        square_array<<<grid, block>>>(memoiregraphique, N);
        // An invalid launch configuration is only reported here.
        if (!cuda_ok(cudaGetLastError(), "kernel launch")) break;
        // The launch returns before the kernel has finished; wait for it so
        // the interval covers the execution and not only the launch call.
        if (!cuda_ok(cudaDeviceSynchronize(), "kernel execution")) break;
        const clock_t t2 = clock();
        const long t3 = (long)(t2 - t1);

        // Retrieve result from device and store it in host array
        if (!cuda_ok(cudaMemcpy(memoirecpu, memoiregraphique, sizeof(float) * N,
                                cudaMemcpyDeviceToHost),
                     "cudaMemcpy (device to host)")) break;

        // Print every element that is not within 2% of i*i. Checking both
        // sides also catches elements that were squared more than once.
        for (int i = 0; i < N; i++) {
            const float j = (float)i;
            const float expected = j * j;
            if (memoirecpu[i] < expected * 0.98f || memoirecpu[i] > expected * 1.02f) {
                printf("%d %f\n", i, memoirecpu[i]);
            }
        }
        printf("%ld", t3);  // t3 is a long, so %ld rather than %d

        status = 0;
    } while (0);

    // Cleanup (cudaFree accepts a null pointer)
    free(memoirecpu);
    cudaFree(memoiregraphique);
    return status;
}

Topic		Replies	Views
Kernel configuration and maximum array size problem. CUDA Programming and Performance	8	6476	January 25, 2009
Urgent help with threads please! CUDA Programming and Performance	21	10787	March 6, 2008
Number of items that can be processed in CUDA CUDA Programming and Performance cuda , kernel	5	340	February 17, 2024
How would you do this? CUDA Programming and Performance	12	4467	August 5, 2008
Can a Kernel be too big?? CUDA_ERROR_NO_BINARY_FOR_GPU error 209 CUDA Programming and Performance	11	3047	November 13, 2017
Memory problem? ...incredible slowdown CUDA Programming and Performance	29	16308	January 30, 2011
Probably a simple answer Simple CUDA code - unexpected result CUDA Programming and Performance	7	4855	October 27, 2010
limit of computation CUDA Programming and Performance	44	32905	April 8, 2008
2D reduction using CUDA The use a cuda and cublas library for a 2D simple reduction CUDA Programming and Performance	11	4440	February 7, 2012
CUDA Matrix Multiplication Issues threads and blocks problem CUDA Programming and Performance	2	3631	March 1, 2009

New to CUDA: some questions

Related topics