Hi guys.
I’m trying to run with large number of blocks.
e.g. 3628800 (=10!) blocks
I know the max number of blocks is 65535.
I think that if the program exceeds this limit, the exceeding rest blocks are run later.
Is it wrong?
And,
I wrote test code. (using NVIDIA Parallel Nsight)
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
__global__ void isPrimeNumber(int *px, int *py)
{
int tid = blockIdx.x;
int i, divisor = 0;
int num = px[tid];
for(i=1; i<=num; i++){
if(num%i == 0) divisor++;
}
if(divisor == 2){
py[tid] = 1; //correct
}else{
py[tid] = 0; //not prime number
}
}
cudaError_t getPrimeCount(int num, int count){
int *xs, *ys;
xs = (int *)malloc(sizeof(int)*num);
ys = (int *)malloc(sizeof(int)*num);
int *d_xs, *d_ys;
cudaError_t cudaStatus = cudaMalloc((void**)&d_xs, sizeof(int)*num);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&d_ys, sizeof(int)*num);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
int i;
for(i=0; i<num; i++){
xs[i] = i;
ys[i] = 0;
}
cudaStatus = cudaMemcpy (d_xs, xs, sizeof(int)*num, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
//CPU -> GPU
isPrimeNumber<<<num, 1>>>(d_xs, d_ys);
cudaStatus = cudaMemcpy (ys , d_ys , sizeof(int)*num, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
for(i=0; i<num; i++){
if(ys[i] == 1){ //xs[i] is prime number
count++;
}
}
printf("Count of Prime Number(<=%d) = %d\n",num ,count);
free(xs);
free(ys);
Error:
cudaFree(d_xs);
cudaFree(d_ys);
return cudaStatus;
}
int main()
{
int num = 10000; // <- run
//int num = 10000; // does not run
int count = 0;
cudaError_t cudaStatus = getPrimeCount(num, count);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "getPrimeCount failed!");
return 1;
}
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
This test code calculates the number of the prime number.
When the variable “num” in main function is 10000, this code run.
But, When “num” is 70000, this code does not run.
I don’t know what is wrong :-(
Thanks for your time, any help will be highly appreciated.