Hi, I’m trying to build a very simple Cellular Automata application using CUDA. The application ideia is to replicate rule 90 from Stephen Wolfram.
What I’m trying to make is that each block should scan the lattice searching for a pre-setted pattern, if it finds it’ll set the central cell as the next from the pattern. Each block should have a size of the Lattice, so that each Thread can look at each cell (and it’s neighbours).
I think I got almost everything right, but on the build I get this warnings (the error lines are not aligned with the code bellow, they are actually the “if(l.cells[idx1] == p[patIdx].cells[0] && l.cells[idx2] == p[patIdx].cells[1] && l.cells[idx3] == p[patIdx].cells[2])” line)
and on the run, I get this error:
I could trim down the execution error to be on this line:
l.cells[idx2] = p[patIdx].next;
This is the final code, if anyone can help me I would highly appreciate:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <cuda.h>
#define MAX_SIZE 255
#define MAX_RULE 255
#define MAX_BIN_SIZE 9
typedef struct
{
int width;
char *cells;
}Lattice;
typedef struct
{
int width;
char *cells;
char next;
}Pattern;
//Kernel declaration
__global__ void executeRule(Lattice l, int rule, Pattern p[8]);
//binary to decimal converter
void decToBin(int number,char *bin);
void createPatterns(Pattern p[8], int rule);
//Host call function
int executeCA(int latWidth, int rule)
{
if(latWidth > MAX_SIZE)
latWidth = MAX_SIZE;
if(rule < 0) rule = 0;
if(rule > MAX_RULE) rule = MAX_RULE;
//Host lattice and pattern
Lattice h_lat;
Pattern h_patterns[8];
size_t memSize = latWidth;
h_lat.width = latWidth;
if((h_lat.cells = (char*)malloc(memSize))==NULL)
{
perror("malloc");
return 1;
}
memset(h_lat.cells,'0',memSize);
h_lat.cells[latWidth/2]='1'; //Central cell is on
for(int i = 0; i < 8; i++)
{
h_patterns[i].width = 3;
if((h_patterns[i].cells = (char*)malloc(h_patterns[i].width))==NULL)
{
perror("malloc");
return 1;
}
memset(h_patterns[i].cells,'
include <stdio.h>
include <stdlib.h>
include <string.h>
include <errno.h>
include <cuda.h>
define MAX_SIZE 255
define MAX_RULE 255
define MAX_BIN_SIZE 9
typedef struct
{
int width;
char *cells;
}Lattice;
typedef struct
{
int width;
char *cells;
char next;
}Pattern;
//Kernel declaration
global void executeRule(Lattice l, int rule, Pattern p[8]);
//binary to decimal converter
void decToBin(int number,char *bin);
void createPatterns(Pattern p[8], int rule);
//Host call function
int executeCA(int latWidth, int rule)
{
if(latWidth > MAX_SIZE)
latWidth = MAX_SIZE;
if(rule < 0) rule = 0;
if(rule > MAX_RULE) rule = MAX_RULE;
//Host lattice and pattern
Lattice h_lat;
Pattern h_patterns[8];
size_t memSize = latWidth;
h_lat.width = latWidth;
if((h_lat.cells = (char*)malloc(memSize))==NULL)
{
perror("malloc");
return 1;
}
memset(h_lat.cells,‘0’,memSize);
h_lat.cells[latWidth/2]=‘1’; //Central cell is on
for(int i = 0; i < 8; i++)
{
h_patterns[i].width = 3;
if((h_patterns[i].cells = (char*)malloc(h_patterns[i].width))==NULL)
{
perror("malloc");
return 1;
}
memset(h_patterns[i].cells,'\0',h_patterns[i].width);
}
createPatterns(h_patterns,rule);
//Device lattice and pattern
Lattice d_lat;
Pattern d_patterns[8];
cudaError_t d_error;
memSize = latWidth;
d_lat.width = h_lat.width;
d_error = cudaMalloc((void**)&d_lat.cells, memSize);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));
return 1;
}
d_error = cudaMemcpy(d_lat.cells, h_lat.cells, memSize, cudaMemcpyHostToDevice);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));
return 1;
}
for(int i = 0; i < 8; i++)
{
d_patterns[i].width = h_patterns[i].width;
d_error = cudaMalloc((void**)&d_patterns[i].cells, d_patterns[i].width);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));
return 1;
}
d_error = cudaMemcpy(d_patterns[i].cells, h_patterns[i].cells, d_patterns[i].width, cudaMemcpyHostToDevice);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));
return 1;
}
}
//Execute Kernel
dim3 dimGrid(8);
dim3 dimBlock(h_lat.width);
executeRule<<<dimGrid, dimBlock>>>(d_lat, rule, d_patterns);
cudaThreadSynchronize();
d_error = cudaGetLastError();
if(d_error != cudaSuccess)
{
fprintf(stderr,"executeRule<<<%d, %d>>>:%s\n",dimGrid.x,dimBlock.x,cudaGetErrorString(d_error));
return 1;
}
d_error = cudaMemcpy(h_lat.cells, d_lat.cells, memSize, cudaMemcpyDeviceToHost);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));
return 1;
}
for(int i = 0; i < h_lat.width; i++)
printf("%c",h_lat.cells[i]);
printf(“\n”);
//Cleanup
free(h_lat.cells);
cudaFree(d_lat.cells);
for(int i = 0; i < 8; i++)
{
free(h_patterns[i].cells);
cudaFree(d_patterns[i].cells);
}
return 0;
}
void decToBin(int number, char *bin, int width)
{
if(number > MAX_SIZE) return;
int remain = 0;
int i = width-2;
memset(bin,‘0’,width);
while(number > 0)
{
remain = number % 2;
number = number / 2;
if(i<0) i=0;
bin[i--] = (remain==0 ? '0': '1');
}
bin[width-1]=‘\0’;
}
void createPatterns(Pattern p[8], int rule)
{
int num=7;
int j = 0;
char bin[MAX_BIN_SIZE];
decToBin(rule,bin,MAX_BIN_SIZE);
for(int i = 0; i < 8; i++)
{
decToBin(num,p[i].cells,p[i].width+1);
p[i].next=bin[j++];
num--;
if(num<0) num=0;
}
}
global void executeRule(Lattice l, int rule, Pattern p[8])
{
int idx2 = threadIdx.x;
int patIdx = blockIdx.x;
int idx1 = idx2-1;
int idx3 = idx2+1;
if(idx2 > 1 && idx2 < l.width-1)
{
if(l.cells[idx1] == p[patIdx].cells[0] && l.cells[idx2] == p[patIdx].cells[1] && l.cells[idx3] == p[patIdx].cells[2])
l.cells[idx2] = p[patIdx].next;
}
}
int main(int argc, char *argv)
{
executeCA(149, 90);
return 0;
}
',h_patterns[i].width);
}
createPatterns(h_patterns,rule);
//Device lattice and pattern
Lattice d_lat;
Pattern d_patterns[8];
cudaError_t d_error;
memSize = latWidth;
d_lat.width = h_lat.width;
d_error = cudaMalloc((void**)&d_lat.cells, memSize);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));
return 1;
}
d_error = cudaMemcpy(d_lat.cells, h_lat.cells, memSize, cudaMemcpyHostToDevice);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));
return 1;
}
for(int i = 0; i < 8; i++)
{
d_patterns[i].width = h_patterns[i].width;
d_error = cudaMalloc((void**)&d_patterns[i].cells, d_patterns[i].width);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));
return 1;
}
d_error = cudaMemcpy(d_patterns[i].cells, h_patterns[i].cells, d_patterns[i].width, cudaMemcpyHostToDevice);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));
return 1;
}
}
//Execute Kernel
dim3 dimGrid(8);
dim3 dimBlock(h_lat.width);
executeRule<<<dimGrid, dimBlock>>>(d_lat, rule, d_patterns);
cudaThreadSynchronize();
d_error = cudaGetLastError();
if(d_error != cudaSuccess)
{
fprintf(stderr,"executeRule<<<%d, %d>>>:%s\n",dimGrid.x,dimBlock.x,cudaGetErrorString(d_error));
return 1;
}
d_error = cudaMemcpy(h_lat.cells, d_lat.cells, memSize, cudaMemcpyDeviceToHost);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));
return 1;
}
for(int i = 0; i < h_lat.width; i++)
printf("%c",h_lat.cells[i]);
printf("\n");
//Cleanup
free(h_lat.cells);
cudaFree(d_lat.cells);
for(int i = 0; i < 8; i++)
{
free(h_patterns[i].cells);
cudaFree(d_patterns[i].cells);
}
return 0;
}
void decToBin(int number, char *bin, int width)
{
if(number > MAX_SIZE) return;
int remain = 0;
int i = width-2;
memset(bin,'0',width);
while(number > 0)
{
remain = number % 2;
number = number / 2;
if(i<0) i=0;
bin[i--] = (remain==0 ? '0': '1');
}
bin[width-1]='
include <stdio.h>
include <stdlib.h>
include <string.h>
include <errno.h>
include <cuda.h>
define MAX_SIZE 255
define MAX_RULE 255
define MAX_BIN_SIZE 9
typedef struct
{
int width;
char *cells;
}Lattice;
typedef struct
{
int width;
char *cells;
char next;
}Pattern;
//Kernel declaration
global void executeRule(Lattice l, int rule, Pattern p[8]);
//binary to decimal converter
void decToBin(int number,char *bin);
void createPatterns(Pattern p[8], int rule);
//Host call function
int executeCA(int latWidth, int rule)
{
if(latWidth > MAX_SIZE)
latWidth = MAX_SIZE;
if(rule < 0) rule = 0;
if(rule > MAX_RULE) rule = MAX_RULE;
//Host lattice and pattern
Lattice h_lat;
Pattern h_patterns[8];
size_t memSize = latWidth;
h_lat.width = latWidth;
if((h_lat.cells = (char*)malloc(memSize))==NULL)
{
perror("malloc");
return 1;
}
memset(h_lat.cells,‘0’,memSize);
h_lat.cells[latWidth/2]=‘1’; //Central cell is on
for(int i = 0; i < 8; i++)
{
h_patterns[i].width = 3;
if((h_patterns[i].cells = (char*)malloc(h_patterns[i].width))==NULL)
{
perror("malloc");
return 1;
}
memset(h_patterns[i].cells,'\0',h_patterns[i].width);
}
createPatterns(h_patterns,rule);
//Device lattice and pattern
Lattice d_lat;
Pattern d_patterns[8];
cudaError_t d_error;
memSize = latWidth;
d_lat.width = h_lat.width;
d_error = cudaMalloc((void**)&d_lat.cells, memSize);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));
return 1;
}
d_error = cudaMemcpy(d_lat.cells, h_lat.cells, memSize, cudaMemcpyHostToDevice);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));
return 1;
}
for(int i = 0; i < 8; i++)
{
d_patterns[i].width = h_patterns[i].width;
d_error = cudaMalloc((void**)&d_patterns[i].cells, d_patterns[i].width);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));
return 1;
}
d_error = cudaMemcpy(d_patterns[i].cells, h_patterns[i].cells, d_patterns[i].width, cudaMemcpyHostToDevice);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));
return 1;
}
}
//Execute Kernel
dim3 dimGrid(8);
dim3 dimBlock(h_lat.width);
executeRule<<<dimGrid, dimBlock>>>(d_lat, rule, d_patterns);
cudaThreadSynchronize();
d_error = cudaGetLastError();
if(d_error != cudaSuccess)
{
fprintf(stderr,"executeRule<<<%d, %d>>>:%s\n",dimGrid.x,dimBlock.x,cudaGetErrorString(d_error));
return 1;
}
d_error = cudaMemcpy(h_lat.cells, d_lat.cells, memSize, cudaMemcpyDeviceToHost);
if(d_error != cudaSuccess)
{
fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));
return 1;
}
for(int i = 0; i < h_lat.width; i++)
printf("%c",h_lat.cells[i]);
printf(“\n”);
//Cleanup
free(h_lat.cells);
cudaFree(d_lat.cells);
for(int i = 0; i < 8; i++)
{
free(h_patterns[i].cells);
cudaFree(d_patterns[i].cells);
}
return 0;
}
void decToBin(int number, char *bin, int width)
{
if(number > MAX_SIZE) return;
int remain = 0;
int i = width-2;
memset(bin,‘0’,width);
while(number > 0)
{
remain = number % 2;
number = number / 2;
if(i<0) i=0;
bin[i--] = (remain==0 ? '0': '1');
}
bin[width-1]=‘\0’;
}
void createPatterns(Pattern p[8], int rule)
{
int num=7;
int j = 0;
char bin[MAX_BIN_SIZE];
decToBin(rule,bin,MAX_BIN_SIZE);
for(int i = 0; i < 8; i++)
{
decToBin(num,p[i].cells,p[i].width+1);
p[i].next=bin[j++];
num--;
if(num<0) num=0;
}
}
global void executeRule(Lattice l, int rule, Pattern p[8])
{
int idx2 = threadIdx.x;
int patIdx = blockIdx.x;
int idx1 = idx2-1;
int idx3 = idx2+1;
if(idx2 > 1 && idx2 < l.width-1)
{
if(l.cells[idx1] == p[patIdx].cells[0] && l.cells[idx2] == p[patIdx].cells[1] && l.cells[idx3] == p[patIdx].cells[2])
l.cells[idx2] = p[patIdx].next;
}
}
int main(int argc, char *argv)
{
executeCA(149, 90);
return 0;
}
';
}
void createPatterns(Pattern p[8], int rule)
{
int num=7;
int j = 0;
char bin[MAX_BIN_SIZE];
decToBin(rule,bin,MAX_BIN_SIZE);
for(int i = 0; i < 8; i++)
{
decToBin(num,p[i].cells,p[i].width+1);
p[i].next=bin[j++];
num--;
if(num<0) num=0;
}
}
__global__ void executeRule(Lattice l, int rule, Pattern p[8])
{
int idx2 = threadIdx.x;
int patIdx = blockIdx.x;
int idx1 = idx2-1;
int idx3 = idx2+1;
if(idx2 > 1 && idx2 < l.width-1)
{
if(l.cells[idx1] == p[patIdx].cells[0] && l.cells[idx2] == p[patIdx].cells[1] && l.cells[idx3] == p[patIdx].cells[2])
l.cells[idx2] = p[patIdx].next;
}
}
int main(int argc, char *argv[])
{
executeCA(149, 90);
return 0;
}
First of all thanks, and sorry for the long post.