Trying to make a CA application with CUDA

Hi, I’m trying to build a very simple Cellular Automata application using CUDA. The application ideia is to replicate rule 90 from Stephen Wolfram.

What I’m trying to make is that each block should scan the lattice searching for a pre-setted pattern, if it finds it’ll set the central cell as the next from the pattern. Each block should have a size of the Lattice, so that each Thread can look at each cell (and it’s neighbours).

I think I got almost everything right, but on the build I get this warnings (the error lines are not aligned with the code bellow, they are actually the “if(l.cells[idx1] == p[patIdx].cells[0] && l.cells[idx2] == p[patIdx].cells[1] && l.cells[idx3] == p[patIdx].cells[2])” line)

and on the run, I get this error:

I could trim down the execution error to be on this line:

l.cells[idx2] = p[patIdx].next;

This is the final code, if anyone can help me I would highly appreciate:

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <errno.h>

#include <cuda.h>

#define MAX_SIZE 255

#define MAX_RULE 255

#define MAX_BIN_SIZE 9

typedef struct

{

  int width;

  char *cells;

}Lattice;

typedef struct

{

  int width;

  char *cells;

  char next;

}Pattern;

//Kernel declaration

__global__ void executeRule(Lattice l, int rule, Pattern p[8]);

//binary to decimal converter

void decToBin(int number,char *bin);

void createPatterns(Pattern p[8], int rule);

//Host call function

int executeCA(int latWidth, int rule)

{

  if(latWidth > MAX_SIZE)

	latWidth = MAX_SIZE;

  if(rule < 0) rule = 0;

  if(rule > MAX_RULE) rule = MAX_RULE;

  //Host lattice and pattern

  Lattice h_lat;

  Pattern h_patterns[8];

  size_t memSize = latWidth;

  h_lat.width = latWidth;

  if((h_lat.cells = (char*)malloc(memSize))==NULL)

  {

	perror("malloc");

	return 1;

  }

  memset(h_lat.cells,'0',memSize);

  h_lat.cells[latWidth/2]='1';  //Central cell is on

  for(int i = 0; i < 8; i++)

  {

	h_patterns[i].width = 3;

	if((h_patterns[i].cells = (char*)malloc(h_patterns[i].width))==NULL)

	{

	  perror("malloc");

	  return 1;

	}

	memset(h_patterns[i].cells,'

include <stdio.h>

include <stdlib.h>

include <string.h>

include <errno.h>

include <cuda.h>

define MAX_SIZE 255

define MAX_RULE 255

define MAX_BIN_SIZE 9

typedef struct

{

int width;

char *cells;

}Lattice;

typedef struct

{

int width;

char *cells;

char next;

}Pattern;

//Kernel declaration

global void executeRule(Lattice l, int rule, Pattern p[8]);

//binary to decimal converter

void decToBin(int number,char *bin);

void createPatterns(Pattern p[8], int rule);

//Host call function

int executeCA(int latWidth, int rule)

{

if(latWidth > MAX_SIZE)

latWidth = MAX_SIZE;

if(rule < 0) rule = 0;

if(rule > MAX_RULE) rule = MAX_RULE;

//Host lattice and pattern

Lattice h_lat;

Pattern h_patterns[8];

size_t memSize = latWidth;

h_lat.width = latWidth;

if((h_lat.cells = (char*)malloc(memSize))==NULL)

{

perror("malloc");

return 1;

}

memset(h_lat.cells,‘0’,memSize);

h_lat.cells[latWidth/2]=‘1’; //Central cell is on

for(int i = 0; i < 8; i++)

{

h_patterns[i].width = 3;

if((h_patterns[i].cells = (char*)malloc(h_patterns[i].width))==NULL)

{

  perror("malloc");

  return 1;

}

memset(h_patterns[i].cells,'\0',h_patterns[i].width);

}

createPatterns(h_patterns,rule);

//Device lattice and pattern

Lattice d_lat;

Pattern d_patterns[8];

cudaError_t d_error;

memSize = latWidth;

d_lat.width = h_lat.width;

d_error = cudaMalloc((void**)&d_lat.cells, memSize);

if(d_error != cudaSuccess)

{

fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));

return 1;

}

d_error = cudaMemcpy(d_lat.cells, h_lat.cells, memSize, cudaMemcpyHostToDevice);

if(d_error != cudaSuccess)

{

fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));

return 1;

}

for(int i = 0; i < 8; i++)

{

d_patterns[i].width = h_patterns[i].width;

d_error = cudaMalloc((void**)&d_patterns[i].cells, d_patterns[i].width);

if(d_error != cudaSuccess)

{

  fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));

  return 1;

}

d_error = cudaMemcpy(d_patterns[i].cells, h_patterns[i].cells, d_patterns[i].width, cudaMemcpyHostToDevice);

if(d_error != cudaSuccess)

{

  fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));

  return 1;

}

}

//Execute Kernel

dim3 dimGrid(8);

dim3 dimBlock(h_lat.width);

executeRule<<<dimGrid, dimBlock>>>(d_lat, rule, d_patterns);

cudaThreadSynchronize();

d_error = cudaGetLastError();

if(d_error != cudaSuccess)

{

fprintf(stderr,"executeRule<<<%d, %d>>>:%s\n",dimGrid.x,dimBlock.x,cudaGetErrorString(d_error));

return 1;

}

d_error = cudaMemcpy(h_lat.cells, d_lat.cells, memSize, cudaMemcpyDeviceToHost);

if(d_error != cudaSuccess)

{

fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));

return 1;

}

for(int i = 0; i < h_lat.width; i++)

printf("%c",h_lat.cells[i]);

printf(“\n”);

//Cleanup

free(h_lat.cells);

cudaFree(d_lat.cells);

for(int i = 0; i < 8; i++)

{

free(h_patterns[i].cells);

cudaFree(d_patterns[i].cells);

}

return 0;

}

void decToBin(int number, char *bin, int width)

{

if(number > MAX_SIZE) return;

int remain = 0;

int i = width-2;

memset(bin,‘0’,width);

while(number > 0)

{

remain = number % 2;

number = number / 2;

if(i<0) i=0;

bin[i--] = (remain==0 ? '0': '1');

}

bin[width-1]=‘\0’;

}

void createPatterns(Pattern p[8], int rule)

{

int num=7;

int j = 0;

char bin[MAX_BIN_SIZE];

decToBin(rule,bin,MAX_BIN_SIZE);

for(int i = 0; i < 8; i++)

{

decToBin(num,p[i].cells,p[i].width+1);

p[i].next=bin[j++];

num--;

if(num<0) num=0;

}

}

global void executeRule(Lattice l, int rule, Pattern p[8])

{

int idx2 = threadIdx.x;

int patIdx = blockIdx.x;

int idx1 = idx2-1;

int idx3 = idx2+1;

if(idx2 > 1 && idx2 < l.width-1)

{

if(l.cells[idx1] == p[patIdx].cells[0] && l.cells[idx2] == p[patIdx].cells[1] && l.cells[idx3] == p[patIdx].cells[2])

  l.cells[idx2] = p[patIdx].next;

}

}

int main(int argc, char *argv)

{

executeCA(149, 90);

return 0;

}

',h_patterns[i].width);

  }

  createPatterns(h_patterns,rule);

  //Device lattice and pattern

  Lattice d_lat;

  Pattern d_patterns[8];

  cudaError_t d_error;

  memSize = latWidth;

  d_lat.width = h_lat.width;

  d_error = cudaMalloc((void**)&d_lat.cells, memSize);

  if(d_error != cudaSuccess)

  {

	fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));

	return 1;

  }

  d_error = cudaMemcpy(d_lat.cells, h_lat.cells, memSize, cudaMemcpyHostToDevice);

  if(d_error != cudaSuccess)

  {

	fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));

	return 1;

  }

  for(int i = 0; i < 8; i++)

  {

	d_patterns[i].width = h_patterns[i].width;

	d_error = cudaMalloc((void**)&d_patterns[i].cells, d_patterns[i].width);

	if(d_error != cudaSuccess)

	{

	  fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));

	  return 1;

	}

	d_error = cudaMemcpy(d_patterns[i].cells, h_patterns[i].cells, d_patterns[i].width, cudaMemcpyHostToDevice);

	if(d_error != cudaSuccess)

	{

	  fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));

	  return 1;

	}

  }

  //Execute Kernel

  dim3 dimGrid(8);

  dim3 dimBlock(h_lat.width);

  executeRule<<<dimGrid, dimBlock>>>(d_lat, rule, d_patterns);

  cudaThreadSynchronize();

  d_error = cudaGetLastError();

  if(d_error != cudaSuccess)

  {

	fprintf(stderr,"executeRule<<<%d, %d>>>:%s\n",dimGrid.x,dimBlock.x,cudaGetErrorString(d_error));

	return 1;

  }

  d_error = cudaMemcpy(h_lat.cells, d_lat.cells, memSize, cudaMemcpyDeviceToHost);

  if(d_error != cudaSuccess)

  {

	fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));

	return 1;

  }

  for(int i = 0; i < h_lat.width; i++)

	printf("%c",h_lat.cells[i]);

  printf("\n");

  //Cleanup

  free(h_lat.cells);

  cudaFree(d_lat.cells);

  for(int i = 0; i < 8; i++)

  {

	free(h_patterns[i].cells);

	cudaFree(d_patterns[i].cells);

  }

  return 0;

}

void decToBin(int number, char *bin, int width)

{

  if(number > MAX_SIZE) return;

  int remain = 0;

  int i = width-2;

  memset(bin,'0',width);

  while(number > 0)

  {

	remain = number % 2;

	number = number / 2;

	if(i<0) i=0;

	bin[i--] = (remain==0 ? '0': '1');

  }

  bin[width-1]='

include <stdio.h>

include <stdlib.h>

include <string.h>

include <errno.h>

include <cuda.h>

define MAX_SIZE 255

define MAX_RULE 255

define MAX_BIN_SIZE 9

typedef struct

{

int width;

char *cells;

}Lattice;

typedef struct

{

int width;

char *cells;

char next;

}Pattern;

//Kernel declaration

global void executeRule(Lattice l, int rule, Pattern p[8]);

//binary to decimal converter

void decToBin(int number,char *bin);

void createPatterns(Pattern p[8], int rule);

//Host call function

int executeCA(int latWidth, int rule)

{

if(latWidth > MAX_SIZE)

latWidth = MAX_SIZE;

if(rule < 0) rule = 0;

if(rule > MAX_RULE) rule = MAX_RULE;

//Host lattice and pattern

Lattice h_lat;

Pattern h_patterns[8];

size_t memSize = latWidth;

h_lat.width = latWidth;

if((h_lat.cells = (char*)malloc(memSize))==NULL)

{

perror("malloc");

return 1;

}

memset(h_lat.cells,‘0’,memSize);

h_lat.cells[latWidth/2]=‘1’; //Central cell is on

for(int i = 0; i < 8; i++)

{

h_patterns[i].width = 3;

if((h_patterns[i].cells = (char*)malloc(h_patterns[i].width))==NULL)

{

  perror("malloc");

  return 1;

}

memset(h_patterns[i].cells,'\0',h_patterns[i].width);

}

createPatterns(h_patterns,rule);

//Device lattice and pattern

Lattice d_lat;

Pattern d_patterns[8];

cudaError_t d_error;

memSize = latWidth;

d_lat.width = h_lat.width;

d_error = cudaMalloc((void**)&d_lat.cells, memSize);

if(d_error != cudaSuccess)

{

fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));

return 1;

}

d_error = cudaMemcpy(d_lat.cells, h_lat.cells, memSize, cudaMemcpyHostToDevice);

if(d_error != cudaSuccess)

{

fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));

return 1;

}

for(int i = 0; i < 8; i++)

{

d_patterns[i].width = h_patterns[i].width;

d_error = cudaMalloc((void**)&d_patterns[i].cells, d_patterns[i].width);

if(d_error != cudaSuccess)

{

  fprintf(stderr,"cudaMalloc: %s\n",cudaGetErrorString(d_error));

  return 1;

}

d_error = cudaMemcpy(d_patterns[i].cells, h_patterns[i].cells, d_patterns[i].width, cudaMemcpyHostToDevice);

if(d_error != cudaSuccess)

{

  fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));

  return 1;

}

}

//Execute Kernel

dim3 dimGrid(8);

dim3 dimBlock(h_lat.width);

executeRule<<<dimGrid, dimBlock>>>(d_lat, rule, d_patterns);

cudaThreadSynchronize();

d_error = cudaGetLastError();

if(d_error != cudaSuccess)

{

fprintf(stderr,"executeRule<<<%d, %d>>>:%s\n",dimGrid.x,dimBlock.x,cudaGetErrorString(d_error));

return 1;

}

d_error = cudaMemcpy(h_lat.cells, d_lat.cells, memSize, cudaMemcpyDeviceToHost);

if(d_error != cudaSuccess)

{

fprintf(stderr,"cudaMemcpy: %s\n",cudaGetErrorString(d_error));

return 1;

}

for(int i = 0; i < h_lat.width; i++)

printf("%c",h_lat.cells[i]);

printf(“\n”);

//Cleanup

free(h_lat.cells);

cudaFree(d_lat.cells);

for(int i = 0; i < 8; i++)

{

free(h_patterns[i].cells);

cudaFree(d_patterns[i].cells);

}

return 0;

}

void decToBin(int number, char *bin, int width)

{

if(number > MAX_SIZE) return;

int remain = 0;

int i = width-2;

memset(bin,‘0’,width);

while(number > 0)

{

remain = number % 2;

number = number / 2;

if(i<0) i=0;

bin[i--] = (remain==0 ? '0': '1');

}

bin[width-1]=‘\0’;

}

void createPatterns(Pattern p[8], int rule)

{

int num=7;

int j = 0;

char bin[MAX_BIN_SIZE];

decToBin(rule,bin,MAX_BIN_SIZE);

for(int i = 0; i < 8; i++)

{

decToBin(num,p[i].cells,p[i].width+1);

p[i].next=bin[j++];

num--;

if(num<0) num=0;

}

}

global void executeRule(Lattice l, int rule, Pattern p[8])

{

int idx2 = threadIdx.x;

int patIdx = blockIdx.x;

int idx1 = idx2-1;

int idx3 = idx2+1;

if(idx2 > 1 && idx2 < l.width-1)

{

if(l.cells[idx1] == p[patIdx].cells[0] && l.cells[idx2] == p[patIdx].cells[1] && l.cells[idx3] == p[patIdx].cells[2])

  l.cells[idx2] = p[patIdx].next;

}

}

int main(int argc, char *argv)

{

executeCA(149, 90);

return 0;

}

';

}

void createPatterns(Pattern p[8], int rule)

{

  int num=7;

  int j = 0;

  char bin[MAX_BIN_SIZE];

  decToBin(rule,bin,MAX_BIN_SIZE);

  for(int i = 0; i < 8; i++)

  {

	decToBin(num,p[i].cells,p[i].width+1);

	p[i].next=bin[j++];

	num--;

	if(num<0) num=0;

  }

}

__global__ void executeRule(Lattice l, int rule, Pattern p[8])

{

  int idx2 = threadIdx.x;

  int patIdx  = blockIdx.x;

  int idx1 = idx2-1;

  int idx3 = idx2+1;

  if(idx2 > 1 && idx2 < l.width-1)

  {

	if(l.cells[idx1] == p[patIdx].cells[0] && l.cells[idx2] == p[patIdx].cells[1] && l.cells[idx3] == p[patIdx].cells[2])

	  l.cells[idx2] = p[patIdx].next;

  }

}

int main(int argc, char *argv[])

{

  executeCA(149, 90);

  return 0;

}

First of all thanks, and sorry for the long post.