Fill a vertor using CUDA in matlab. Not Working. problem on filling in a vector using cuda and mex f

Hi,

I am trying to generate a matrix of colors to represent a fractal in matlab.

I would like to use mex files anda cuda.

I wrote a code to fill a vector (that matlab sees as a matrix) using cuda but I am having problems.

Matlab close when I execute my mex function. I think is some memeory problem in cuda.

This is the first time I am gonna write something with cuda.

Could you take a look to my code and tell me where I am mistaking?

The code is here:

http://dl.dropbox.com/u/2723775/julia_mex_2.cu

Thank you!

Code here too:

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <math.h>

#include "mex.h"   //--This one is required for mex files

				   

#include "cuda.h"  //Cuda libreries

#include "cuda_runtime.h"

#include "cufft.h"

#include "driver_types.h"

int color_julia(double x, double y, double cx, double cy, double endpoint, double iteractions);

void checkCUDAError(const char *msg);

void julia_fractal(int widht_x , int width_y, double re_c, double im_c, double endpoint, double iteractions, double *output);

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])

	{

		

		//Declarations

		double x,y,cx,cy,endpoint, iteractions;

		

		x = mxGetScalar(prhs[0]);

		y = mxGetScalar(prhs[1]);

		cx = mxGetScalar(prhs[2]);

		cy = mxGetScalar(prhs[3]);

		endpoint = mxGetScalar(prhs[4]);

		iteractions = mxGetScalar(prhs[5]);

		/* Create the array */

		plhs[0] = mxCreateDoubleMatrix(x,y,mxREAL);

		

				

		julia_fractal(x,y,cx,cy,endpoint,iteractions,mxGetPr(plhs[0]));

		

		return;

	}

	

__global__ void fill_colors(float *output,int size)

	  {

		  

	 	int i = blockIdx.x * blockDim.x + threadIdx.x; 

	 	

	 	if(i<size){

				output[i]=100; //color_julia (xpos, ypos, re_c, im_c, endpoint, iteractions);

				

			}else{

				return;

			}

			

	}

	/*

		Prototype of function that decide the color

	*/

	void julia_fractal(int width_x , int width_y, double re_c, double im_c, double endpoint, double iteractions, double *output)

	{

		/*

		* i,j = counters

		* limit_x,limit_y = position on screen starting from the center

		* x,y = counters/position on screen

		* test = variable to decide if or not print the matrix for a test 

		*/	

	

	   int i, j, limit_x, limit_y, x, y, test;

	   float *device_output, *aux;

	   

	   aux=(float*)calloc((width_x*width_y), sizeof(float));

	   

	   //Cuda memory alloc

	   cudaMalloc((void **)&device_output,(sizeof(float)*(width_x*width_y)));

	   

	   /*

	   * xpos,ypos =  coordinates to store the Z

	   * Z is calculed using: 

	   * xpos = (double) ((x - limit_x)*2*threshold)/width_x;

	   * ypos = (double) ((y - limit_y)*2*threshold)/width_y;

	   */

	   

	   double xpos, ypos, threshold = endpoint;

	   int counter = 0;

	

			

	   /*

	   * Here we go to the center of the screen

	   */

	   limit_x = width_x / 2;

	   limit_y = width_y / 2;

	 	   

	   test = 1;

			  if(test == 1){

				  printf("PRINT\n");

				  for(i=0; i<(width_x*width_y);i++){

						  

							  printf("output1[%d] = %d",i,output[i]);

							  

						  printf("\n");

				  }

				  

				printf("END PRINT\n");

			  }

	   	  

	   //configuration of the calling

	   dim3 dimBlock(16); 

	   dim3 dimGrid((width_x + dimBlock.x-1)/dimBlock.x, (width_y + dimBlock.y-1) / dimBlock.y); 

	   //CUDA functioln calling

	   fill_colors<<<dimBlock, dimGrid>>>(device_output, (width_x*width_y));

	   

	   //CUDA error control

		 cudaThreadSynchronize();

		 checkCUDAError("call test_1");

				

	   //copy memory

	   cudaMemcpy(aux, device_output, sizeof(float)*(width_x*width_y), cudaMemcpyDeviceToHost);

	   

	   //casting

	   for(i=0;i<(width_x*width_y);i++)

	   	output[i]=(double)aux[i];   

	

		  /*

		  * Test

		  * value test to 1 if you want to print the matrix

		  */

		  test = 1;

			  if(test == 1){

				  printf("PRINT\n");

				  for(i=0; i<(width_x*width_y);i++){

						  

						  printf("output2[%d] = %g",i,output[i]);

							  

						  printf("\n");

				  }

				  

				printf("END PRINT\n");

			  }

	  return;

	}

	

	/*

	* This functions works on complex numbers (the C) and produces the color for

	* the coordinate we are asking for on axis. 

	*/

	int color_julia(double x, double y, double cx, double cy, double endpoint, double iteractions)

	{

	   int color = 0,i;

	   double zx = x, zy = y;

	   double new_zx, new_zy;

	   

	   for(i = 1; i<=(int)iteractions;i++)

		  {

		  // calculate Z

		  // re(z)=re(z^2) + re(c)

		  new_zx = zx * zx - zy * zy + cx;

		  // im(z)=2*im(z^2) + im(c)

		  new_zy = 2 * zx * zy + cy;

	

		  zx = new_zx;

		  zy = new_zy;

	

		  /* 

		  * Calculate the module than check if is out of range 2 (the one we are common using 

		  * in all the implementations). sqrt(re^2 + im^2) > 2 ???

		  */

		  if (sqrt(zx * zx + zy * zy) > endpoint)

		 	break;

		  // increment color

		  color++;

		  // se supera il tetto max ritorna colore

		  if (color >= 256)

		 		break;

		  }

	   return color;

	}

	

void checkCUDAError(const char *msg)

{

	cudaError_t err = cudaGetLastError();

	if( cudaSuccess != err) 

	{

		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );

		exit(-1);

	}						 

}

	

	/*	TEST

	

		void main(int argc, char **argv){

		

		double *k;

		k = calloc(100,sizeof(double);

				julia_fractal(10,10,2,1.5,k);	

			

		}*/

We tryed to put

dim3 dimGrid(width_x*width_y);

and it works but only for small numbers.
Using width 200 (so 200*200 = 40.000) works
but not for bugger numbers. I need to work on vectors
of 2500 * 2500 at least!

Please how can I do?