Code freezes with Matlab, Mex and CUDA

Hi,

I am quite new to CUDA, but i have some experience in hlsl so i figured i try speeding um my feature detection. But now i am a bit confused: If i run the kernel as a dummy just giving back the color of the given image everything works fine. But when i start the real kernel my code just freezes and i have no idea why. Does anyone has any idea why? I allready checked the kernel with a friend but we could not iron it out. Do you guys have suggestions why i am running into problems?

Here is my kernel with helper

void __device__ __forceinline__ sampleInterpolated(double * out, double const * img, double x, double y, int sizeX) {
    const int floorX = floorf(x);
    const int floorY = floorf(y);
    const double offsetX = x - floorX;
    const double offsetY = y - floorY;
    const double Q_0 = (1-offsetX)*img[floorY*sizeX+floorX] + offsetX*img[floorY*sizeX +floorX+1];
    const double Q_1 = (1-offsetX)*img[(floorY+1)*sizeX +floorX] + offsetX*img[(floorY+1)*sizeX +floorX+1];
    out[0] = (1-offsetY)*Q_0 + offsetY*Q_1;
}

void __global__ calcScratchDirection(double * outX,
                                     double * outY,
                                     double * img,
                                     int sizeX,
                                     int sizeY,
                                     double * directions,
                                     int directionsLength)
{
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    if(x>=sizeX || y>=sizeY)
        return;
    if(x<21 || y<21 || x>sizeX-21 || y>sizeY-21) {
        outX[y*sizeX+x] = 0;
        outY[y*sizeX+x] = 0;
        return;
    }
    const int n = directionsLength;
    int vectorLength = 10;
    double bestScore = 999999999999;
    double bestDirX = 0;
    double bestDirY = 0;
    const double basePixel = img[y*sizeX+x];
    if(basePixel < 10)
        vectorLength *= 2;
    for(int i=0;i<n;i++) {
        const double dirX = directions[i];
        const double dirY = directions[n+i];
        double score = 0;
        for(int t=1;t<=vectorLength;t++){
            double value;
            sampleInterpolated(&value, img, x+dirX*t, y+dirY*t, sizeX);
            score += (basePixel-value)*(basePixel-value);
        }
        if(score < bestScore) {
            bestScore = score;
            bestDirX = dirX;
            bestDirY = dirY;
        }
    }
    outY[y*sizeX+x] = bestDirX; //Yes swap them cause matlab indexing is crazy
    outX[y*sizeX+x] = bestDirY;
}

And here is my setup. I think this should be finem but you never know… :D

void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, mxArray const *prhs[])
{
    /* Declare all variables.*/
    double *img = (double*)mxGetData(prhs[0]);
    int sizeX = mxGetM(prhs[0]);
    int sizeY = mxGetN(prhs[0]);
    double *directions = (double*)mxGetData(prhs[1]);
    int directionsLength = mxGetM(prhs[1]);
    double *d_img;
    double *d_directions;
    double *d_dirX;
    double *d_dirY;
    double *dirX = (double*)mxMalloc(sizeX*sizeY*sizeof(double));
    double *dirY = (double*)mxMalloc(sizeX*sizeY*sizeof(double));
    
    mxInitGPU();
    
    cudaMalloc(&d_img, sizeX*sizeY*sizeof(double));
    cudaMalloc(&d_directions, 180*2*sizeof(double));
    cudaMalloc(&d_dirX, sizeX*sizeY*sizeof(double));
    cudaMalloc(&d_dirY, sizeX*sizeY*sizeof(double));
    
    cudaMemcpy(d_img, img, sizeX*sizeY*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_directions, directions, 180*2*sizeof(double), cudaMemcpyHostToDevice);

    dim3 const blockSize(32, 32);
    dim3 const gridSize = dim3 ((sizeX+blockSize.x-1)/blockSize.x,
                                (sizeY+blockSize.y-1)/blockSize.y);

    calcScratchDirection<<<gridSize, blockSize>>>(d_dirX, d_dirY, d_img, sizeX, sizeY, d_directions, directionsLength);
    std::cout << "Scratch direction calculated." << std::endl;
    
    cudaMemcpy(dirX, d_dirX, sizeX*sizeY*sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(dirY, d_dirY, sizeX*sizeY*sizeof(double), cudaMemcpyDeviceToHost);
    
    plhs[0] = mxCreateUninitNumericMatrix(sizeX, sizeY, mxDOUBLE_CLASS, mxREAL);
    plhs[1] = mxCreateUninitNumericMatrix(sizeX, sizeY, mxDOUBLE_CLASS, mxREAL);
    mxSetData(plhs[0], dirX);
    mxSetData(plhs[1], dirY);
    
    cudaFree(d_img);
    cudaFree(d_directions);
    cudaFree(d_dirX);
    cudaFree(d_dirY);
}

Thanks!

#EDIT
It appeary that my problem was with Matlab. every time i compiled my code with mexcuda() i got a parsing protocoll of my code, but the final compiled programm was still the old one. I filled everything with 0.5 and then changed it to 0.4, compiled and ran it. Result was 0.5 everywhere. I compiled it with visual studio and now everything is working as intended.