Hi,
At the moment I have to run in EMU mode (device emulation). First, some code:
[codebox]//VST Plugin Main App cpp
// Audio processing callback of the VST plugin. Parts of the body are elided
// ("....") in this excerpt; only the DSP section that hands the audio to the
// CUDA wrapper GPUGAIN is shown.
// NOTE(review): `inputs`, `outputs` and `sampleFrames` are not referenced in the
// visible code; the buffer size comes from BufferSize instead. Confirm that the
// host always delivers exactly BufferSize frames per call.
void VSTServer::processReplacing (float** inputs, float** outputs, VstInt32 sampleFrames) // this is called every time the audio buffer is full
{
....
//DSP! ////////////////////////////////////////////////////////////////////////////////
// One-time lazy setup, guarded by CudaIsInit: record the buffer geometry and
// allocate the two flat host staging buffers (width * height floats each).
if (!CudaIsInit){
width = BufferSize;             // elements per channel
height = cEffect.numOutputs;    // number of channels
inputBuffer = new float[width * height];
outputBuffer = new float[width * height];
// Zero the output staging buffer; inputBuffer is left uninitialised here and is
// filled from BlockBufferIn inside GPUGAIN.
for (int i = 0; i<width*height; i++)
outputBuffer[i] = 0.0;
CudaIsInit = true;
}
GPUGAIN(fGain, width, height, inputBuffer, outputBuffer, BlockBufferIn); // calling the C++-integrated CUDA code
....
// Processed audio goes to the sound card. Author's open question: is explicit
// synchronisation needed here so that GPUGAIN is finished before routing the audio?
// NOTE(review): GPUGAIN as posted ends with a cudaMemcpy device-to-host followed by
// cudaFree; that copy presumably blocks until the kernel has finished, in which case
// outputBuffer is complete when GPUGAIN returns — confirm against the CUDA runtime docs.
}[/codebox]
The code above omits some of the buffering code. The interesting part is the call: GPUGAIN(fGain, width, height, inputBuffer, outputBuffer, BlockBufferIn);
fGain = gain applied to the audio, width = number of elements in the audio buffer (per channel), height = number of channels.
[codebox]
// cppcode.cu
// includes, project
#include <cutil_inline.h>
// includes, kernels
#include <GPUGAIN_kernel.cu>
// CPU fallback used when any CUDA step fails: out[k] = in[k] * fGain.
static void gpuGainHostFallback(float fGain, const float* in, float* out, size_t count)
{
    for (size_t k = 0; k < count; ++k)
        out[k] = in[k] * fGain;
}

/**
 * Applies a gain to a block of multi-channel audio on the GPU.
 *
 * The per-channel buffers in BlockBufferIn are first flattened channel after
 * channel into inputBuffer (channel i occupies elements [i*width, (i+1)*width)),
 * copied to the device, scaled by the GPUGAINOnDevice kernel and copied back
 * into outputBuffer using the same layout.
 *
 * @param fGain          gain factor applied to every sample
 * @param width          number of elements per channel
 * @param height         number of channels
 * @param inputBuffer    host staging buffer, at least width*height floats; overwritten
 * @param outputBuffer   host result buffer, at least width*height floats
 * @param BlockBufferIn  `height` pointers to `width` floats each (source audio)
 *
 * The function is synchronous: the final device-to-host cudaMemcpy does not
 * return before the kernel has finished, so outputBuffer is complete on return
 * and the caller needs no additional synchronisation.
 *
 * Error handling: the function returns void and runs inside an audio callback,
 * so it neither aborts nor logs. If any CUDA call fails, the gain is computed
 * on the CPU instead so the output never contains stale data.
 *
 * NOTE: device memory is still allocated and released on every call because the
 * interface offers no place to keep a persistent buffer. cudaMalloc/cudaFree are
 * slow, synchronous calls; in a real-time audio path the device buffer should be
 * allocated once by the owner and reused.
 */
extern "C" void GPUGAIN(const float fGain, unsigned int width, unsigned int height,
                        float* inputBuffer, float* outputBuffer, float** BlockBufferIn)
{
    // Do the size arithmetic in size_t so width*height cannot wrap in 32 bits.
    const size_t sampleCount = (size_t)width * (size_t)height;
    if (sampleCount == 0)
        return; // nothing to process; a zero-block launch would be an error
    const size_t byteCount = sampleCount * sizeof(float);

    // Flatten all channels into one contiguous vector.
    for (unsigned int ch = 0; ch < height; ++ch)
    {
        float* dst = inputBuffer + (size_t)ch * width;
        const float* src = BlockBufferIn[ch];
        for (unsigned int s = 0; s < width; ++s)
            dst[s] = src[s];
    }

    float* a_d = 0; // device copy of the samples
    cudaError_t status = cudaSuccess;

    // The kernel takes its element count as int; refuse anything larger.
    if (sampleCount > (size_t)0x7fffffff)
        status = cudaErrorInvalidValue;

    if (status == cudaSuccess)
        status = cudaMalloc((void**)&a_d, byteCount);

    if (status == cudaSuccess)
        status = cudaMemcpy(a_d, inputBuffer, byteCount, cudaMemcpyHostToDevice);

    if (status == cudaSuccess)
    {
        // Ceil-div so a partially filled last block is still launched.
        const unsigned int nBlocks =
            (unsigned int)((sampleCount + BLOCK_DIM - 1) / BLOCK_DIM);
        GPUGAINOnDevice<<<nBlocks, BLOCK_DIM>>>(fGain, a_d, (int)sampleCount);
        // Kernel launches return nothing; configuration errors surface here.
        status = cudaGetLastError();
    }

    // Blocking copy: also waits for the kernel and reports execution errors.
    if (status == cudaSuccess)
        status = cudaMemcpy(outputBuffer, a_d, byteCount, cudaMemcpyDeviceToHost);

    // Release the device buffer on every path (a_d stays 0 if cudaMalloc failed).
    if (a_d != 0)
        cudaFree(a_d);

    if (status != cudaSuccess)
        gpuGainHostFallback(fGain, inputBuffer, outputBuffer, sampleCount);
}[/codebox]
[codebox]
// kernel_code.cu
#ifndef GPUGAIN_KERNEL_H
#define GPUGAIN_KERNEL_H
#define BLOCK_DIM 32
/**
 * Gain kernel: multiplies each of the first `size` elements of `a` by fGain, in place.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread handles the single element
 * at index blockIdx.x * blockDim.x + threadIdx.x, so the launch must provide at
 * least `size` threads in total; surplus threads in the last block do nothing.
 * No shared memory is used and no thread touches another thread's element, so
 * no __syncthreads() is required.
 *
 * @param fGain  gain factor, passed by value (the __constant__ qualifier is for
 *               file-scope variables and is not allowed on kernel parameters)
 * @param a      device pointer to the samples; read and overwritten
 * @param size   number of valid elements in `a`
 */
__global__ void GPUGAINOnDevice(const float fGain, float *a, int size)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Bounds check: the grid is rounded up to whole blocks.
    if (idx < size)
    {
        a[idx] *= fGain;
    }
}
#endif // GPUGAIN_KERNEL_H[/codebox]
OK, it's working… almost: in the audio output I get some clicking noise.
Would it be better to put the audio data in a matrix instead of a vector?
Do I have to synchronise the GPU with the CPU?
Are there any "no-gos" in the basic idea?
This code was made by editing some existing sample code (http://codereflect.com/2008/09/29/how-to-call-cuda-programs-from-a-cc-application/).