#ifndef _TEMPLATE_KERNEL_H_ #define _TEMPLATE_KERNEL_H_ #define DEFTAPS 8191 //must be odd value. Freq/Taps = Filter Frequency accuracy. 5.4Hz for 44100. #define THREAD_NUM 512 //executed thread count per block, do not change. shared memory is common in the block. #define DATAPERCYCLE 1024 //data count per loop. do not change __device__ __constant__ float coeff_Kernel[DEFTAPS+1]; //coeff parameters are placed in constant memory. __global__ void calcFIR(const float * g_indata, float * g_outdata, const int CalcSize) { __shared__ float shared[DATAPERCYCLE+THREAD_NUM]; // access Block Width const unsigned int bw = gridDim.x; // access Block ID const unsigned int bix = blockIdx.x; // access thread id const unsigned int tid = threadIdx.x; float dOut; //do FIR //each threads has offseted address to global memory. loop jumps threads*blocks. for (int index = 0; index < CalcSize; index = index + THREAD_NUM*bw) { dOut = 0.0; //read g_indata to Shared Memory //cycle is, ex, 4=8192/2048. for (int j = 0; j < (DEFTAPS+1)/DATAPERCYCLE; j++) { shared[tid ] = g_indata[DATAPERCYCLE*j + THREAD_NUM*bix + index + tid ]; __syncthreads(); shared[tid+THREAD_NUM ] = g_indata[DATAPERCYCLE*j + THREAD_NUM*bix + index + tid + THREAD_NUM ]; __syncthreads(); shared[tid+THREAD_NUM*2] = g_indata[DATAPERCYCLE*j + THREAD_NUM*bix + index + tid + THREAD_NUM*2]; __syncthreads(); for(int k = 0; k < DATAPERCYCLE; k = k+1) { dOut += shared[k + tid] * coeff_Kernel[j*DATAPERCYCLE + k]; } } __syncthreads(); g_outdata[THREAD_NUM*bix + index + tid] = dOut; } } #endif // #ifndef _TEMPLATE_KERNEL_H_