/* * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. */ /* WaveX01 project which demonstrates the basics on how to setup a project * example application. * HOST CODE. */ // includes, system #include #include #include #include #include #include #include #include // includes, project #include #include // includes, kernels #include //////////////////////////////////////////////////////////////////////////////// // declaration, forward void ReadFIRCoeff(void); //Read desired Frequency void ReadConfig(char * ConfigFile); //Read Configuration int ProcessWav(char *infilename, char *outfilename); //Prototype Routine char InputFile[256]; char ConfigFile[256]; char OutputFile[256]; //parameters int WayofDivide; //4 = 4Way FIR Divider int OutputFormat_Bit; //16 or 24 bool StreamOut; //true:output to Xylo bool FileOut; //true:output to File #define DEFTAPS 8192 struct FIRParams{ int DelayLength; //Channel Delay Sample Length int chPointer[2]; //Channel Processing Pointer for Wave Data (initialized negative for delay) bool Enabled; //Channel Used char CoeffFile[256]; //Coeff File Name float FIRCoeff[DEFTAPS]; //Coeff Value }; struct FIRParams FIR[4]; const unsigned int CUDABLOCKS = 16; //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //Check Arguments Count if (argc != 4) { printf("requires 3 arguments.\n"); printf("WaveX01 inWaveFile ConfigFile OutWaveXFile\n"); printf("WaveX01 \"C:\\\\Temp\\\\MusicFile.wav\" \"C:\\\\Temp\\\\WaveX.cfg\" \"C:\\\\Temp\\\\MusicFile.wav8\" \n"); return -1; } strncpy_s(InputFile,256, argv[1],strlen(argv[1])); strncpy_s(ConfigFile,256, argv[2],strlen(argv[2])); strncpy_s(OutputFile,256, argv[3],strlen(argv[3])); ReadConfig(ConfigFile); ReadFIRCoeff(); ProcessWav(InputFile,OutputFile); } void ReadConfig(char* FileName) { char RetStr[256]; for (int i = 0; i < 4; i++) { FIR[i].Enabled = false; strcpy(FIR[i].CoeffFile, ""); for (int j = 0; j < DEFTAPS; j++) { FIR[i].FIRCoeff[j] = 0.0; } } GetPrivateProfileString("Global_Params", "WAYS", "1", RetStr, 64, FileName); WayofDivide = atoi(RetStr); GetPrivateProfileString("Global_Params", "FORMAT", "16", RetStr, 64, FileName); OutputFormat_Bit = atoi(RetStr); GetPrivateProfileString("Global_Params", "StreamOut", "0", RetStr, 64, FileName); if (strcmp("1",RetStr)==0) StreamOut = true; else StreamOut = false; GetPrivateProfileString("Global_Params", "FileOut", "0", RetStr, 64, FileName); if (strcmp("1",RetStr)==0) FileOut = true; else FileOut = false; GetPrivateProfileString("CH12", "COEFF", "", FIR[0].CoeffFile, 256, FileName); GetPrivateProfileString("CH34", "COEFF", "", FIR[1].CoeffFile, 256, FileName); GetPrivateProfileString("CH56", "COEFF", "", FIR[2].CoeffFile, 256, FileName); GetPrivateProfileString("CH78", "COEFF", "", FIR[3].CoeffFile, 256, FileName); GetPrivateProfileString("CH12", "Delay", "0", RetStr, 64, FileName); FIR[0].DelayLength = atoi(RetStr); GetPrivateProfileString("CH34", "Delay", "0", RetStr, 64, FileName); FIR[1].DelayLength = atoi(RetStr); GetPrivateProfileString("CH56", "Delay", "0", RetStr, 64, FileName); FIR[2].DelayLength = atoi(RetStr); GetPrivateProfileString("CH78", "Delay", "0", RetStr, 64, FileName); FIR[3].DelayLength = atoi(RetStr); for (int i = 0; i < 4; i++) { FIR[i].chPointer[0] -= FIR[i].DelayLength; FIR[i].chPointer[1] -= FIR[i].DelayLength; } for (int i = 0; i < WayofDivide-1; i++) { FIR[i].Enabled = true; } } void ReadFIRCoeff(void) { FILE * f; char buf[64]; for (int j = 0; j < 4; j++) { if ((f = fopen(FIR[j].CoeffFile,"r")) != NULL) { int i = 0; double tmp = 0.0; while((!feof(f)) && (i < DEFTAPS)) { fgets(buf, sizeof(buf), f); sscanf(buf, "%lf", &tmp); FIR[j].FIRCoeff[i] = (float)tmp; i++; } fclose(f); } } } unsigned read2bytes(FILE *f) { unsigned char buf[2]; if (fread(buf, 2, 1, f) != 1) { fprintf(stderr, "Read error\n"); return(1); } return 256U * buf[1] + buf[0] ; } unsigned long read4bytes(FILE *f) { unsigned char buf[4]; if (fread(buf, 4, 1, f) != 1) { fprintf(stderr, "Read error\n"); return(1); } return ((256LU * buf[3] + buf[2]) * 256LU + buf[1] ) * 256LU + buf[0] ; } int CheckInFile(FILE * f) { unsigned long len; unsigned char s[10]; int channels, bits; if (f == NULL) { printf("Can not open File\n"); return -1; } if (fread(s, 4, 1, f) != 1) { printf("Read error\n"); fclose(f); return -1; } if (memcmp(s, "RIFF", 4) != 0) { printf("Not a RIFF format\n"); fclose(f); return -1; } printf("[RIFF] (%lu bytes)\n", read4bytes(f)); if (fread(s, 8, 1, f) != 1) { printf("Read error\n"); fclose(f); return -1; } if (memcmp(s, "WAVEfmt ", 8) != 0) { printf("Not a WAVEfmt format\n"); fclose(f); return -1; } len = read4bytes(f); printf("[WAVEfmt ] (%lu bytes)\n", len); if (len != 16) { printf("Length of WAVEfmt must be 16\n"); return -1; } printf(" Data type = %u (1 = PCM)\n", read2bytes(f)); channels = read2bytes(f); printf(" Number of channels = %u (1 = mono, 2 = stereo)\n", channels); printf(" Sampling rate = %luHz\n", read4bytes(f)); printf(" Bytes per second = %lu\n", read4bytes(f)); printf(" Bytes per sample = %u\n", read2bytes(f)); bits = read2bytes(f); printf(" Bits per sample = %u\n", bits); return 0; } void PrepareOutFile(FILE * fo, unsigned long len, int channels, int BitFormat) { unsigned long temp_l; unsigned short temp_s; unsigned char s[10]; //Prepare Output Wave File // RIFF s[0] = 'R'; s[1] = 'I'; s[2] = 'F'; s[3] = 'F'; fwrite(s, 1, 4, fo); //filesize temp_l = len + 36; fwrite(&(temp_l), sizeof(long), 1, fo); // WAVE s[0] = 'W'; s[1] = 'A'; s[2] = 'V'; s[3] = 'E'; fwrite(s, 1, 4, fo); // fmt chunk s[0] = 'f'; s[1] = 'm'; s[2] = 't'; s[3] = ' '; fwrite(s, 1, 4, fo); // chunk size temp_l = 16; fwrite(&(temp_l), sizeof(long), 1, fo); // format PCM = 1 temp_s = 1; fwrite(&(temp_s), sizeof(short), 1, fo); // channel stereo = 2 temp_s = channels; fwrite(&(temp_s), sizeof(short), 1, fo); // sample 44100 temp_l = 44100; fwrite(&(temp_l), sizeof(long), 1, fo); // Bytes per sec temp_l = channels * BitFormat/8 * 44100; fwrite(&(temp_l), sizeof(long), 1, fo); // Block (Bytes per sample&channel) temp_s = channels * BitFormat/8 ; fwrite(&(temp_s), sizeof(short), 1, fo); // Bits per sample temp_s = BitFormat ; fwrite(&(temp_s), sizeof(short), 1, fo); // data chunk s[0] = 'd'; s[1] = 'a'; s[2] = 't'; s[3] = 'a'; fwrite(s, 1, 4, fo); // data chunk size fwrite(&(len), sizeof(long), 1, fo); } int ProcessWav(char *infilename, char *outfilename) { //Variables //p : pointer for original wav data memory. (BYTE pointer) //WaveDataLen : Total Data BYTEs in original wav file. //PacketLength : Processing in GPU, Data Count. handled data is float. //TotalDataSize : Resulting WAVx file, data chunk size. //MaxDelaySize : each channel has delay sampling count. most large one, it effects result size. //h_idata : memory, Host, to put into GPU. //datacount : Posted Data length as float, = PacketLength + DEFTAPS. //d_coeff : memory, GPU, to store coefficient.size is 4*DEFTAPS //d_idata : memory, GPU, to store input wave data. length needs packetlen + coeff len. //d_odata : memory, GPU, result data stored. size is equal to input packet size. (not including coeffs tail size) //h_odata : memory, Host, to store result. there are total 8 (4Way * L,R) buffer. size is same as packet. //sWave_Out_Data : memory, Host, to store 16 bit WAVx format. type = short. PacketLength*sizeof(short)*2*WayofDivide //wWave_Out_Data : memory, Host, to store 24 bit WAVx format. type = 3 bytes. PacketLength*sizeof(char)*3*2*WayofDivide //FIR[j].DelayLength: Channel Delay, as Sample Length. //FIR[j].chPointer[k] : Current sampling pointer count for the channel. k=0(L),1(R) //FIR[j].Enabled : channel enabled flag. //FIR[j].FIRCoeff[m]: Channel FIR Coeff. //LoopCount : calculated main loop max count. ceiled to int. FILE *f, *fo; unsigned long WaveDataLen; unsigned char s[10]; CUT_DEVICE_INIT(); printf("finename = '%s'\n", infilename); f = fopen(infilename, "rb"); if (CheckInFile(f) == -1) return -1; while (fread(s, 4, 1, f) == 1) { WaveDataLen = read4bytes(f); s[4] = 0; printf("[%s] (%lu bytes)\n", s, WaveDataLen); if (memcmp(s, "data", 4) == 0) break; for (int i = 0; i < (int)WaveDataLen; i++) printf("%02x ", fgetc(f)); printf("\n"); } const unsigned int PacketLength = 65536; //File Data Size int TotalDataSize = 0; //Delay size int MaxDelaySize = 0; //Padding Size int PaddingLen = 0; //File Output Mode, Prepare output file if (FileOut) { fo = fopen(outfilename, "wb"); if (fo == NULL) { printf("Can not create %s\n", outfilename); return -1; } //Calculate File Length //TotalSize = (WaveLength + MaxDelay)*ways*Bytes for (int i = 0; i < WayofDivide; i++) { if (FIR[i].DelayLength > MaxDelaySize) MaxDelaySize = FIR[i].DelayLength; } //WaveDataLen / 4 = sample size for one channel PaddingLen = PacketLength - ((WaveDataLen/4 + MaxDelaySize) % PacketLength); TotalDataSize = (WaveDataLen/4 + MaxDelaySize + PaddingLen) * WayofDivide * 2 * (OutputFormat_Bit/8) ; PrepareOutFile(fo, TotalDataSize, WayofDivide * 2, OutputFormat_Bit); } //get memory for "WaveDataLen" + MaxDelaySize*(short)*2, + 65536 Padding, + Tailing TAPS area bytes. unsigned char * p; p = (unsigned char *)malloc(WaveDataLen + sizeof(short)*2*(MaxDelaySize+PaddingLen + DEFTAPS*2)); //fill top half tap for (int i = 0; i < DEFTAPS; i++) { (*(p + i)) = 0; } //Copy File to Memory int readsize = fread((p + DEFTAPS), 1, WaveDataLen ,f); for (int i = 0; i < sizeof(short)*2*(MaxDelaySize + PaddingLen + DEFTAPS); i++) { (*(p + DEFTAPS + WaveDataLen + i)) = 0; } // Now WAV file is ready in Host Memory. //Prepare Host and Device Memory float * h_idata; //Host Data int datacount = PacketLength + DEFTAPS; cudaError_t cret; cret = cudaMallocHost((void**)&h_idata, sizeof(float)*(datacount+DEFTAPS)); if (cret != CUDA_SUCCESS) { printf("Can not allocate %d bytes host memory.\n", sizeof(float)*(datacount+DEFTAPS)); return -1; } //allocate device input memory float * d_idata; CUDA_SAFE_CALL(cudaMalloc((void**) &d_idata, sizeof(float)*(datacount+DEFTAPS)) ); // allocate device memory for result float * d_odata; CUDA_SAFE_CALL(cudaMalloc((void**) &d_odata, sizeof(float)*PacketLength) ); //allocate host output memory float * h_odata[8]; for (int i = 0 ; i < 8; i++) { cret = cudaMallocHost((void**)&(h_odata[i]), sizeof(float)*PacketLength); if (cret != CUDA_SUCCESS) { printf("Can not allocate %d bytes host memory.\n", sizeof(float)*PacketLength); return -1; } } //allocate host WAVE format buffer //16bit buffer. 1L1L 1R1R 2L2L 2R2R 3L3L 3R3R 4L4L 4R4R short * sWave_Out_Data; cret = cudaMallocHost((void**)&sWave_Out_Data, PacketLength*sizeof(short)*2*WayofDivide); if (cret != CUDA_SUCCESS) { printf("Can not allocate %d bytes host memory.\n", PacketLength*sizeof(short)*2*WayofDivide); return -1; } //24bit buffer. 1L1L1L 1R1R1R 2L2L2L ... unsigned char * wWave_Out_Data; cret = cudaMallocHost((void**)&wWave_Out_Data, PacketLength*sizeof(char)*3*2*WayofDivide); if (cret != CUDA_SUCCESS) { printf("Can not allocate %d bytes host memory.\n", PacketLength*sizeof(char)*3*2*WayofDivide); return -1; } // setup execution parameters dim3 grid( CUDABLOCKS, 1, 1); dim3 threads( THREAD_NUM, 1, 1); //Loop for each Packet(total length/65536) int LoopCount = 0; LoopCount = ((WaveDataLen/4) + MaxDelaySize + PaddingLen) / PacketLength; for (int i = 0; i < LoopCount; i++) { //Loop for FIR[0], [1], [2], [3]: Needs to change Coeffs. for (int j = 0; j < WayofDivide; j++) { //Loop for L, R: prepare 65536 words + 8192 words to process. for (int k = 0; k < 2; k++) { unsigned char LData1, LData2; short LData; float LDataCnv; for (int m = 0; m < datacount; m++) { if (FIR[j].chPointer[k] >= 0) { //regular case. copy actual data. LData1 = (unsigned char) (* (p+FIR[j].chPointer[k]*4+k*2 ) ); LData2 = (unsigned char) (* (p+FIR[j].chPointer[k]*4+k*2 + 1) ); LData = (short)(LData1 + (LData2<<8)); LDataCnv = ((float)(LData))/32768.0; h_idata[m] = LDataCnv; FIR[j].chPointer[k] += 1; } else { //beggining, copy 0 for delay padding. h_idata[m] = 0.0; FIR[j].chPointer[k] += 1; } } //TAP area was copied to buffer, by m. get step back data pointer. FIR[j].chPointer[k] -= DEFTAPS; //Copy Wave Data to Device // copy host memory to device CUDA_SAFE_CALL( cudaMemcpy(d_idata, h_idata,sizeof(float)*(datacount+DEFTAPS),cudaMemcpyHostToDevice)); //copy coeffs to constant CUDA_SAFE_CALL( cudaMemcpyToSymbol(coeff_Kernel, FIR[j].FIRCoeff, sizeof(float)*DEFTAPS) ); //Call Kernel CUDA_SAFE_CALL( cudaThreadSynchronize() ); calcFIR<<>>(d_idata, d_odata, PacketLength); CUT_CHECK_ERROR("calcFIR failed"); CUDA_SAFE_CALL( cudaThreadSynchronize() ); //Get processed data to Packet Output Buffer CUDA_SAFE_CALL( cudaMemcpy( (void *)(h_odata[j*2 + k]), d_odata, sizeof(float)*PacketLength, cudaMemcpyDeviceToHost) ); }//L, R Loop End }//0,1,2,3 Loop End printf("Calculating %4d / %4d \r", i+1, LoopCount); //8xPacket buffer ready //Copy to Wave Format Buffer switch (OutputFormat_Bit) { case 16: float LValue; float RValue; // convert back to 16bit short for ( int j = 0; j < WayofDivide; j++) { for (int i = 0; i < PacketLength; i++) { LValue = (h_odata[j*2 ][i]); RValue = (h_odata[j*2+1][i]); sWave_Out_Data[i*WayofDivide*2 + j*2 ] = (short)(LValue*32768.0); sWave_Out_Data[i*WayofDivide*2 + j*2 + 1] = (short)(RValue*32768.0); } } break; case 24: for (int i = 0; i < PacketLength; i++) { for ( int j = 0; j < WayofDivide; j++) { float LValue; float RValue; LValue = (h_odata[j*2 ][i]); RValue = (h_odata[j*2+1][i]); long LConv, RConv; LConv = (long)(LValue * 256.0 * 32768.0); RConv = (long)(RValue * 256.0 * 32768.0); wWave_Out_Data[i*WayofDivide*3*2 + j*3*2 ] = (char)(LConv & 0xFF); //lower byte wWave_Out_Data[i*WayofDivide*3*2 + j*3*2 + 1] = (char)((LConv>>8 ) & 0xFF); //mid byte wWave_Out_Data[i*WayofDivide*3*2 + j*3*2 + 2] = (char)((LConv>>16) & 0xFF); //upper byte wWave_Out_Data[i*WayofDivide*3*2 + j*3*2 + 3] = (char)(RConv & 0xFF); //lower byte wWave_Out_Data[i*WayofDivide*3*2 + j*3*2 + 4] = (char)((RConv>>8 ) & 0xFF); //mid byte wWave_Out_Data[i*WayofDivide*3*2 + j*3*2 + 5] = (char)((RConv>>16) & 0xFF); //upper byte } } break; } //if streamout = true //Push to FX2 stream //blocked until PC buffer < Half Full if (FileOut) { //write packet data to file switch (OutputFormat_Bit) { case 16: fwrite(sWave_Out_Data, sizeof(short), PacketLength*WayofDivide*2, fo); break; case 24: fwrite(wWave_Out_Data, sizeof(char), PacketLength*WayofDivide*3, fo); break; } } //End Loop for Packet } printf("\ndone FIR processing.\n"); //Free Memory fclose(f); if (FileOut) fclose(fo); free(p); cudaFreeHost(h_idata); cudaFree(d_idata); cudaFree(d_odata); for (int i = 0; i < 8; i++) { cudaFreeHost(h_odata[i]); } cudaFreeHost(sWave_Out_Data); cudaFreeHost(wWave_Out_Data); return 0; }