标签:initial \n offset serve was 2.0 dea created sam
在目录c64plus\dsplib_v210\src\DSP_fft16x16,包含了三个层次的FFT库函数,分别是natural C version, intrinsic C version, serial SA version,最后一个是汇编级。在DSP_fft16x16_d.c中有三个测试用例对比耗时。三个函数用法差不多,例如:
void DSP_fft16x16_cn ( const short * ptr_w, int npoints, short * ptr_x, short * ptr_y ); void test_fft(void) { const int N = 512; short x[2*N], y[2*N], w[2*N]; gen_twiddle_fft16x16(w, N); double fs = 8000.0, f1 = 114.8, f2 = 186.2; short x8k[8000]; for (int i=0; i<8000; ++i) x8k[i] = 1800.0 * (cos(2*PI*f1*i/fs) + cos(2*PI*f2*i/fs)); short x5[500]; // downsample for (int i=0; i<500; ++i) x5[i] = x8k[16*i]; int k = 0; for (; k<6; k++) { x[2*k + 0] = 0; x[2*k + 1] = 0; } for (; k<106; k++) { x[2*k + 0] = x5[k-6]; x[2*k + 1] = 0; } for (; k<N; k++) { x[2*k + 0] = 0; x[2*k + 1] = 0; } DSP_fft16x16_cn(w, N, x, y); for (int i=0; i<N; ++i) { printf("%d\n", y[2*i]); printf("%d\n", y[2*i+1]); } }
输出y是实部、虚部交替,可以求出功率值。
其中库函数的等效C源码是:
1 /* ======================================================================== */ 2 /* TEXAS INSTRUMENTS, INC. */ 3 /* */ 4 /* NAME */ 5 /* DSP_fft16x16_cn -- fft16x16 */ 6 /* */ 7 /* USAGE */ 8 /* This routine is C-callable and can be called as: */ 9 /* */ 10 /* void DSP_fft16x16_cn ( */ 11 /* const short * ptr_w, */ 12 /* int npoints, */ 13 /* short * ptr_x, */ 14 /* short * ptr_y */ 15 /* ); */ 16 /* */ 17 /* ptr_w = input twiddle factors */ 18 /* npoints = number of points */ 19 /* ptr_x = transformed data reversed */ 20 /* ptr_y = linear transformed data */ 21 /* */ 22 /* */ 23 /* DESCRIPTION */ 24 /* The following code performs a mixed radix FFT for "npoints" which */ 25 /* is either a multiple of 4 or 2. It uses logN4 - 1 stages of radix4 */ 26 /* transform and performs either a radix2 or radix4 transform on the */ 27 /* last stage depending on "npoints". If "npoints" is a multiple of 4, */ 28 /* then this last stage is also a radix4 transform, otherwise it is a */ 29 /* radix2 transform. */ 30 /* */ 31 /* */ 32 /* int gen_twiddle_fft16x16(short *w, int n) */ 33 /* */ 34 /* int i, j, k; */ 35 /* double M = 32767.5; */ 36 /* */ 37 /* for (j = 1, k = 0; j < n >> 2; j = j << 2) */ 38 /* { */ 39 /* for (i = 0; i < n >> 2; i += j << 1) */ 40 /* { */ 41 /* */ 42 /* w[k + 11] = d2s(M * cos(6.0 * PI * (i + j) / n)); */ 43 /* w[k + 10] = d2s(M * sin(6.0 * PI * (i + j) / n)); */ 44 /* w[k + 9] = d2s(M * cos(6.0 * PI * (i ) / n)); */ 45 /* w[k + 8] = d2s(M * sin(6.0 * PI * (i ) / n)); */ 46 /* */ 47 /* w[k + 7] = d2s(M * cos(4.0 * PI * (i + j) / n)); */ 48 /* w[k + 6] = d2s(M * sin(4.0 * PI * (i + j) / n)); */ 49 /* w[k + 5] = d2s(M * cos(4.0 * PI * (i ) / n)); */ 50 /* w[k + 4] = d2s(M * sin(4.0 * PI * (i ) / n)); */ 51 /* */ 52 /* w[k + 3] = d2s(M * cos(2.0 * PI * (i + j) / n)); */ 53 /* w[k + 2] = d2s(M * sin(2.0 * PI * (i + j) / n)); */ 54 /* w[k + 1] = d2s(M * cos(2.0 * PI * (i ) / n)); */ 55 /* w[k + 0] = d2s(M * sin(2.0 * PI * (i ) / n)); */ 56 /* */ 57 /* k += 12; */ 58 /* */ 59 /* */ 60 /* } */ 61 /* } */ 62 /* */ 63 /* return k; */ 64 /* */ 65 /* */ 66 /* ASSUMPTIONS */ 67 /* This code works for both "npoints" a multiple of 2 or 4. */ 68 /* The arrays ‘x[]‘, ‘y[]‘, and ‘w[]‘ all must be aligned on a */ 69 /* double-word boundary for the "optimized" implementations. */ 70 /* */ 71 /* The input and output data are complex, with the real/imaginary */ 72 /* components stored in adjacent locations in the array. The real */ 73 /* components are stored at even array indices, and the imaginary */ 74 /* components are stored at odd array indices. */ 75 /* */ 76 /* TECHNIQUES */ 77 /* The following C code represents an implementation of the Cooley */ 78 /* Tukey radix 4 DIF FFT. It accepts the inputs in normal order and */ 79 /* produces the outputs in digit reversed order. The natural C code */ 80 /* shown in this file on the other hand, accepts the inputs in nor- */ 81 /* mal order and produces the outputs in normal order. */ 82 /* */ 83 /* Several transformations have been applied to the original Cooley */ 84 /* Tukey code to produce the natural C code description shown here. */ 85 /* In order to understand these it would first be educational to */ 86 /* understand some of the issues involved in the conventional Cooley */ 87 /* Tukey FFT code. */ 88 /* */ 89 /* void radix4(int n, short x[], short wn[]) */ 90 /* { */ 91 /* int n1, n2, ie, ia1, ia2, ia3; */ 92 /* int i0, i1, i2, i3, i, j, k; */ 93 /* short co1, co2, co3, si1, si2, si3; */ 94 /* short xt0, yt0, xt1, yt1, xt2, yt2; */ 95 /* short xh0, xh1, xh20, xh21, xl0, xl1,xl20,xl21; */ 96 /* */ 97 /* n2 = n; */ 98 /* ie = 1; */ 99 /* for (k = n; k > 1; k >>= 2) */ 100 /* { */ 101 /* n1 = n2; */ 102 /* n2 >>= 2; */ 103 /* ia1 = 0; */ 104 /* */ 105 /* for (j = 0; j < n2; j++) */ 106 /* { */ 107 /* ia2 = ia1 + ia1; */ 108 /* ia3 = ia2 + ia1; */ 109 /* */ 110 /* co1 = wn[2 * ia1 ]; */ 111 /* si1 = wn[2 * ia1 + 1]; */ 112 /* co2 = wn[2 * ia2 ]; */ 113 /* si2 = wn[2 * ia2 + 1]; */ 114 /* co3 = wn[2 * ia3 ]; */ 115 /* si3 = wn[2 * ia3 + 1]; */ 116 /* ia1 = ia1 + ie; */ 117 /* */ 118 /* for (i0 = j; i0< n; i0 += n1) */ 119 /* { */ 120 /* i1 = i0 + n2; */ 121 /* i2 = i1 + n2; */ 122 /* i3 = i2 + n2; */ 123 /* */ 124 /* xh0 = x[2 * i0 ] + x[2 * i2 ]; */ 125 /* xh1 = x[2 * i0 + 1] + x[2 * i2 + 1]; */ 126 /* xl0 = x[2 * i0 ] - x[2 * i2 ]; */ 127 /* xl1 = x[2 * i0 + 1] - x[2 * i2 + 1]; */ 128 /* */ 129 /* xh20 = x[2 * i1 ] + x[2 * i3 ]; */ 130 /* xh21 = x[2 * i1 + 1] + x[2 * i3 + 1]; */ 131 /* xl20 = x[2 * i1 ] - x[2 * i3 ]; */ 132 /* xl21 = x[2 * i1 + 1] - x[2 * i3 + 1]; */ 133 /* */ 134 /* x[2 * i0 ] = xh0 + xh20; */ 135 /* x[2 * i0 + 1] = xh1 + xh21; */ 136 /* */ 137 /* xt0 = xh0 - xh20; */ 138 /* yt0 = xh1 - xh21; */ 139 /* xt1 = xl0 + xl21; */ 140 /* yt2 = xl1 + xl20; */ 141 /* xt2 = xl0 - xl21; */ 142 /* yt1 = xl1 - xl20; */ 143 /* */ 144 /* x[2 * i1 ] = (xt1 * co1 + yt1 * si1) >> 15; */ 145 /* x[2 * i1 + 1] = (yt1 * co1 - xt1 * si1) >> 15; */ 146 /* x[2 * i2 ] = (xt0 * co2 + yt0 * si2) >> 15; */ 147 /* x[2 * i2 + 1] = (yt0 * co2 - xt0 * si2) >> 15; */ 148 /* x[2 * i3 ] = (xt2 * co3 + yt2 * si3) >> 15; */ 149 /* x[2 * i3 + 1] = (yt2 * co3 - xt2 * si3) >> 15; */ 150 /* } */ 151 /* } */ 152 /* ie <<= 2; */ 153 /* } */ 154 /* } */ 155 /* */ 156 /* The conventional Cooley Tukey FFT, is written using three loops. */ 157 /* The outermost loop "k" cycles through the stages. There are log */ 158 /* N to the base 4 stages in all. The loop "j" cycles through the */ 159 /* groups of butterflies with different twiddle factors, loop "i" */ 160 /* reuses the twiddle factors for the different butterflies within */ 161 /* a stage. It is interesting to note the following: */ 162 /* */ 163 /* ------------------------------------------------------------------------ */ 164 /* Stage# #Groups # Butterflies with common #Groups*Bflys */ 165 /* twiddle factors */ 166 /* ------------------------------------------------------------------------ */ 167 /* 1 N/4 1 N/4 */ 168 /* 2 N/16 4 N/4 */ 169 /* .. */ 170 /* logN 1 N/4 N/4 */ 171 /* ------------------------------------------------------------------------ */ 172 /* */ 173 /* The following statements can be made based on above observations: */ 174 /* */ 175 /* a) Inner loop "i0" iterates a veriable number of times. In */ 176 /* particular the number of iterations quadruples every time from */ 177 /* 1..N/4. Hence software pipelining a loop that iterates a vraiable */ 178 /* number of times is not profitable. */ 179 /* */ 180 /* b) Outer loop "j" iterates a variable number of times as well. */ 181 /* However the number of iterations is quartered every time from */ 182 /* N/4 ..1. Hence the behaviour in (a) and (b) are exactly opposite */ 183 /* to each other. */ 184 /* */ 185 /* c) If the two loops "i" and "j" are colaesced together then they */ 186 /* will iterate for a fixed number of times namely N/4. This allows */ 187 /* us to combine the "i" and "j" loops into 1 loop. Optimized impl- */ 188 /* ementations will make use of this fact. */ 189 /* */ 190 /* In addition the Cooley Tukey FFT accesses three twiddle factors */ 191 /* per iteration of the inner loop, as the butterflies that re-use */ 192 /* twiddle factors are lumped together. This leads to accessing the */ 193 /* twiddle factor array at three points each sepearted by "ie". Note */ 194 /* that "ie" is initially 1, and is quadrupled with every iteration. */ 195 /* Therfore these three twiddle factors are not even contiguous in */ 196 /* the array. */ 197 /* */ 198 /* In order to vectorize the FFT, it is desirable to access twiddle */ 199 /* factor array using double word wide loads and fetch the twiddle */ 200 /* factors needed. In order to do this a modified twiddle factor */ 201 /* array is created, in which the factors WN/4, WN/2, W3N/4 are */ 202 /* arranged to be contiguous. This eliminates the seperation between */ 203 /* twiddle factors within a butterfly. However this implies that as */ 204 /* the loop is traversed from one stage to another, that we maintain */ 205 /* a redundant version of the twiddle factor array. Hence the size */ 206 /* of the twiddle factor array increases as compared to the normal */ 207 /* Cooley Tukey FFT. The modified twiddle factor array is of size */ 208 /* "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4" */ 209 /* where N is the number of complex points to be transformed. The */ 210 /* routine that generates the modified twiddle factor array was */ 211 /* presented earlier. With the above transformation of the FFT, */ 212 /* both the input data and the twiddle factor array can be accessed */ 213 /* using double-word wide loads to enable packed data processing. */ 214 /* */ 215 /* The final stage is optimised to remove the multiplication as */ 216 /* w0 = 1. This stage also performs digit reversal on the data, */ 217 /* so the final output is in natural order. */ 218 /* */ 219 /* The fft() code shown here performs the bulk of the computation */ 220 /* in place. However, because digit-reversal cannot be performed */ 221 /* in-place, the final result is written to a separate array, y[]. */ 222 /* */ 223 /* The actual twiddle factors for the FFT are cosine, - sine. The */ 224 /* twiddle factors stored in the table are csine and sine, hence */ 225 /* the sign of the "sine" term is comprehended during multipli- */ 226 /* cation as shown above. */ 227 /* */ 228 /* MEMORY NOTE */ 229 /* The optimized implementations are written for LITTLE ENDIAN. */ 230 /* */ 231 /* ------------------------------------------------------------------------ */ 232 /* Copyright (c) 2007 Texas Instruments, Incorporated. */ 233 /* All Rights Reserved. */ 234 /* ======================================================================== */ 235 #pragma CODE_SECTION(DSP_fft16x16_cn, ".text:ansi"); 236 237 #include "DSP_fft16x16_cn.h" 238 239 /*--------------------------------------------------------------------------*/ 240 /* The following macro is used to obtain a digit reversed index, of a given */ 241 /* number i, into j where the number of bits in "i" is "m". For the natural */ 242 /* form of C code, this is done by first interchanging every set of "2 bit" */ 243 /* pairs, followed by exchanging nibbles, followed by exchanging bytes, and */ 244 /* finally halfwords. To give an example, condider the following number: */ 245 /* */ 246 /* N = FEDCBA9876543210, where each digit represents a bit, the following */ 247 /* steps illustrate the changes as the exchanges are performed: */ 248 /* M = DCFE98BA54761032 is the number after every "2 bits" are exchanged. */ 249 /* O = 98BADCFE10325476 is the number after every nibble is exchanged. */ 250 /* P = 1032547698BADCFE is the number after every byte is exchanged. */ 251 /* Since only 16 digits were considered this represents the digit reversed */ 252 /* index. Since the numbers are represented as 32 bits, there is one more */ 253 /* step typically of exchanging the half words as well. */ 254 /*--------------------------------------------------------------------------*/ 255 #if 0 256 # define DIG_REV(i, m, j) ((j) = (_shfl(_rotl(_bitr(_deal(i)), 16)) >> (m))) 257 #else 258 # define DIG_REV(i, m, j) 259 do { 260 unsigned _ = (i); 261 _ = ((_ & 0x33333333) << 2) | ((_ & ~0x33333333) >> 2); 262 _ = ((_ & 0x0F0F0F0F) << 4) | ((_ & ~0x0F0F0F0F) >> 4); 263 _ = ((_ & 0x00FF00FF) << 8) | ((_ & ~0x00FF00FF) >> 8); 264 _ = ((_ & 0x0000FFFF) << 16) | ((_ & ~0x0000FFFF) >> 16); 265 (j) = _ >> (m); 266 } while (0) 267 #endif 268 269 270 void DSP_fft16x16_cn ( 271 const short * ptr_w, 272 int npoints, 273 short * ptr_x, 274 short * ptr_y 275 ) 276 { 277 const short *w; 278 short * x, * x2, * x0; 279 short * y0, * y1, * y2, *y3; 280 281 short xt0_0, yt0_0, xt1_0, yt1_0, xt2_0, yt2_0; 282 short xt0_1, yt0_1, xt1_1, yt1_1, xt2_1, yt2_1; 283 short xh0_0, xh1_0, xh20_0, xh21_0, xl0_0, xl1_0, xl20_0, xl21_0; 284 short xh0_1, xh1_1, xh20_1, xh21_1, xl0_1, xl1_1, xl20_1, xl21_1; 285 short x_0, x_1, x_2, x_3, x_l1_0, x_l1_1, x_l1_2, x_l1_3, x_l2_0, x_l2_1; 286 short xh0_2, xh1_2, xl0_2, xl1_2, xh0_3, xh1_3, xl0_3, xl1_3; 287 short x_4, x_5, x_6, x_7, x_l2_2, x_l2_3, x_h2_0, x_h2_1, x_h2_2, x_h2_3; 288 short x_8, x_9, x_a, x_b, x_c, x_d, x_e, x_f; 289 short si10, si20, si30, co10, co20, co30; 290 short si11, si21, si31, co11, co21, co31; 291 short n00, n10, n20, n30, n01, n11, n21, n31; 292 short n02, n12, n22, n32, n03, n13, n23, n33; 293 short n0, j0; 294 295 int i, j, l1, l2, h2, predj, tw_offset, stride, fft_jmp; 296 int radix, norm, m; 297 298 /*---------------------------------------------------------------------*/ 299 /* Determine the magnitude od the number of points to be transformed. */ 300 /* Check whether we can use a radix4 decomposition or a mixed radix */ 301 /* transformation, by determining modulo 2. */ 302 /*---------------------------------------------------------------------*/ 303 for (i = 31, m = 1; (npoints & (1 << i)) == 0; i--, m++) 304 ; 305 norm = m - 2; 306 307 radix = m & 1 ? 2 : 4; 308 309 /*----------------------------------------------------------------------*/ 310 /* The stride is quartered with every iteration of the outer loop. It */ 311 /* denotes the seperation between any two adjacent inputs to the butter */ 312 /* -fly. This should start out at N/4, hence stride is initially set to */ 313 /* N. For every stride, 6*stride twiddle factors are accessed. The */ 314 /* "tw_offset" is the offset within the current twiddle factor sub- */ 315 /* table. This is set to zero, at the start of the code and is used to */ 316 /* obtain the appropriate sub-table twiddle pointer by offseting it */ 317 /* with the base pointer "ptr_w". */ 318 /*----------------------------------------------------------------------*/ 319 320 stride = npoints; 321 tw_offset = 0; 322 fft_jmp = 6 * stride; 323 324 #ifndef NOASSUME 325 _nassert(stride > 4); 326 #pragma MUST_ITERATE(1,,1); 327 #endif 328 329 while (stride > 4) { 330 /*-----------------------------------------------------------------*/ 331 /* At the start of every iteration of the outer loop, "j" is set */ 332 /* to zero, as "w" is pointing to the correct location within the */ 333 /* twiddle factor array. For every iteration of the inner loop */ 334 /* 6 * stride twiddle factors are accessed. For eg, */ 335 /* */ 336 /* #Iteration of outer loop # twiddle factors #times cycled */ 337 /* 1 6 N/4 1 */ 338 /* 2 6 N/16 4 */ 339 /* ... */ 340 /*-----------------------------------------------------------------*/ 341 j = 0; 342 fft_jmp >>= 2; 343 344 /*-----------------------------------------------------------------*/ 345 /* Set up offsets to access "N/4", "N/2", "3N/4" complex point or */ 346 /* "N/2", "N", "3N/2" half word */ 347 /*-----------------------------------------------------------------*/ 348 h2 = stride >> 1; 349 l1 = stride; 350 l2 = stride + (stride >> 1); 351 352 /*-----------------------------------------------------------------*/ 353 /* Reset "x" to point to the start of the input data array. */ 354 /* "tw_offset" starts off at 0, and increments by "6 * stride" */ 355 /* The stride quarters with every iteration of the outer loop */ 356 /*-----------------------------------------------------------------*/ 357 x = ptr_x; 358 w = ptr_w + tw_offset; 359 tw_offset += fft_jmp; 360 stride >>= 2; 361 362 /*----------------------------------------------------------------*/ 363 /* The following loop iterates through the different butterflies, */ 364 /* within a given stage. Recall that there are logN to base 4 */ 365 /* stages. Certain butterflies share the twiddle factors. These */ 366 /* are grouped together. On the very first stage there are no */ 367 /* butterflies that share the twiddle factor, all N/4 butter- */ 368 /* flies have different factors. On the next stage two sets of */ 369 /* N/8 butterflies share the same twiddle factor. Hence after */ 370 /* half the butterflies are performed, j the index into the */ 371 /* factor array resets to 0, and the twiddle factors are reused. */ 372 /* When this happens, the data pointer ‘x‘ is incremented by the */ 373 /* fft_jmp amount. In addition the following code is unrolled to */ 374 /* perform "2" radix4 butterflies in parallel. */ 375 /*----------------------------------------------------------------*/ 376 #ifndef NOASSUME 377 _nassert((int)(w) % 8 == 0); 378 _nassert((int)(x) % 8 == 0); 379 _nassert(h2 % 8 == 0); 380 _nassert(l1 % 8 == 0); 381 _nassert(l2 % 8 == 0); 382 #pragma MUST_ITERATE(1,,1); 383 #endif 384 385 for (i = 0; i < (npoints >> 3); i ++) { 386 /*------------------------------------------------------------*/ 387 /* Read the first 12 twiddle factors, six of which are used */ 388 /* for one radix 4 butterfly and six of which are used for */ 389 /* next one. */ 390 /*------------------------------------------------------------*/ 391 392 // twiddle factors for first butterfly 393 co10 = w[j+1]; 394 si10 = w[j+0]; 395 co20 = w[j+5]; 396 si20 = w[j+4]; 397 co30 = w[j+9]; 398 si30 = w[j+8]; 399 400 // twiddle factors for second butterfly 401 co11 = w[j+3]; 402 si11 = w[j+2]; 403 co21 = w[j+7]; 404 si21 = w[j+6]; 405 co31 = w[j+11]; 406 si31 = w[j+10]; 407 408 /*------------------------------------------------------------*/ 409 /* Read in the first complex input for the butterflies. */ 410 /* 1st complex input to 1st butterfly: x[0] + jx[1] */ 411 /* 1st complex input to 2nd butterfly: x[2] + jx[3] */ 412 /*------------------------------------------------------------*/ 413 x_0 = x[0]; // Re[x(k)] 414 x_1 = x[1]; // Im[x(k)] 415 x_2 = x[2]; // second butterfly 416 x_3 = x[3]; 417 418 /*------------------------------------------------------------*/ 419 /* Read in the complex inputs for the butterflies. Each of the*/ 420 /* successive complex inputs of the butterfly are seperated */ 421 /* by a fixed amount known as stride. The stride starts out */ 422 /* at N/4, and quarters with every stage. */ 423 /*------------------------------------------------------------*/ 424 x_l1_0 = x[l1 ]; // Re[x(k+N/2)] 425 x_l1_1 = x[l1+1]; // Im[x(k+N/2)] 426 x_l1_2 = x[l1+2]; // second butterfly 427 x_l1_3 = x[l1+3]; 428 429 x_l2_0 = x[l2 ]; // Re[x(k+3*N/2)] 430 x_l2_1 = x[l2+1]; // Im[x(k+3*N/2)] 431 x_l2_2 = x[l2+2]; // second butterfly 432 x_l2_3 = x[l2+3]; 433 434 x_h2_0 = x[h2 ]; // Re[x(k+N/4)] 435 x_h2_1 = x[h2+1]; // Im[x(k+N/4)] 436 x_h2_2 = x[h2+2]; // second butterfly 437 x_h2_3 = x[h2+3]; 438 439 /*-----------------------------------------------------------*/ 440 /* Two butterflies are evaluated in parallel. The following */ 441 /* results will be shown for one butterfly only, although */ 442 /* both are being evaluated in parallel. */ 443 /* */ 444 /* Perform radix2 style DIF butterflies. */ 445 /*-----------------------------------------------------------*/ 446 xh0_0 = x_0 + x_l1_0; xh1_0 = x_1 + x_l1_1; 447 xh0_1 = x_2 + x_l1_2; xh1_1 = x_3 + x_l1_3; 448 449 xl0_0 = x_0 - x_l1_0; xl1_0 = x_1 - x_l1_1; 450 xl0_1 = x_2 - x_l1_2; xl1_1 = x_3 - x_l1_3; 451 452 xh20_0 = x_h2_0 + x_l2_0; xh21_0 = x_h2_1 + x_l2_1; 453 xh20_1 = x_h2_2 + x_l2_2; xh21_1 = x_h2_3 + x_l2_3; 454 455 xl20_0 = x_h2_0 - x_l2_0; xl21_0 = x_h2_1 - x_l2_1; 456 xl20_1 = x_h2_2 - x_l2_2; xl21_1 = x_h2_3 - x_l2_3; 457 458 /*-----------------------------------------------------------*/ 459 /* Derive output pointers using the input pointer "x" */ 460 /*-----------------------------------------------------------*/ 461 x0 = x; 462 x2 = x0; 463 464 /*-----------------------------------------------------------*/ 465 /* When the twiddle factors are not to be re-used, j is */ 466 /* incremented by 12, to reflect the fact that 12 half words */ 467 /* are consumed in every iteration. The input data pointer */ 468 /* increments by 4. Note that within a stage, the stride */ 469 /* does not change and hence the offsets for the other three */ 470 /* legs, 0, h2, l1, l2. */ 471 /*-----------------------------------------------------------*/ 472 j += 12; 473 x += 4; 474 475 predj = (j - fft_jmp); 476 if (!predj) x += fft_jmp; 477 if (!predj) j = 0; 478 479 /*----------------------------------------------------------*/ 480 /* These four partial results can be re-written to show */ 481 /* the underlying DIF structure similar to radix2 as */ 482 /* follows: */ 483 /* */ 484 /* X(4k) = (x(n)+x(n + N/2)) + (x(n+N/4)+ x(n + 3N/4)) */ 485 /* X(4k+1)= (x(n)-x(n + N/2)) -j(x(n+N/4) - x(n + 3N/4)) */ 486 /* x(4k+2)= (x(n)+x(n + N/2)) - (x(n+N/4)+ x(n + 3N/4)) */ 487 /* X(4k+3)= (x(n)-x(n + N/2)) +j(x(n+N/4) - x(n + 3N/4)) */ 488 /* */ 489 /* which leads to the real and imaginary values as foll: */ 490 /* */ 491 /* y0r = x0r + x2r + x1r + x3r = xh0 + xh20 */ 492 /* y0i = x0i + x2i + x1i + x3i = xh1 + xh21 */ 493 /* y1r = x0r - x2r + (x1i - x3i) = xl0 + xl21 */ 494 /* y1i = x0i - x2i - (x1r - x3r) = xl1 - xl20 */ 495 /* y2r = x0r + x2r - (x1r + x3r) = xh0 - xh20 */ 496 /* y2i = x0i + x2i - (x1i + x3i = xh1 - xh21 */ 497 /* y3r = x0r - x2r - (x1i - x3i) = xl0 - xl21 */ 498 /* y3i = x0i - x2i + (x1r - x3r) = xl1 + xl20 */ 499 /* ---------------------------------------------------------*/ 500 x0[0] = (xh0_0 + xh20_0 + 1) >> 1; 501 x0[1] = (xh1_0 + xh21_0 + 1) >> 1; 502 x0[2] = (xh0_1 + xh20_1 + 1) >> 1; 503 x0[3] = (xh1_1 + xh21_1 + 1) >> 1; 504 505 xt0_0 = xh0_0 - xh20_0; 506 yt0_0 = xh1_0 - xh21_0; 507 xt0_1 = xh0_1 - xh20_1; 508 yt0_1 = xh1_1 - xh21_1; 509 510 xt1_0 = xl0_0 + xl21_0; 511 yt2_0 = xl1_0 + xl20_0; 512 xt2_0 = xl0_0 - xl21_0; 513 yt1_0 = xl1_0 - xl20_0; 514 515 xt1_1 = xl0_1 + xl21_1; yt2_1 = xl1_1 + xl20_1; 516 xt2_1 = xl0_1 - xl21_1; yt1_1 = xl1_1 - xl20_1; 517 518 /*---------------------------------------------------------*/ 519 /* Perform twiddle factor multiplies of three terms,top */ 520 /* term does not have any multiplies. Note the twiddle */ 521 /* factors for a normal FFT are C + j (-S). Since the */ 522 /* factors that are stored are C + j S, this is */ 523 /* corrected for in the multiplies. */ 524 /* */ 525 /* Y1 = (xt1 + jyt1) (c + js) = (xc + ys) + (yc -xs) */ 526 /*---------------------------------------------------------*/ 527 x2[l1 ] = (-co20 * xt0_0 - si20 * yt0_0 + 0x8000) >> 16; 528 x2[l1+1] = (-co20 * yt0_0 + si20 * xt0_0 + 0x8000) >> 16; 529 530 x2[l1+2] = (-co21 * xt0_1 - si21 * yt0_1 + 0x8000) >> 16; 531 x2[l1+3] = (-co21 * yt0_1 + si21 * xt0_1 + 0x8000) >> 16; 532 533 x2[h2 ] = (co10 * xt1_0 + si10 * yt1_0 + 0x8000) >> 16; 534 x2[h2+1] = (co10 * yt1_0 - si10 * xt1_0 + 0x8000) >> 16; 535 536 x2[h2+2] = (co11 * xt1_1 + si11 * yt1_1 + 0x8000) >> 16; 537 x2[h2+3] = (co11 * yt1_1 - si11 * xt1_1 + 0x8000) >> 16; 538 539 x2[l2 ] = (co30 * xt2_0 + si30 * yt2_0 + 0x8000) >> 16; 540 x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0 + 0x8000) >> 16; 541 542 x2[l2+2] = (co31 * xt2_1 + si31 * yt2_1 + 0x8000) >> 16; 543 x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1 + 0x8000) >> 16; 544 } 545 } 546 547 /*-----------------------------------------------------------------*/ 548 /* The following code performs either a standard radix4 pass or a */ 549 /* radix2 pass. Two pointers are used to access the input data. */ 550 /* The input data is read "N/4" complex samples apart or "N/2" */ 551 /* words apart using pointers "x0" and "x2". This produces out- */ 552 /* puts that are 0, N/4, N/2, 3N/4 for a radix4 FFT, and 0, N/8 */ 553 /* N/2, 3N/8 for radix 2. */ 554 /*-----------------------------------------------------------------*/ 555 y0 = ptr_y; 556 y2 = ptr_y + (int)npoints; 557 x0 = ptr_x; 558 x2 = ptr_x + (int)(npoints >> 1); 559 560 if (radix == 2) { 561 /*----------------------------------------------------------------*/ 562 /* The pointers are set at the following locations which are half */ 563 /* the offsets of a radix4 FFT. */ 564 /*----------------------------------------------------------------*/ 565 y1 = y0 + (int)(npoints >> 2); 566 y3 = y2 + (int)(npoints >> 2); 567 l1 = norm + 1; 568 j0 = 8; 569 n0 = npoints >> 1; 570 } 571 else { 572 y1 = y0 + (int)(npoints >> 1); 573 y3 = y2 + (int)(npoints >> 1); 574 l1 = norm + 2; 575 j0 = 4; 576 n0 = npoints >> 2; 577 } 578 579 /*--------------------------------------------------------------------*/ 580 /* The following code reads data indentically for either a radix 4 */ 581 /* or a radix 2 style decomposition. It writes out at different */ 582 /* locations though. It checks if either half the points, or a */ 583 /* quarter of the complex points have been exhausted to jump to */ 584 /* pervent double reversal. */ 585 /*--------------------------------------------------------------------*/ 586 j = 0; 587 588 #ifndef NOASSUME 589 _nassert((int)(n0) % 4 == 0); 590 _nassert((int)(x0) % 8 == 0); 591 _nassert((int)(x2) % 8 == 0); 592 _nassert((int)(y0) % 8 == 0); 593 #pragma MUST_ITERATE(2,,2); 594 #endif 595 596 for (i = 0; i < npoints; i += 8) { 597 /*----------------------------------------------------------------*/ 598 /* Digit reverse the index starting from 0. The increment to "j" */ 599 /* is either by 4, or 8. */ 600 /*----------------------------------------------------------------*/ 601 DIG_REV(j, l1, h2); 602 603 /*----------------------------------------------------------------*/ 604 /* Read in the input data, from the first eight locations. These */ 605 /* are transformed either as a radix4 or as a radix 2. */ 606 /*----------------------------------------------------------------*/ 607 x_0 = x0[0]; 608 x_1 = x0[1]; 609 x_2 = x0[2]; 610 x_3 = x0[3]; 611 x_4 = x0[4]; 612 x_5 = x0[5]; 613 x_6 = x0[6]; 614 x_7 = x0[7]; 615 x0 += 8; 616 617 xh0_0 = x_0 + x_4; 618 xh1_0 = x_1 + x_5; 619 xl0_0 = x_0 - x_4; 620 xl1_0 = x_1 - x_5; 621 xh0_1 = x_2 + x_6; 622 xh1_1 = x_3 + x_7; 623 xl0_1 = x_2 - x_6; 624 xl1_1 = x_3 - x_7; 625 626 n00 = xh0_0 + xh0_1; 627 n01 = xh1_0 + xh1_1; 628 n10 = xl0_0 + xl1_1; 629 n11 = xl1_0 - xl0_1; 630 n20 = xh0_0 - xh0_1; 631 n21 = xh1_0 - xh1_1; 632 n30 = xl0_0 - xl1_1; 633 n31 = xl1_0 + xl0_1; 634 635 if (radix == 2) { 636 /*-------------------------------------------------------------*/ 637 /* Perform radix2 style decomposition. */ 638 /*-------------------------------------------------------------*/ 639 n00 = x_0 + x_2; 640 n01 = x_1 + x_3; 641 n20 = x_0 - x_2; 642 n21 = x_1 - x_3; 643 n10 = x_4 + x_6; 644 n11 = x_5 + x_7; 645 n30 = x_4 - x_6; 646 n31 = x_5 - x_7; 647 } 648 649 y0[2*h2] = n00; 650 y0[2*h2 + 1] = n01; 651 y1[2*h2] = n10; 652 y1[2*h2 + 1] = n11; 653 y2[2*h2] = n20; 654 y2[2*h2 + 1] = n21; 655 y3[2*h2] = n30; 656 y3[2*h2 + 1] = n31; 657 658 /*----------------------------------------------------------------*/ 659 /* Read in ht enext eight inputs, and perform radix4 or radix2 */ 660 /* decomposition. */ 661 /*----------------------------------------------------------------*/ 662 x_8 = x2[0]; x_9 = x2[1]; 663 x_a = x2[2]; x_b = x2[3]; 664 x_c = x2[4]; x_d = x2[5]; 665 x_e = x2[6]; x_f = x2[7]; 666 x2 += 8; 667 668 xh0_2 = x_8 + x_c; xh1_2 = x_9 + x_d; 669 xl0_2 = x_8 - x_c; xl1_2 = x_9 - x_d; 670 xh0_3 = x_a + x_e; xh1_3 = x_b + x_f; 671 xl0_3 = x_a - x_e; xl1_3 = x_b - x_f; 672 673 n02 = xh0_2 + xh0_3; n03 = xh1_2 + xh1_3; 674 n12 = xl0_2 + xl1_3; n13 = xl1_2 - xl0_3; 675 n22 = xh0_2 - xh0_3; n23 = xh1_2 - xh1_3; 676 n32 = xl0_2 - xl1_3; n33 = xl1_2 + xl0_3; 677 678 if (radix == 2) { 679 n02 = x_8 + x_a; n03 = x_9 + x_b; 680 n22 = x_8 - x_a; n23 = x_9 - x_b; 681 n12 = x_c + x_e; n13 = x_d + x_f; 682 n32 = x_c - x_e; n33 = x_d - x_f; 683 } 684 685 /*-----------------------------------------------------------------*/ 686 /* Points that are read from succesive locations map to y, y[N/4] */ 687 /* y[N/2], y[3N/4] in a radix4 scheme, y, y[N/8], y[N/2],y[5N/8] */ 688 /*-----------------------------------------------------------------*/ 689 y0[2*h2+2] = n02; y0[2*h2+3] = n03; 690 y1[2*h2+2] = n12; y1[2*h2+3] = n13; 691 y2[2*h2+2] = n22; y2[2*h2+3] = n23; 692 y3[2*h2+2] = n32; y3[2*h2+3] = n33; 693 694 j += j0; 695 if (j == n0) { 696 j += n0; 697 x0 += (int)npoints >> 1; 698 x2 += (int)npoints >> 1; 699 } 700 } 701 } 702 703 /* ======================================================================== */ 704 /* End of file: DSP_fft16x16_cn.c */ 705 /* ------------------------------------------------------------------------ */ 706 /* Copyright (C) 2007 Texas Instruments, Incorporated. */ 707 /* All Rights Reserved. */ 708 /* ======================================================================== */
用于计算旋转因子的gen_twiddle_fft16x16函数在同一目录下,源代码是:
1 /* ======================================================================== */ 2 /* gen_twiddle_fft16x16.c -- File with twiddle factor generators. */ 3 /* ======================================================================== */ 4 /* This code requires a special sequence of twiddle factors stored */ 5 /* in 1Q15 fixed-point format. The following C code is used for */ 6 /* the natural C and intrinsic C implementations. */ 7 /* */ 8 /* In order to vectorize the FFT, it is desirable to access twiddle */ 9 /* factor array using double word wide loads and fetch the twiddle */ 10 /* factors needed. In order to do this a modified twiddle factor */ 11 /* array is created, in which the factors WN/4, WN/2, W3N/4 are */ 12 /* arranged to be contiguous. This eliminates the seperation between */ 13 /* twiddle factors within a butterfly. However this implies that as */ 14 /* the loop is traversed from one stage to another, that we maintain */ 15 /* a redundant version of the twiddle factor array. Hence the size */ 16 /* of the twiddle factor array increases as compared to the normal */ 17 /* Cooley Tukey FFT. The modified twiddle factor array is of size */ 18 /* "2 * N" where the conventional Cooley Tukey FFT is of size"3N/4" */ 19 /* where N is the number of complex points to be transformed. The */ 20 /* routine that generates the modified twiddle factor array was */ 21 /* presented earlier. With the above transformation of the FFT, */ 22 /* both the input data and the twiddle factor array can be accessed */ 23 /* using double-word wide loads to enable packed data processing. */ 24 /* */ 25 /* ------------------------------------------------------------------------ */ 26 /* Copyright (c) 2007 Texas Instruments, Incorporated. */ 27 /* All Rights Reserved. */ 28 /* ======================================================================== */ 29 #include <math.h> 30 #include "gen_twiddle_fft16x16.h" 31 32 #ifndef PI 33 # ifdef M_PI 34 # define PI M_PI 35 # else 36 # define PI 3.14159265358979323846 37 # endif 38 #endif 39 40 41 /* ======================================================================== */ 42 /* D2S -- Truncate a ‘double‘ to a ‘short‘, with clamping. */ 43 /* ======================================================================== */ 44 static short d2s(double d) 45 { 46 d = floor(0.5 + d); // Explicit rounding to integer // 47 if (d >= 32767.0) return 32767; 48 if (d <= -32768.0) return -32768; 49 return (short)d; 50 } 51 52 53 /* ======================================================================== */ 54 /* GEN_TWIDDLE -- Generate twiddle factors for TI‘s custom FFTs. */ 55 /* */ 56 /* USAGE */ 57 /* This routine is called as follows: */ 58 /* */ 59 /* int gen_twiddle_fft16x16(short *w, int n) */ 60 /* */ 61 /* short *w Pointer to twiddle-factor array */ 62 /* int n Size of FFT */ 63 /* */ 64 /* The routine will generate the twiddle-factors directly into the */ 65 /* array you specify. The array needs to be approximately 2*N */ 66 /* elements long. (The actual size, which is slightly smaller, is */ 67 /* returned by the function.) */ 68 /* ======================================================================== */ 69 int gen_twiddle_fft16x16(short *w, int n) 70 { 71 int i, j, k; 72 double M = 32767.5; 73 74 for (j = 1, k = 0; j < n >> 2; j = j << 2) { 75 for (i = 0; i < n >> 2; i += j << 1) { 76 w[k + 11] = d2s(M * cos(6.0 * PI * (i + j) / n)); 77 w[k + 10] = d2s(M * sin(6.0 * PI * (i + j) / n)); 78 w[k + 9] = d2s(M * cos(6.0 * PI * (i ) / n)); 79 w[k + 8] = d2s(M * sin(6.0 * PI * (i ) / n)); 80 81 w[k + 7] = -d2s(M * cos(4.0 * PI * (i + j) / n)); 82 w[k + 6] = -d2s(M * sin(4.0 * PI * (i + j) / n)); 83 w[k + 5] = -d2s(M * cos(4.0 * PI * (i ) / n)); 84 w[k + 4] = -d2s(M * sin(4.0 * PI * (i ) / n)); 85 86 w[k + 3] = d2s(M * cos(2.0 * PI * (i + j) / n)); 87 w[k + 2] = d2s(M * sin(2.0 * PI * (i + j) / n)); 88 w[k + 1] = d2s(M * cos(2.0 * PI * (i ) / n)); 89 w[k + 0] = d2s(M * sin(2.0 * PI * (i ) / n)); 90 91 k += 12; 92 } 93 } 94 return k; 95 } 96 97 /* ======================================================================= */ 98 /* End of file: gen_twiddle_fft16x16.c */ 99 /* ----------------------------------------------------------------------- */ 100 /* Copyright (c) 2007 Texas Instruments, Incorporated. */ 101 /* All Rights Reserved. */ 102 /* ======================================================================= */
在c64plus\dsplib_v210\src目录下有其他几个FFT及其余函数的源代码。特别地,在c64plus\dsplib_v210\example目录下有fft_example目录,该目录下fft_example.c列举了几个FFT的例子。和上面的代码差不多。TI DSP的FFT库函数大致是这么使用的。
标签:initial \n offset serve was 2.0 dea created sam
原文地址:https://www.cnblogs.com/songyj06/p/fft.html