Skip to content

Commit b7571f5

Browse files
singalsulgirdwood
authored andcommitted
Math: FFT-multi: Add Xtensa HiFi versions for hot code functions
This patch adds HiFi3 versions for functions dft3_32() and fft_multi_execute_32(). The functions are implemented to fft_multi_hifi3.c and the generic versions are moved to fft_multi_generic.c. in MTL platform the optimization saves 119 MCPS, from 237 MCPS to 118 MCPS. The test was done with script run: scripts/rebuild-testbench.sh -p mtl scripts/sof-testbench-helper.sh -x -m stft_process_1536_240_ \ -p profile-stft_process.txt The above STFT used FFT length of 1536 with hop 240. Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent 2967f45 commit b7571f5

File tree

6 files changed

+444
-157
lines changed

6 files changed

+444
-157
lines changed

src/include/sof/math/fft.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,8 @@ void mod_fft_multi_plan_free(struct processing_module *mod, struct fft_multi_pla
9898

9999
/**
100100
* dft3_32() - Discrete Fourier Transform (DFT) for size 3.
101-
* @param input: Pointer to complex values input array, Q1.31
102-
* @param output: Pointer to complex values output array, Q3.29
101+
* @param input: Pointer to complex values input array, Q1.31.
102+
* @param output: Pointer to complex values output array, Q1.31, scaled down by 1/3.
103103
*
104104
* This function is useful for calculating some non power of two FFTs. E.g. the
105105
* FFT for size 1536 is done with three 512 size FFTs and one 3 size DFT.

src/math/fft/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ if(CONFIG_MATH_32BIT_FFT)
1111
endif()
1212

1313
if(CONFIG_MATH_FFT_MULTI)
14-
list(APPEND base_files fft_multi.c)
14+
list(APPEND base_files fft_multi.c fft_multi_generic.c fft_multi_hifi3.c)
1515
endif()
1616

1717
is_zephyr(zephyr)

src/math/fft/fft_multi.c

Lines changed: 0 additions & 154 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,6 @@ LOG_MODULE_REGISTER(math_fft_multi, CONFIG_SOF_LOG_LEVEL);
1919
SOF_DEFINE_REG_UUID(math_fft_multi);
2020
DECLARE_TR_CTX(math_fft_multi_tr, SOF_UUID(math_fft_multi_uuid), LOG_LEVEL_INFO);
2121

22-
/* Constants for size 3 DFT */
23-
#define DFT3_COEFR -1073741824 /* int32(-0.5 * 2^31) */
24-
#define DFT3_COEFI 1859775393 /* int32(sqrt(3) / 2 * 2^31) */
25-
#define DFT3_SCALE 715827883 /* int32(1/3*2^31) */
26-
2722
struct fft_multi_plan *mod_fft_multi_plan_new(struct processing_module *mod, void *inb,
2823
void *outb, uint32_t size, int bits)
2924
{
@@ -143,152 +138,3 @@ void mod_fft_multi_plan_free(struct processing_module *mod, struct fft_multi_pla
143138
mod_free(mod, plan->bit_reverse_idx);
144139
mod_free(mod, plan);
145140
}
146-
147-
void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y)
148-
{
149-
const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI};
150-
const struct icomplex32 c1 = {DFT3_COEFR, DFT3_COEFI};
151-
struct icomplex32 x[3];
152-
struct icomplex32 p1, p2, sum;
153-
int i;
154-
155-
for (i = 0; i < 3; i++) {
156-
x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31);
157-
x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31);
158-
}
159-
160-
/*
161-
* | 1 1 1 |
162-
* c = | 1 c0 c1 | , x = [ x0 x1 x2 ]
163-
* | 1 c1 c0 |
164-
*
165-
* y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(0)
166-
* y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(0)
167-
* y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(0)
168-
*/
169-
170-
/* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */
171-
icomplex32_adds(&x[0], &x[1], &sum);
172-
icomplex32_adds(&x[2], &sum, &y[0]);
173-
174-
/* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */
175-
icomplex32_mul(&c0, &x[1], &p1);
176-
icomplex32_mul(&c1, &x[2], &p2);
177-
icomplex32_adds(&p1, &p2, &sum);
178-
icomplex32_adds(&x[0], &sum, &y[1]);
179-
180-
/* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */
181-
icomplex32_mul(&c1, &x[1], &p1);
182-
icomplex32_mul(&c0, &x[2], &p2);
183-
icomplex32_adds(&p1, &p2, &sum);
184-
icomplex32_adds(&x[0], &sum, &y[2]);
185-
}
186-
187-
void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft)
188-
{
189-
struct icomplex32 x[FFT_MULTI_COUNT_MAX];
190-
struct icomplex32 y[FFT_MULTI_COUNT_MAX];
191-
struct icomplex32 t, c;
192-
int i, j, k, m;
193-
194-
/* Handle 2^N FFT */
195-
if (plan->num_ffts == 1) {
196-
memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32));
197-
fft_execute_32(plan->fft_plan[0], ifft);
198-
return;
199-
}
200-
201-
#ifdef DEBUG_DUMP_TO_FILE
202-
FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w");
203-
FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w");
204-
FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w");
205-
FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w");
206-
#endif
207-
208-
/* convert to complex conjugate for IFFT */
209-
if (ifft) {
210-
for (i = 0; i < plan->total_size; i++)
211-
icomplex32_conj(&plan->inb32[i]);
212-
}
213-
214-
/* Copy input buffers */
215-
k = 0;
216-
for (i = 0; i < plan->fft_size; i++)
217-
for (j = 0; j < plan->num_ffts; j++)
218-
plan->tmp_i32[j][i] = plan->inb32[k++];
219-
220-
/* Clear output buffers and call individual FFTs*/
221-
for (j = 0; j < plan->num_ffts; j++) {
222-
memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32));
223-
fft_execute_32(plan->fft_plan[j], 0);
224-
}
225-
226-
#ifdef DEBUG_DUMP_TO_FILE
227-
for (j = 0; j < plan->num_ffts; j++)
228-
for (i = 0; i < plan->fft_size; i++)
229-
fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
230-
#endif
231-
232-
/* Multiply with twiddle factors */
233-
m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size;
234-
for (j = 1; j < plan->num_ffts; j++) {
235-
for (i = 0; i < plan->fft_size; i++) {
236-
c = plan->tmp_o32[j][i];
237-
k = j * i * m;
238-
t.real = multi_twiddle_real_32[k];
239-
t.imag = multi_twiddle_imag_32[k];
240-
//fprintf(fh3, "%d %d\n", t.real, t.imag);
241-
icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]);
242-
}
243-
}
244-
245-
#ifdef DEBUG_DUMP_TO_FILE
246-
for (j = 0; j < plan->num_ffts; j++)
247-
for (i = 0; i < plan->fft_size; i++)
248-
fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
249-
#endif
250-
251-
/* DFT of size 3 */
252-
j = plan->fft_size;
253-
k = 2 * plan->fft_size;
254-
for (i = 0; i < plan->fft_size; i++) {
255-
x[0] = plan->tmp_o32[0][i];
256-
x[1] = plan->tmp_o32[1][i];
257-
x[2] = plan->tmp_o32[2][i];
258-
dft3_32(x, y);
259-
plan->outb32[i] = y[0];
260-
plan->outb32[i + j] = y[1];
261-
plan->outb32[i + k] = y[2];
262-
}
263-
264-
#ifdef DEBUG_DUMP_TO_FILE
265-
for (i = 0; i < plan->total_size; i++)
266-
fprintf(fh4, "%d %d\n", plan->outb32[i].real, plan->outb32[i].imag);
267-
#endif
268-
269-
/* shift back for IFFT */
270-
271-
/* TODO: Check if time shift method for IFFT is more efficient or more accurate
272-
* tmp = 1 / N * fft(X);
273-
* x = tmp([1 N:-1:2])
274-
*/
275-
if (ifft) {
276-
/*
277-
* no need to divide N as it is already done in the input side
278-
* for Q1.31 format. Instead, we need to multiply N to compensate
279-
* the shrink we did in the FFT transform.
280-
*/
281-
for (i = 0; i < plan->total_size; i++) {
282-
/* Need to negate imag part to match reference */
283-
plan->outb32[i].imag = -plan->outb32[i].imag;
284-
icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len,
285-
&plan->outb32[i]);
286-
plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3);
287-
plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3);
288-
}
289-
}
290-
291-
#ifdef DEBUG_DUMP_TO_FILE
292-
fclose(fh1); fclose(fh2); fclose(fh3); fclose(fh4);
293-
#endif
294-
}

src/math/fft/fft_multi_generic.c

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
// SPDX-License-Identifier: BSD-3-Clause
2+
//
3+
// Copyright(c) 2025-2026 Intel Corporation.
4+
//
5+
// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
6+
7+
#include <sof/audio/format.h>
8+
#include <sof/math/icomplex32.h>
9+
#include <sof/common.h>
10+
#include <sof/math/fft.h>
11+
#include <string.h>
12+
13+
#ifdef DEBUG_DUMP_TO_FILE
14+
#include <stdio.h>
15+
#endif
16+
17+
/* Twiddle factor tables defined in fft_multi.c via twiddle_3072_32.h */
18+
#define FFT_MULTI_TWIDDLE_SIZE 2048
19+
extern const int32_t multi_twiddle_real_32[];
20+
extern const int32_t multi_twiddle_imag_32[];
21+
22+
/* Constants for size 3 DFT */
23+
#define DFT3_COEFR -1073741824 /* int32(-0.5 * 2^31) */
24+
#define DFT3_COEFI 1859775393 /* int32(sqrt(3) / 2 * 2^31) */
25+
#define DFT3_SCALE 715827883 /* int32(1/3*2^31) */
26+
27+
#ifdef FFT_GENERIC
28+
29+
void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y)
30+
{
31+
const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI};
32+
const struct icomplex32 c1 = {DFT3_COEFR, DFT3_COEFI};
33+
struct icomplex32 x[3];
34+
struct icomplex32 p1, p2, sum;
35+
int i;
36+
37+
for (i = 0; i < 3; i++) {
38+
x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31);
39+
x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31);
40+
}
41+
42+
/*
43+
* | 1 1 1 |
44+
* c = | 1 c0 c1 | , x = [ x0 x1 x2 ]
45+
* | 1 c1 c0 |
46+
*
47+
* y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(2)
48+
* y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(2)
49+
* y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(2)
50+
*/
51+
52+
/* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */
53+
icomplex32_adds(&x[0], &x[1], &sum);
54+
icomplex32_adds(&x[2], &sum, &y[0]);
55+
56+
/* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */
57+
icomplex32_mul(&c0, &x[1], &p1);
58+
icomplex32_mul(&c1, &x[2], &p2);
59+
icomplex32_adds(&p1, &p2, &sum);
60+
icomplex32_adds(&x[0], &sum, &y[1]);
61+
62+
/* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */
63+
icomplex32_mul(&c1, &x[1], &p1);
64+
icomplex32_mul(&c0, &x[2], &p2);
65+
icomplex32_adds(&p1, &p2, &sum);
66+
icomplex32_adds(&x[0], &sum, &y[2]);
67+
}
68+
69+
void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft)
70+
{
71+
struct icomplex32 x[FFT_MULTI_COUNT_MAX];
72+
struct icomplex32 y[FFT_MULTI_COUNT_MAX];
73+
struct icomplex32 t, c;
74+
int i, j, k, m;
75+
76+
/* Handle 2^N FFT */
77+
if (plan->num_ffts == 1) {
78+
memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32));
79+
fft_execute_32(plan->fft_plan[0], ifft);
80+
return;
81+
}
82+
83+
#ifdef DEBUG_DUMP_TO_FILE
84+
FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w");
85+
FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w");
86+
FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w");
87+
FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w");
88+
#endif
89+
90+
/* convert to complex conjugate for IFFT */
91+
if (ifft) {
92+
for (i = 0; i < plan->total_size; i++)
93+
icomplex32_conj(&plan->inb32[i]);
94+
}
95+
96+
/* Copy input buffers */
97+
k = 0;
98+
for (i = 0; i < plan->fft_size; i++)
99+
for (j = 0; j < plan->num_ffts; j++)
100+
plan->tmp_i32[j][i] = plan->inb32[k++];
101+
102+
/* Clear output buffers and call individual FFTs*/
103+
for (j = 0; j < plan->num_ffts; j++) {
104+
memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32));
105+
fft_execute_32(plan->fft_plan[j], 0);
106+
}
107+
108+
#ifdef DEBUG_DUMP_TO_FILE
109+
for (j = 0; j < plan->num_ffts; j++)
110+
for (i = 0; i < plan->fft_size; i++)
111+
fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
112+
#endif
113+
114+
/* Multiply with twiddle factors */
115+
m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size;
116+
for (j = 1; j < plan->num_ffts; j++) {
117+
for (i = 0; i < plan->fft_size; i++) {
118+
c = plan->tmp_o32[j][i];
119+
k = j * i * m;
120+
t.real = multi_twiddle_real_32[k];
121+
t.imag = multi_twiddle_imag_32[k];
122+
// fprintf(fh3, "%d %d\n", t.real, t.imag);
123+
icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]);
124+
}
125+
}
126+
127+
#ifdef DEBUG_DUMP_TO_FILE
128+
for (j = 0; j < plan->num_ffts; j++)
129+
for (i = 0; i < plan->fft_size; i++)
130+
fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
131+
#endif
132+
133+
/* DFT of size 3 */
134+
j = plan->fft_size;
135+
k = 2 * plan->fft_size;
136+
for (i = 0; i < plan->fft_size; i++) {
137+
x[0] = plan->tmp_o32[0][i];
138+
x[1] = plan->tmp_o32[1][i];
139+
x[2] = plan->tmp_o32[2][i];
140+
dft3_32(x, y);
141+
plan->outb32[i] = y[0];
142+
plan->outb32[i + j] = y[1];
143+
plan->outb32[i + k] = y[2];
144+
}
145+
146+
#ifdef DEBUG_DUMP_TO_FILE
147+
for (i = 0; i < plan->total_size; i++)
148+
fprintf(fh4, "%d %d\n", plan->outb32[i].real, plan->outb32[i].imag);
149+
#endif
150+
151+
/* shift back for IFFT */
152+
153+
/* TODO: Check if time shift method for IFFT is more efficient or more accurate
154+
* tmp = 1 / N * fft(X);
155+
* x = tmp([1 N:-1:2])
156+
*/
157+
if (ifft) {
158+
/*
159+
* no need to divide N as it is already done in the input side
160+
* for Q1.31 format. Instead, we need to multiply N to compensate
161+
* the shrink we did in the FFT transform.
162+
*/
163+
for (i = 0; i < plan->total_size; i++) {
164+
/* Need to negate imag part to match reference */
165+
plan->outb32[i].imag = -plan->outb32[i].imag;
166+
icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len,
167+
&plan->outb32[i]);
168+
plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3);
169+
plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3);
170+
}
171+
}
172+
173+
#ifdef DEBUG_DUMP_TO_FILE
174+
fclose(fh1);
175+
fclose(fh2);
176+
fclose(fh3);
177+
fclose(fh4);
178+
#endif
179+
}
180+
181+
#endif /* FFT_GENERIC */

0 commit comments

Comments
 (0)