Skip to content

Commit

Permalink
Update and clarify some comments
Browse files Browse the repository at this point in the history
  • Loading branch information
kcat committed Oct 7, 2023
1 parent 393790d commit 5ed78a9
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 91 deletions.
8 changes: 4 additions & 4 deletions common/pffft.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,7 @@
#endif


/*
* vector support macros: the rest of the code is independant of
/* Vector support macros: the rest of the code is independent of
* SSE/Altivec/NEON -- adding support for other platforms with 4-element
* vectors should be limited to these macros
*/
Expand Down Expand Up @@ -248,7 +247,7 @@ typedef float v4sf;
#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0x3) == 0)
#endif

// shortcuts for complex multiplcations
// shortcuts for complex multiplications
#define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMADD(ai,br,tmp); } while(0)
#define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VMADD(ai,bi,ar); ai=VSUB(VMUL(ai,br),tmp); } while(0)
#ifndef SVMUL
Expand Down Expand Up @@ -1866,6 +1865,8 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo
void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
float scaling)
{
assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));

const int Ncvec = s->Ncvec;
const v4sf *RESTRICT va = reinterpret_cast<const v4sf*>(a);
const v4sf *RESTRICT vb = reinterpret_cast<const v4sf*>(b);
Expand All @@ -1892,7 +1893,6 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
#ifndef ZCONVOLVE_USING_INLINE_ASM
const v4sf vscal = LD_PS1(scaling);
#endif
assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
float ar1 = reinterpret_cast<const v4sf_union*>(va)[0].f[0];
float ai1 = reinterpret_cast<const v4sf_union*>(va)[1].f[0];
float br1 = reinterpret_cast<const v4sf_union*>(vb)[0].f[0];
Expand Down
176 changes: 89 additions & 87 deletions common/pffft.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,34 +45,36 @@
SOFTWARE.
*/

/*
PFFFT : a Pretty Fast FFT.
This is basically an adaptation of the single precision fftpack
(v4) as found on netlib taking advantage of SIMD instruction found
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
For architectures where no SIMD instruction is available, the code
falls back to a scalar version.
Restrictions:
- 1D transforms only, with 32-bit single precision.
- supports only transforms for inputs of length N of the form
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
144, 160, etc are all acceptable lengths). Performance is best for
128<=N<=8192.
- all (float*) pointers in the functions below are expected to
have an "simd-compatible" alignment, that is 16 bytes on x86 and
powerpc CPUs.
You can allocate such buffers with the functions
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
posix_memalign..)
*/
/* PFFFT : a Pretty Fast FFT.
*
* This is basically an adaptation of the single precision fftpack (v4) as
* found on netlib taking advantage of SIMD instructions found on CPUs such as
* Intel x86 (SSE1), PowerPC (Altivec), and Arm (NEON).
*
* For architectures where SIMD instructions aren't available, the code falls
* back to a scalar version.
*
* Restrictions:
*
* - 1D transforms only, with 32-bit single precision.
*
* - supports only transforms for inputs of length N of the form
* N=(2^a)*(3^b)*(5^c), given a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, 144,
* 160, etc are all acceptable lengths). Performance is best for 128<=N<=8192.
*
* - all (float*) pointers for the functions below are expected to have a
* "SIMD-compatible" alignment, that is 16 bytes.
*
* You can allocate such buffers with the pffft_aligned_malloc function, and
* deallocate them with pffft_aligned_free (or with stuff like posix_memalign,
* aligned_alloc, etc).
*
* Note that for the z-domain data of real transforms, when in the canonical
* order (as interleaved complex numbers) both 0-frequency and half-frequency
* components, which are real, are assembled in the first entry as
* F(0)+i*F(n/2+1). The original fftpack placed F(n/2+1) at the end of the
* arrays instead.
*/

#ifndef PFFFT_H
#define PFFFT_H
Expand Down Expand Up @@ -100,77 +102,77 @@ typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;

#endif

/*
prepare for performing transforms of size N -- the returned
PFFFT_Setup structure is read-only so it can safely be shared by
multiple concurrent threads.
*/
/**
* Prepare for performing transforms of size N -- the returned PFFFT_Setup
* structure is read-only so it can safely be shared by multiple concurrent
* threads.
*/
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
void pffft_destroy_setup(PFFFT_Setup *);
/*
Perform a Fourier transform , The z-domain data is stored in the
most efficient order for transforming it back, or using it for
convolution. If you need to have its content sorted in the
"usual" way, that is as an array of interleaved complex numbers,
either use pffft_transform_ordered , or call pffft_zreorder after
the forward fft, and before the backward fft.
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
Typically you will want to scale the backward transform by 1/N.
The 'work' pointer must point to an area of N (2*N for complex
fft) floats, properly aligned. It cannot be NULL.
input and output may alias.
*/
void pffft_destroy_setup(PFFFT_Setup *setup);

/**
* Perform a Fourier transform. The z-domain data is stored in the most
* efficient order for transforming back or using for convolution, and as
* such, there's no guarantee to the order of the values. If you need to have
* its content sorted in the usual way, that is as an array of interleaved
* complex numbers, either use pffft_transform_ordered, or call pffft_zreorder
* after the forward fft and before the backward fft.
*
* Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. Typically
* you will want to scale the backward transform by 1/N.
*
* The 'work' pointer must point to an area of N (2*N for complex fft) floats,
* properly aligned. It cannot be NULL.
*
* The input and output parameters may alias.
*/
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);

/*
Similar to pffft_transform, but makes sure that the output is
ordered as expected (interleaved complex numbers). This is
similar to calling pffft_transform and then pffft_zreorder.
input and output may alias.
*/
/**
* Similar to pffft_transform, but handles the complex values in the usual form
* (interleaved complex numbers). This is similar to calling
* pffft_transform(..., PFFFT_FORWARD) followed by
* pffft_zreorder(..., PFFFT_FORWARD), or
* pffft_zreorder(..., PFFFT_BACKWARD) followed by
* pffft_transform(..., PFFFT_BACKWARD), for the given direction.
*
* The input and output parameters may alias.
*/
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);

/*
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
PFFFT_FORWARD) if you want to have the frequency components in
the correct "canonical" order, as interleaved complex numbers.
(for real transforms, both 0-frequency and half frequency
components, which are real, are assembled in the first entry as
F(0)+i*F(n/2+1). Note that the original fftpack did place
F(n/2+1) at the end of the arrays).
input and output should not alias.
*/
/**
* Reorder the z-domain data. For PFFFT_FORWARD, it reorders from the internal
* representation to the "canonical" order (as interleaved complex numbers).
* For PFFFT_BACKWARD, it reorders from the canonical order to the internal
* order suitable for pffft_transform(..., PFFFT_BACKWARD) or
* pffft_zconvolve_accumulate.
*
* The input and output parameters should not alias.
*/
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);

/*
Perform a multiplication of the frequency components of dft_a and
dft_b and accumulate them into dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
*not* have been reordered with pffft_zreorder (otherwise just
perform the operation yourself as the dft coefs are stored as
interleaved complex numbers).
the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
*/
/**
* Perform a multiplication of the z-domain data in dft_a and dft_b and
* accumulate them into dft_ab. The arrays should have been obtained with
* pffft_transform(..., PFFFT_FORWARD) or pffft_zreorder(..., PFFFT_BACKWARD)
* and should *not* be in the usual order (otherwise just perform the operation
* yourself as the dft coeffs are stored as interleaved complex numbers).
*
* The operation performed is: dft_ab += (dft_a * dft_b)*scaling
*
* The dft_a, dft_b, and dft_ab parameters may alias.
*/
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);

/*
the float buffers must have the correct alignment (16-byte boundary
on intel and powerpc). This function may be used to obtain such
correctly aligned buffers.
*/
/**
* The float buffers must have the correct alignment (16-byte boundary on intel
* and powerpc). This function may be used to obtain such correctly aligned
* buffers.
*/
void *pffft_aligned_malloc(size_t nb_bytes);
void pffft_aligned_free(void *);

/* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
/* Return 4 or 1 depending if vectorization was enable when building pffft.cpp. */
int pffft_simd_size();

#ifdef __cplusplus
Expand Down

0 comments on commit 5ed78a9

Please sign in to comment.