Bela
Real-time, ultra-low-latency audio and sensor processing system for BeagleBone Black
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups
QuadBiquad.h
1 #pragma once
2 #include <array>
3 #include <arm_neon.h>
4 #include <new>
5 #include <stdlib.h>
6 #include "Biquad.h"
7 #include <stdio.h>
8 
16 {
17 public:
18  static unsigned int constexpr kNumFilters = 4; // number of filters handled per call; process() takes this many samples
// One coefficient object per filter.
// NOTE(review): presumably update() transfers these coefficients into the
// NEON registers read by process() - confirm against Biquad.cpp.
26  std::array<BiquadCoeffT<float>, kNumFilters> filters;
27 
37  {
38  if(size_t(this) & size_t(alignof(QuadBiquad) - 1))
39  {
40  fprintf(stderr, "QuadBiquad object is improperly aligned. Avoid heap allocation, use operator new or use -std=c++17\n");
41  std::bad_alloc e;
42  throw(e);
43  }
44  }
45 
// Class-specific allocation function: obtains heap storage for a
// QuadBiquad with the alignment the class requires (the constructor
// rejects objects that are not aligned to alignof(QuadBiquad)).
// sz: number of bytes requested by the new-expression.
// Returns the pointer obtained from aligned_alloc(); throws
// std::bad_alloc if that call returns a null pointer.
49  void* operator new(size_t sz) {
50  void* const storage = aligned_alloc(alignof(QuadBiquad), sz);
51  if(nullptr != storage)
52  {
53  return storage;
54  }
55  // aligned_alloc() could not provide the memory: signal it as an allocation failure
56  throw std::bad_alloc();
57  }
58 
65  int setup(const BiquadCoeff::Settings& settings);
66 
71  void update();
72 
// Runs all kNumFilters filters for one sample each, in a single call.
// data: kNumFilters consecutive floats, one input sample per filter. The
//       array is overwritten in place with the corresponding outputs.
// Every call also updates the filter state kept in z1 and z2.
79  void process(float data[kNumFilters])
80  {
81  // One input sample per filter is loaded into a single NEON vector. For the intrinsics used here, see
82  // https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
83  float32x4_t in = vld1q_f32(data);
84  // For each lane we need to do the following (see Biquad::process):
85  // out = in * a0 + z1; (A)
86  // z1 = in * a1 + z2 - b1 * out; split into two steps (B, C)
87  // z2 = in * a2 - b2 * out; split into two steps (D, E)
88  // store the output (F)
89  // These steps are interleaved below so that NEON does not stall while
90  // waiting for results.
91 
92  // A: out = in * a0 + z1; (vmlaq_f32(acc, x, y) computes acc + x * y per lane)
93  float32x4_t out = vmlaq_f32(z1, in, a0);
94  // B: z1 = in * a1 + z2; (reads the previous z2, so it has to come before D)
95  z1 = vmlaq_f32(z2, in, a1);
96  // D: z2 = in * a2;
97  z2 = vmulq_f32(in, a2);
98  // F: store the output back into data, replacing the input samples
99  vst1q_f32(data, out);
100  // C: z1 = z1 - b1 * out; ***
101  z1 = vmlaq_f32(z1, b1, out);
102  // E: z2 = z2 - b2 * out; ***
103  z2 = vmlaq_f32(z2, b2, out);
104  // ***: note that the sign of the b1 and b2 coefficients is inverted
105  // in update(), which is why vmlaq is used here instead of vmlsq. This is
106  // because vmlaq seems to be slightly faster than vmlsq.
107  }
108 private:
109  float32x4_t z1; // filter state added to in * a0 to form the output in process(); one lane per filter
110  float32x4_t z2; // filter state folded into z1 on the next process() call
111  float32x4_t a0; // multiplies the input when computing the output
112  float32x4_t a1; // multiplies the input when updating z1
113  float32x4_t a2; // multiplies the input when updating z2
114  float32x4_t b1; // multiplies the output when updating z1; per the note in process(), held with inverted sign
115  float32x4_t b2; // multiplies the output when updating z2; per the note in process(), held with inverted sign
116 };
117 extern template class BiquadCoeffT<float>;
int setup(const BiquadCoeff::Settings &settings)
Definition: Biquad.cpp:122
Definition: Biquad.h:34
Definition: Biquad.h:47
void update()
Definition: Biquad.cpp:132
QuadBiquad()
Definition: QuadBiquad.h:36
Definition: QuadBiquad.h:15
void process(float data[kNumFilters])
Definition: QuadBiquad.h:79
std::array< BiquadCoeffT< float >, kNumFilters > filters
Definition: QuadBiquad.h:26