From 112f3b50f0fe132b2d59f8ce8f2c76af23fc25e9 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sat, 18 Nov 2023 09:37:58 +1030 Subject: kicking off Codec 2 documentation --- doc/codec2_refs.bib | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 doc/codec2_refs.bib (limited to 'doc/codec2_refs.bib') diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib new file mode 100644 index 0000000..e578439 --- /dev/null +++ b/doc/codec2_refs.bib @@ -0,0 +1,24 @@ +@article{griffin1988multiband, + title={Multiband excitation vocoder}, + author={Griffin, Daniel W and Lim, Jae S}, + journal={IEEE Transactions on acoustics, speech, and signal processing}, + volume={36}, + number={8}, + pages={1223--1235}, + year={1988}, + publisher={IEEE} +} +@book{rowe1997techniques, + title={Techniques for harmonic sinusoidal coding}, + author={Rowe, David Grant}, + year={1997}, + publisher={Citeseer}, + note = {\url{https://www.rowetel.com/downloads/1997_rowe_phd_thesis.pdf}} +} + +@misc{ardc2023, + title = {{Enhancing HF Digital Voice with FreeDV}}, + year = {2023}, + note = {\url{https://www.ardc.net/apply/grants/2023-grants/enhancing-hf-digital-voice-with-freedv/}} +} + -- cgit v1.2.3 From 4d2492dcd994fc9465e0ef072976d70ca3a0f155 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Thu, 23 Nov 2023 07:04:45 +1030 Subject: building up detailed design intro --- doc/codec2.pdf | Bin 141380 -> 157596 bytes doc/codec2.tex | 137 +++++++++++++++++++++++++++++++++++++++++++++------- doc/codec2_refs.bib | 10 ++++ 3 files changed, 129 insertions(+), 18 deletions(-) (limited to 'doc/codec2_refs.bib') diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 8220e85..fba5127 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 7fea6d2..6bb67a0 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -43,7 +43,7 @@ Codec 2 is an open source speech codec designed for communications quality speec The Codec 2 project was started in 2009 in response to the problem of closed source, patented, proprietary voice codecs in the sub-5 kbit/s range, in particular for use in the Amateur Radio service. -This document describes Codec 2 at two levels. Section \ref{sect:overview} is a high level overview aimed at the Radio Amateur, while Section \ref{sect:details} contains a more detailed description with math and signal processing theory. This document is not a concise algorithmic description, instead the algorithm is defined by the reference C99 source code and automated tests (ctests). +This document describes Codec 2 at two levels. Section \ref{sect:overview} is a high level overview aimed at the Radio Amateur, while Section \ref{sect:details} contains a more detailed description with math and signal processing theory. Combined with the C source code, it is intended to give the reader enough information to understand the operation of Codec 2 in detail and embark on source code level projects, such as improvements, ports to other languages, student or academic research projects. Issues with the current algorithms and topics for further work are also included. This production of this document was kindly supported by an ARDC grant \cite{ardc2023}. As an open source project, many people have contributed to Codec 2 over the years - we deeply appreciate all of your support. @@ -52,7 +52,7 @@ This production of this document was kindly supported by an ARDC grant \cite{ard \subsection{Model Based Speech Coding} -A speech codec takes speech samples from an A/D converter (e.g. 
16 bit samples at an 8 kHz or 128 kbits/s) and compresses them down to a low bit rate that can be more easily sent over a narrow bandwidth channel (e.g. 700 bits/s for HF). Speech coding is the art of "what can we throw away". We need to lower the bit rate of the speech while retaining speech you can understand, and making it sound as natural as possible. +A speech codec takes speech samples from an A/D converter (e.g. 16 bit samples at 8 kHz or 128 kbits/s) and compresses them down to a low bit rate that can be more easily sent over a narrow bandwidth channel (e.g. 700 bits/s for HF). Speech coding is the art of "what can we throw away". We need to lower the bit rate of the speech while retaining speech you can understand, and making it sound as natural as possible. As such low bit rates we use a speech production ``model". The input speech is anlaysed, and we extract model parameters, which are then sent over the channel. An example of a model based parameter is the pitch of the person speaking. We estimate the pitch of the speaker, quantise it to a 7 bit number, and send that over the channel every 20ms. @@ -83,7 +83,7 @@ Note that each harmonic has it's own amplitude, that varies across frequency. T A sinewave will cause a spike or spectral line on a spectrum plot, so we can see each spike as a small sine wave generator. Each sine wave generator has it's own frequency that are all multiples of the fundamental pitch frequency (e.g. $230, 460, 690,...$ Hz). They will also have their own amplitude and phase. If we add all the sine waves together (Figure \ref{fig:sinusoidal_model}) we can produce reasonable quality synthesised speech. This is called sinusoidal speech coding and is the speech production ``model" at the heart of Codec 2. \begin{figure}[h] -\caption{The sinusoidal speech model. If we sum a series of sine waves, we can generate a speech signal. Each sinewave has it's own amplitude ($A_1,A_2,... A_L$), frequency, and phase (not shown). We assume the frequencies are multiples of the fundamental frequency $F_0$. $L$ is the total number of sinewaves.} +\caption{The sinusoidal speech model. If we sum a series of sine waves, we can generate a speech signal. Each sinewave has it's own amplitude ($A_1,A_2,... A_L$), frequency, and phase (not shown). We assume the frequencies are multiples of the fundamental frequency $F_0$. $L$ is the total number of sinewaves we can fit in 4kHz.} \label{fig:sinusoidal_model} \begin{center} \begin{tikzpicture}[>=triangle 45,x=1.0cm,y=1.0cm] @@ -114,7 +114,7 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see \draw [->] (0.45,0.7) -- (2.05,1.8); \draw [->] (0.3,-2.1) -- (2.2,1.6); -% output speec +% output speech \draw [->] (3,2) -- (4,2); \draw [xshift=4.2cm,yshift=2cm,color=blue] plot[smooth] file {hts2a_37_sn.txt}; @@ -122,33 +122,35 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see \end{center} \end{figure} -The model parameters evolve over time, but can generally be considered constant for short snap shots in time (a few 10s of ms). For example pitch evolves time, moving up or down as a word is articulated. +The model parameters evolve over time, but can generally be considered constant for short time window (a few 10s of ms). For example pitch evolves over time, moving up or down as a word is articulated. As the model parameters change over time, we need to keep updating them. 
This is known as the \emph{frame rate} of the codec, which can be expressed in terms of frequency (Hz) or time (ms). For sampling model parameters Codec 2 uses a frame rate of 10ms. For transmission over the channel we reduce this to 20-40ms, in order to lower the bit rate. The trade off with a lower frame rate is reduced speech quality. The parameters of the sinusoidal model are: \begin{enumerate} -\item The frequency of each sine wave. As they are all harmonics of $F_0$ we can just send $F_0$ to the decoder, and it can reconstruct the frequency of each harmonic as $F_0,2F_0,3F_0,...,LF_0$. We used 5-7 bits/frame to represent the $F_0$ in Codec 2. -\item The magnitude of each sine wave, $A_1,A_2,...,A_L$. These ``spectral magnitudes" are really important as they convey the information the ear needs to understand speech. Most of the bits are used for spectral magnitude information. Codec 2 uses between 20 and 36 bits/frame for spectral amplitude information. -\item Voicing information. Speech can be approximated into voiced speech (vowels) and unvoiced speech (like consonants), or some mixture of the two. The example in Figure \ref{fig:hts2a_time} above is for voiced speech. So we need some way to describe voicing to the decoder. This requires just a few bits/frame. -\item The phase of each sine wave Codec 2 discards the phases of each harmonic and reconstruct them at the decoder using an algorithm, so no bits are required for phases. This results in some drop in speech quality. +\item The frequency of each sine wave. As they are all harmonics of $F_0$ we can just send $F_0$ to the decoder, and it can reconstruct the frequency of each harmonic as $F_0,2F_0,3F_0,...,LF_0$. We used 5-7 bits/frame to represent $F_0$ in Codec 2. +\item The amplitude of each sine wave, $A_1,A_2,...,A_L$. These ``spectral amplitudes" are really important as they convey the information the ear needs to understand speech. Most of the bits are used for spectral amplitude information. Codec 2 uses between 18 and 50 bits/frame for spectral amplitude information. +\item Voicing information. Speech can be approximated into voiced speech (vowels) and unvoiced speech (like consonants), or some mixture of the two. The example in Figure \ref{fig:hts2a_time} above is voiced speech. So we need some way to describe voicing to the decoder. This requires just a few bits/frame. +\item The phase of each sine wave. Codec 2 discards the phases of each harmonic at the encoder and reconstruct them at the decoder using an algorithm, so no bits are required for phases. This results in some drop in speech quality. \end{enumerate} -\subsection{Codec 2 Block Diagram} +\subsection{Codec 2 Encoder and Decoder} + +This section explains how the Codec 2 encoder and decoder works using block diagrams. 
\begin{figure}[h] -\caption{Codec 2 Encoder.} +\caption{Codec 2 Encoder} \label{fig:codec2_encoder} \begin{center} \begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm,align=center,text width=2cm] \node [input] (rinput) {}; -\node [input, right of=rinput,node distance=1cm] (z) {}; -\node [block, right of=z] (pitch_est) {Pitch Estimator}; +\node [input, right of=rinput,node distance=0.5cm] (z) {}; +\node [block, right of=z,node distance=1.5cm] (pitch_est) {Pitch Estimator}; \node [block, below of=pitch_est] (fft) {FFT}; \node [block, right of=fft,node distance=3cm] (est_am) {Estimate Amplitudes}; \node [block, below of=est_am] (est_v) {Estimate Voicing}; -\node [block, right of=est_am,node distance=3cm] (quant) {Quantise}; +\node [block, right of=est_am,node distance=3cm] (quant) {Decimate Quantise}; \node [output, right of=quant,node distance=2cm] (routput) {}; \draw [->] node[align=left] {Input Speech} (rinput) -- (pitch_est); @@ -159,25 +161,124 @@ The parameters of the sinusoidal model are: \draw [->] (pitch_est) -| (quant); \draw [->] (est_am) -- (quant); \draw [->] (est_v) -| (quant); -\draw [->] (est_v) -| (quant); +\draw [->] (est_v) -| (quant); \draw [->] (quant) -- (routput) node[right, align=left, text width=1.5cm] {Bit Stream}; \end{tikzpicture} \end{center} \end{figure} +The encoder is presented in Figure \ref{fig:codec2_encoder}. Frames of input speech samples are passed to a Fast Fourier Transform (FFT), which converts the time domain samples to the frequency domain. The same frame of input samples is used to estimate the pitch of the current frame. We then use the pitch and frequency domain speech to estimate the amplitude of each sine wave. + +Yet another algorithm is used to determine if the frame is voiced or unvoiced. This works by comparing the spectrum to what we would expect for voiced speech (e.g. lots of spectral lines). If the energy is more random and continuous rather than discrete lines, we consider it unvoiced. + +Up until this point the processing happens at a 10ms frame rate. However in the next step we ``decimate`` the model parameters - this means we discard some of the model parameters to lower the frame rate, which helps us lower the bit rate. Decimating to 20ms (throwing away every 2nd set of model parameters) doesn't have much effect, but beyond that the speech quality starts to degrade. So there is a trade off between decimation rate and bit rate over the channel. + +Once we have the desired frame rate, we ``quantise"" each model parameter. This means we use a fixed number of bits to represent it, so we can send the bits over the channel. Parameters like pitch and voicing are fairly easy, but quite a bit of DSP goes into quantising the spectral amplitudes. For the higher bit rate Codec 2 modes, we design a filter that matches the spectral amplitudes, then send a quantised version of the filter over the channel. Using the example in Figure \ref{fig:hts2a_time} - the filter would have a band pass peaks at 500 and 2300 Hz. It's frequency response would follow the red line. The filter is time varying - we redesign it for every frame. + +You'll notice the term "estimate" being used a lot. One of the problems with model based speech coding is the algorithms we use to extract the model parameters are not perfect. Occasionally the algorithms get it wrong. Look at the red crosses on the bottom plot of Figure \ref{fig:hts2a_time}. These mark the amplitude estimate of each harmonic. 
If you look carefully, you'll see that above 2000Hz, the crosses fall a little short of the exact centre of each harmonic. This is an example of a ``fine" pitch estimator error, a little off the correct value. + +Often the errors interact, for example the fine pitch error shown above will mean the amplitude estimates are a little bit off as well. Fortunately these errors tend to be temporary, and are sometimes not even noticeable to the listener - remember this codec is often used for HF/VHF radio where channel noise is part of the normal experience. + +\begin{figure}[h] +\caption{Codec 2 Decoder} +\label{fig:codec2_decoder} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm,align=center,text width=2cm] + +\node [input] (rinput) {}; +\node [block, right of=rinput,node distance=2cm] (dequantise) {Dequantise Interpolate}; +\node [block, right of=dequantise,node distance=3cm] (recover) {Recover Amplitudes}; +\node [block, right of=recover,node distance=3cm] (synthesise) {Synthesise Speech}; +\node [block, above of=synthesise] (phase) {Synthesise Phases}; +\node [output, right of=synthesise,node distance=2cm] (routput) {}; + +\draw [->] node[align=left, text width=1.5cm] {Bit Stream} (rinput) -- (dequantise); +\draw [->] (dequantise) -- (recover); +\draw [->] (recover) -- (synthesise); +\draw [->] (recover) |- (phase); +\draw [->] (phase) -- (synthesise); +\draw [->] (synthesise) -- (routput) node[right, align=left, text width=1.5cm] {Output Speech}; + +\end{tikzpicture} +\end{center} +\end{figure} + +Figure \ref{fig:codec2_decoder} shows the operation of the Codec 2 decoder. We take the sequence of bits received from the channel and recover the quantised model parameters, pitch, spectral amplitudes, and voicing. We then resample the model parameters back up to the 10ms frame rate using a technique called interpolation. For example say we receive a $F_0=200$ Hz pitch value then 20ms later $F_0=220$ Hz. We can use the average $F_0=210$ Hz for the middle 10ms frame. + +The phases of each harmonic are generated using the other model parameters and some DSP. It turns out that if you know the amplitude spectrum, you can determine a ``reasonable" phase spectrum using some DSP operations, which in practice is implemented with a couple of FFTs. We also use the voicing information - for unvoiced speech we use random phases (a good way to synthesise noise-like signals) - and for voiced speech we make sure the phases are chosen so the synthesised speech transitions smoothly from one frame to the next. + +Frames of speech are synthesised using an inverse FFT. We take a blank array of FFT samples, and at intervals of $F_0$ insert samples with the amplitude and phase for each harmonic. We then inverse FFT to create a frame of time domain samples. These frames of synthesised speech samples are carefully aligned with the previous frame to ensure smooth frame-frame transitions, and output to the listener. + \subsection{Bit Allocation} -\section{Signal Processing Details} +Table \ref{tab:bit_allocation} presents the bit allocation for two popular Codec 2 modes. One additional parameter is the frame energy, this is the average level of the spectral amplitudes, or ``AF gain" of the speech frame. + +At very low bit rates such as 700C, we use Vector Quantisation (VQ) to represent the spectral amplitudes. We construct a table such that each row of the table has a set of spectral amplitude samples. In Codec 2 700C the table has 512 rows. 
During the quantisation process, we choose the table row that best matches the spectral amplitudes for this frame, then send the \emph{index} of the table row. The decoder has a similar table, so can use the index to look up the output values. If the table is 512 rows, we can use a 9 bit number to quantise the spectral amplitudes. In Codec 2 700C, we use two tables of 512 entries each (18 bits total), the second one helps fine tune the quantisation from the first table. + +Vector Quantisation can only represent what is present in the tables, so if it sees anything unusual (for example a different microphone frequency response or background noise), the quantisation can become very rough and speech quality poor. We train the tables at design time using a database of speech samples and a training algorithm - an early form of machine learning. + +Codec 2 3200 uses the method of fitting a filter to the spectral amplitudes, this approach tends to be more forgiving of small variations in the input speech spectrum, but is not as efficient in terms of bit rate. + +\begin{table}[H] +\label{tab:bit_allocation} +\centering +\begin{tabular}{l c c } +\hline +Parameter & 3200 & 700C \\ +\hline +Pitch $F_0$ & 7 & 5 \\ +Spectral Amplitudes $\{A_m\}$ & 50 & 18 \\ +Energy & 5 & 3 \\ +Voicing & 2 & 1 \\ +Bits/frame & 64 & 28 \\ +Frame Rate & 20ms & 40ms \\ +Bit rate & 3200 & 700 \\ +\hline +\end{tabular} +\caption{Bit allocation of the 3200 and 700C modes} +\end{table} + +\section{Detailed Design} \label{sect:details} -\cite{griffin1988multiband} +Codec 2 is based on sinusoidal \cite{mcaulay1986speech} and Multi-Band Excitation (MBE) \cite{griffin1988multiband} vocoders that were first developed in the late 1980s. Descendants of the MBE vocoders (IMBE, AMBE etc) have enjoyed widespread use in many applications such as VHF/UHF hand held radios and satellite communications. In the 1990s the author studied sinusoidal speech coding \cite{rowe1997techniques}, which provided the skill set and a practical, patent free baseline for starting the Codec 2 project: + +Some features of Codec 2: +\begin{enumerate} +\item A range of modes supporting different bit rates, currently (Nov 2023): 3200, 2400, 1600, 1400, 1300, 1200, 700 bits/s. These are referred to as ``Codec 2 3200", ``Codec 700C"" etc. +\item Modest CPU (a few 10s of MIPs) and memory (a few 10s of kbytes of RAM) requirements such that it can run on stm32 class microcontrollers with hardware FPU. +\item An open source reference implementation in the C language for C99/gcc compilers, and a \emph{cmake} build and test framework that runs on Linux. Also included is a cross compiled stm32 reference implementation. +\item Ports to non-C99 compilers (e.g. MSVC, some microcontrollers, native builds on Windows) are left to third party developers - we recommend the tests also be ported and pass before considering the port successful. +\item Codec 2 has been designed for digital voice over radio applications, and retains intelligible speech at a few percent bit error rate. +\item A suite of automated tests used to verify the implementation. +\item A pitch estimator based on a 2nd order non-linearity developed by the author. +\item A single voiced/unvoiced binary voicing model. +\item A frequency domain IFFT/overlap-add synthesis model for voiced and unvoiced speech speech. +\item For the higher bit rate modes, spectral amplitudes are represented using LPCs extracted from time domain analysis and scalar LSP quantisation. 
+\item For Codec 2 700C, vector quantisation of resampled spectral amplitudes in the log domain. +\item Minimal interframe prediction in order to minimise error propagation and maximise robustness to channel errors. +\item A post filter that enhances the speech quality of the baseline codec, especially for low pitched (male) speakers. +\end{enumerate} + +\subsection{Non-Linear Pitch Estimation} + +The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is described in detail in chapter 4 of \cite{rowe1997techniques}. There is nothing particularly unique about this pitch estimator or it's performance. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements. This section presents an overview of the NLP algorithm extracted from \cite{rowe1997techniques}. + + +\subsection{Sinusoidal Analysis and Synthesis} + +\subsection{LPC/LSP based modes} + +\subsection{Codec 2 700C} \section{Further Work} \begin{enumerate} -\item Using c2sim to extract and plot model parameters +\item some examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters \item How to use tools to single step through codec operation +\item table summarising source files with one line description +\item Add doc license (Creative Commons?) \end{enumerate} diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index e578439..7348902 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -22,3 +22,13 @@ note = {\url{https://www.ardc.net/apply/grants/2023-grants/enhancing-hf-digital-voice-with-freedv/}} } +@article{mcaulay1986speech, + title={Speech analysis/synthesis based on a sinusoidal representation}, + author={McAulay, Robert and Quatieri, Thomas}, + journal={IEEE Transactions on Acoustics, Speech, and Signal Processing}, + volume={34}, + number={4}, + pages={744--754}, + year={1986}, + publisher={IEEE} +} -- cgit v1.2.3 From 067eaa7998240509c89e32b849654e8f347bed24 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Fri, 1 Dec 2023 11:32:39 +1030 Subject: LPC/LSP enocder description, decoder block diagram --- doc/codec2.pdf | Bin 237912 -> 243348 bytes doc/codec2.tex | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++-- doc/codec2_refs.bib | 11 ++++++++++ 3 files changed, 70 insertions(+), 2 deletions(-) (limited to 'doc/codec2_refs.bib') diff --git a/doc/codec2.pdf b/doc/codec2.pdf index b1a1a34..e718b7f 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 6047a3c..62176e7 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -474,6 +474,7 @@ The DFT power spectrum of the squared signal $F_w(k)$ generally contains several The accuracy of the pitch estimate in then refined by maximising the function: \begin{equation} +\label{eq:pitch_refinement} E(\omega_0)=\sum_{m=1}^L|S_w(\lfloor r m \rceil)|^2 \end{equation} where $r=\omega_0 N_{dft}/2 \pi$ maps the harmonic number $m$ to a DFT bin. This function will be maximised when $m \omega_0$ aligns with the peak of each harmonic, corresponding with an accurate pitch estimate. It is evaluated in a small range about the coarse $F_0$ estimate. 
@@ -508,6 +509,7 @@ E_m &= \sum_{k=a_m}^{b_m-1} |S_w(k) - \hat{S}_w(k)|^2 \\ \end{equation} A Signal to Noise Ratio (SNR) ratio is defined as: \begin{equation} +\label{eq:voicing_snr} SNR = \sum_{m=1}^{m_{1000}} \frac{A^2_m}{E_m} \end{equation} where $m_{1000}= \lfloor L/4 \rceil$ is the band closest to 1000 Hz, and $\{A_m\}$ are computed from (\ref{eq:mag_est}). If the energy in the bands up to 1000 Hz is a good match to a harmonic series of sinusoids then $\hat{S}_w(k) \approx S_w(k)$ and $E_m$ will be small compared to the energy in the band resulting in a high SNR. Voicing is declared using the following rule: @@ -580,6 +582,12 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \subsection{LPC/LSP based modes} \label{sect:mode_lpc_lsp} +In this and the next section we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pair (LSP) model to quantise and transmit the spectral magnitude information over the channel. There is a great deal of material on the topics of linear prediction and LSPs, so they will not be explained here. An excellent reference for LPCs is \cite{makhoul1975linear}. + +Figure \ref{fig:encoder_lpc_lsp} presents the encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). The LPC analysis extracts $p=10$ LPC coefficients $\{a_k\}, k=1..10$ and the LPC energy $E$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{f_k\}, k=1..10$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. + +Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still required for voicing estimation (\ref{eq:voicing_snr}). + \begin{figure}[h] \caption{LPC/LSP Modes Encoder} \label{fig:encoder_lpc_lsp} @@ -593,10 +601,12 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \node [block, right of=z1,node distance=1.5cm] (dft) {DFT}; \node [block, above of=dft,text width=2cm] (lpc) {LPC Analysis}; \node [block, right of=lpc,node distance=3cm,text width=2cm] (lsp) {LSP Quantisation}; +\node [tmp, right of=nlp,node distance=1cm] (z2) {}; +\node [tmp, above of=z2,node distance=1cm] (z3) {}; \node [block, below of=dft,text width=2cm] (est) {Est Amp}; \node [block, right of=est,node distance=3cm,text width=2cm] (voicing) {Est Voicing}; \node [block, below of=window] (nlp) {NLP}; -\node [block, below of=lsp,text width=2cm] (pack) {Bit Packing}; +\node [block, below of=lsp,text width=2.5cm] (pack) {Decimation \&\\Bit Packing}; \node [output, right of=pack,node distance=2cm] (routput) {}; \draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); @@ -607,6 +617,7 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \draw [->] (lsp) -- (pack); \draw [->] (dft) -- (est); \draw [->] (nlp) -- (est); +\draw [->] (z2) -- (z3) -| (pack); \draw [->] (est) -- (voicing); \draw [->] (voicing) -- (pack); \draw [->] (pack) -- (routput) node[right,align=left,text width=1.5cm] {Bit Stream}; @@ -615,13 +626,59 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \end{center} \end{figure} -Block diagram of LPC/LSP mode encoder and decoder. Walk through operation. Decimation and interpolation. 
+One of the problems with quantising spectral magnitudes in sinusoidal codecs is the time varying number of harmonic magnitudes, as $L=\pi/\omega_0$, and $\omega_0$ varies from frame to frame. As we require a fixed bit rate for our uses cases, it is desirable to have a fixed number of parameters. Using a fixed order LPC model is a neat solution to this problem. Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly. The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope. + +In CELP codecs these problems can be accommodated by the (high bit rate) excitation, and some low rate codecs such as MELP supply supplementary low frequency information to ``correct" the LPC model. + +Before bit packing, the Codec 2 parameters are decimated in time. An update rate of 20ms is used for the highest rate modes, which drops to 40ms for Codec 2 1300, with a corresponding drop in speech quality. The number of bits used to quantise the LPC model via LSPs is also reduced in the lower bit rate modes. This has the effect of making the speech less intelligible, and can introduce annoying buzzy or clicky artefacts into the synthesised speech. Lower fidelity spectral magnitude quantisation also results in more noticeable artefacts from phase synthesis. Neverthless at 1300 bits/s the speech quality is quite usable for HF digital voice, and at 3200 bits/s comparable to closed source codecs at the same bit rate. + +TODO: table of LPC/LSP modes, frame rate. Perhaps make this a table covering all modes. + +\begin{figure}[h] +\caption{LPC/LSP Modes Decoder} +\label{fig:decoder_lpc_lsp} +\begin{center} +\begin{tikzpicture}[auto, node distance=3cm,>=triangle 45,x=1.0cm,y=1.0cm,align=center] + +\node [input] (rinput) {}; +\node [block, right of=rinput,node distance=1.5cm] (unpack) {Unpack}; +\node [block, right of=unpack,node distance=2.5cm] (interp) {Interpolate}; +\node [block, right of=interp,text width=2cm] (lpc) {LSP to LPC}; +\node [block, right of=lpc,text width=2cm] (sample) {Sample $A_m$}; +\node [block, below of=sample,text width=2cm,node distance=2cm] (post) {Post Filter}; +\node [block, left of=post,text width=2.5cm] (synth) {Sinusoidal\\Synthesis}; +\node [output, left of=synth,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {Bit\\Stream} (rinput) -- (unpack); +\draw [->] (unpack) -- (interp); +\draw [->] (interp) -- (lpc); +\draw [->] (lpc) -- (sample); +\draw [->] (sample) -- (post); +\draw [->] (post) -- (synth); +\draw [->] (synth) -- (routput) node[align=left,text width=1.5cm] {$\hat{s}(n)$}; +%\draw [->] (dft) -- (est); +%\draw [->] (nlp) -- (est); +%\draw [->] (z2) -- (z3) -| (pack); +%\draw [->] (est) -- (voicing); +%\draw [->] (voicing) -- (pack); +%\draw [->] (pack) -- (routput) node[right,align=left,text width=1.5cm] {Bit Stream}; + +\end{tikzpicture} +\end{center} +\end{figure} + +TODO expression for linear interpolation. Interpolation in LSP domain. Ear protection. \subsection{Codec 2 700C} \label{sect:mode_newamp1} +Microphone equaliser +ratek study + \section{Further Work} +Summary of mysteries/interesting points drawn out above. + \begin{enumerate} \item Some worked examples aimed at the experimenter - e.g. 
using c2sim to extract and plot model parameters. Listen to various phases of quantisation. \item How to use Octave tools to single step through codec operation diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index 7348902..039accc 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -32,3 +32,14 @@ year={1986}, publisher={IEEE} } + +@article{makhoul1975linear, + title={Linear prediction: A tutorial review}, + author={Makhoul, John}, + journal={Proceedings of the IEEE}, + volume={63}, + number={4}, + pages={561--580}, + year={1975}, + publisher={IEEE} +} -- cgit v1.2.3 From 43defe5bbed510a8fdbb7c8f0fba155d3238b084 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 3 Dec 2023 07:11:51 +1030 Subject: decoder description, mode table --- doc/codec2.pdf | Bin 243348 -> 250500 bytes doc/codec2.tex | 80 ++++++++++++++++++++++++++++++++++++++++++---------- doc/codec2_refs.bib | 11 ++++++++ 3 files changed, 76 insertions(+), 15 deletions(-) (limited to 'doc/codec2_refs.bib') diff --git a/doc/codec2.pdf b/doc/codec2.pdf index e718b7f..7fcb0ff 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 62176e7..612c196 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -527,6 +527,7 @@ In Codec 2 the harmonic phases $\{\theta_m\}$ are not transmitted, instead they Consider the source-filter model of speech production: \begin{equation} +\label{eq:source_filter} \hat{S}(z)=E(z)H(z) \end{equation} where $E(z)$ is an excitation signal with a relatively flat spectrum, and $H(z)$ is a synthesis filter that shapes the magnitude spectrum. The phase of each harmonic is the sum of the excitation and synthesis filter phase: @@ -582,11 +583,28 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \subsection{LPC/LSP based modes} \label{sect:mode_lpc_lsp} -In this and the next section we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pair (LSP) model to quantise and transmit the spectral magnitude information over the channel. There is a great deal of material on the topics of linear prediction and LSPs, so they will not be explained here. An excellent reference for LPCs is \cite{makhoul1975linear}. +In this and the next section we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pairs (LSPs) to quantise and transmit the spectral magnitude information. There is a great deal of information available on these techniques so they are only briefly described here. -Figure \ref{fig:encoder_lpc_lsp} presents the encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). The LPC analysis extracts $p=10$ LPC coefficients $\{a_k\}, k=1..10$ and the LPC energy $E$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{f_k\}, k=1..10$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. +The source-filter model of speech production was introduced above in Equation (\ref{eq:source_filter}). A relatively flat excitation source $E(z)$ excites a filter $(H(z)$ which models the magnitude spectrum. 
Linear Predictive Coding (LPC) defines $H(z)$ as an all pole filter: +\begin{equation} +H(z) = \frac{G}{1-\sum_{k=1}^p a_k z^{-k}} = \frac{G}{A(z)} +\end{equation} +where $\{a_k\}, k=1..10$ is a set of p linear prediction coefficients that characterise the filter's frequency response and G is a scalar gain factor. An excellent reference for LPC is \cite{makhoul1975linear}. + +To be useful in low bit rate speech coding it is necessary to quantise and transmit the LPC coefficients using a small number of bits. Direct quantisation of these LPC coefficients is inappropriate due to their large dynamic range (8-10 bits/coefficient). Thus for transmission purposes, especially at low bit rates, other forms such as the Line Spectral Pair (LSP) \cite{itakura1975line} frequencies are used to represent the LPC parameters. The LSP frequencies can be derived by decomposing the $p$-th order polynomial $A(z)$, into symmetric and anti-symmetric polynomials $P(z)$ and $Q(z)$, shown here in factored form: +\begin{equation} +\begin{split} +P(z) &= (1+z^{-1}) \prod_{i=1}^{p/2} (1 - 2cos(\omega_{2i-1} z^{-1} + z^{-2} ) \\ +Q(z) &= (1-z^{-1}) \prod_{i=1}^{p/2} (1 - 2cos(\omega_{2i} z^{-1} + z^{-2} ) +\end{split} +\end{equation} +where $\omega_{2i-1}$ and $\omega_{2i}$ are the LSP frequencies, found by evaluating the polynomials on the unit circle. The LSP frequencies are interlaced with each other, where $0<\omega_1 < \omega_2 <,..., < \omega_p < \pi$. The separation of adjacent LSP frequencies is related to the bandwidth of spectral peaks in $H(z)=G/A(z)$. A small separation indicates a narrow bandwidth. $A(z)$ may be reconstructed from $P(z)$ and $Q(z)$ using: +\begin{equation} +A(z) = \frac{P(z)+Q(z)}{2} +\end{equation} +Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $(A(z)$ to $P(z)$ and $Q(z)$ polynomial form. We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. -Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still required for voicing estimation (\ref{eq:voicing_snr}). +Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). LPC analysis determines a set of $p=10$ LPC coefficients $\{a_k\}$ that describe a filter the spectral envelope of the current frame and the LPC energy $E=G^2$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{\omega_i\}$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. Other parameters include the pitch $\omega_0$, LPC energy $E$, and voicing $v$. The quantisation and bit packing source code for each Codec 2 mode can be found in \emph{codec2.c}. Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still required for voicing estimation (\ref{eq:voicing_snr}). \begin{figure}[h] \caption{LPC/LSP Modes Encoder} @@ -632,7 +650,15 @@ In CELP codecs these problems can be accommodated by the (high bit rate) excitat Before bit packing, the Codec 2 parameters are decimated in time. An update rate of 20ms is used for the highest rate modes, which drops to 40ms for Codec 2 1300, with a corresponding drop in speech quality. 
The number of bits used to quantise the LPC model via LSPs is also reduced in the lower bit rate modes. This has the effect of making the speech less intelligible, and can introduce annoying buzzy or clicky artefacts into the synthesised speech. Lower fidelity spectral magnitude quantisation also results in more noticeable artefacts from phase synthesis. Neverthless at 1300 bits/s the speech quality is quite usable for HF digital voice, and at 3200 bits/s comparable to closed source codecs at the same bit rate. -TODO: table of LPC/LSP modes, frame rate. Perhaps make this a table covering all modes. +Figure \ref{fig:decoder_lpc_lsp} shows the LPC/LSP mode decoder. Frames of bits received at the frame rate are unpacked and resampled to the 10ms internal frame rate using linear interpolation. The spectral magnitude information is resampled by linear interpolation of the LSP frequencies, and converted back to a quantised LPC model $\hat{H}(z)$. The harmonic magnitudes are recovered by averaging the energy of the LPC +spectrum over the region of each harmonic: +\begin{equation} +\hat{A}_m = \sqrt{ \sum_{k=a_m}^{b_m-1} | \hat{H}(k) |^2 } +\end{equation} +where $H(k)$ is the $N_{dft}$ point DFT of the received LPC model for this frame. For phase synthesis, the phase of $H(z)$ is determined by sampling $\hat{H}(k)$ in the centre of each harmonic: +\begin{equation} +arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \right] +\end{equation} \begin{figure}[h] \caption{LPC/LSP Modes Decoder} @@ -644,9 +670,11 @@ TODO: table of LPC/LSP modes, frame rate. Perhaps make this a table covering al \node [block, right of=rinput,node distance=1.5cm] (unpack) {Unpack}; \node [block, right of=unpack,node distance=2.5cm] (interp) {Interpolate}; \node [block, right of=interp,text width=2cm] (lpc) {LSP to LPC}; +\node [tmp, right of=interp,node distance=1.25cm] (z1) {}; \node [block, right of=lpc,text width=2cm] (sample) {Sample $A_m$}; -\node [block, below of=sample,text width=2cm,node distance=2cm] (post) {Post Filter}; -\node [block, left of=post,text width=2.5cm] (synth) {Sinusoidal\\Synthesis}; +\node [block, below of=lpc,text width=2cm,node distance=2cm] (phase) {Phase Synthesis}; +\node [block, below of=phase,text width=2.5cm,node distance=2cm] (synth) {Sinusoidal\\Synthesis}; +\node [block, right of=synth,text width=2cm] (post) {Post Filter}; \node [output, left of=synth,node distance=2cm] (routput) {}; \draw [->] node[align=left,text width=2cm] {Bit\\Stream} (rinput) -- (unpack); @@ -655,20 +683,15 @@ TODO: table of LPC/LSP modes, frame rate. Perhaps make this a table covering al \draw [->] (lpc) -- (sample); \draw [->] (sample) -- (post); \draw [->] (post) -- (synth); +\draw [->] (z1) |- (phase); +\draw [->] (phase) -- (synth); +\draw [->] (sample) |- (phase); \draw [->] (synth) -- (routput) node[align=left,text width=1.5cm] {$\hat{s}(n)$}; -%\draw [->] (dft) -- (est); -%\draw [->] (nlp) -- (est); -%\draw [->] (z2) -- (z3) -| (pack); -%\draw [->] (est) -- (voicing); -%\draw [->] (voicing) -- (pack); -%\draw [->] (pack) -- (routput) node[right,align=left,text width=1.5cm] {Bit Stream}; \end{tikzpicture} \end{center} \end{figure} -TODO expression for linear interpolation. Interpolation in LSP domain. Ear protection. - \subsection{Codec 2 700C} \label{sect:mode_newamp1} @@ -687,9 +710,32 @@ Summary of mysteries/interesting points drawn out above. \item Energy distribution theory. Need for V model, neural vocoders, non-linear function. Figures and simulation plots would be useful. 
Figure of phase synthesis. \end{enumerate} -\section{Glossary} +\section{Codec 2 Modes} \label{sect:glossary} + +\begin{table}[H] +\label{tab:codec2_modes} +\centering +\begin{tabular}{p{0.75cm}|p{0.75cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{5cm}} +\hline +Mode & Frm (ms) & Bits & $A_m$ & $E$ & $\omega_0$ & $v$ & Comment \\ +\hline +3200 & 20 & 64 & 50 & 5 & 7 & 2 & LSP differences \\ +2400 & 20 & 50 & 36 & 8 & - & 2 & Joint $\omega_0$/E VQ, 2 spare bits \\ +1600 & 40 & 64 & 36 & 10 & 14 & 4 \\ +1400 & 40 & 56 & 36 & 16 & - & 4 \\ +1300 & 40 & 52 & 36 & 5 & 7 & 4 & Joint $\omega_0$/E VQ \\ +1200 & 48 & 40 & 27 & 16 & - & 4 & LSP VQ, Joint $\omega_0$/E VQ, 1 spare \\ +700C & 40 & 28 & 18 & 4 & 6 & - & VQ of log magnitudes \\ +\hline +\end{tabular} +\caption{Codec 2 Modes} +\end{table} + +\section{Glossary} +\label{sect:glossary} + \begin{table}[H] \label{tab:acronyms} \centering @@ -700,6 +746,8 @@ Acronym & Description \\ DFT & Discrete Fourier Transform \\ DTCF & Discrete Time Continuous Frequency Fourier Transform \\ IDFT & Inverse Discrete Fourier Transform \\ +LPC & Linear Predictive Coding \\ +LSP & Line Spectrum Pair \\ MBE & Multi-Band Excitation \\ NLP & Non Linear Pitch (algorithm) \\ \hline @@ -714,6 +762,7 @@ NLP & Non Linear Pitch (algorithm) \\ \hline Symbol & Description & Units \\ \hline +$A(z)$ & LPC (analysis) filter \\ $a_m$ & lower DFT index of current band \\ $b_m$ & upper DFT index of current band \\ $\{A_m\}$ & Set of harmonic magnitudes $m=1,...L$ & dB \\ @@ -729,6 +778,7 @@ $s_w(n)$ & Time domain windowed input speech \\ $S_w(k)$ & Frequency domain windowed input speech \\ $\phi_m$ & Phase of excitation harmonic \\ $\omega_0$ & Fundamental frequency (pitch) & radians/sample \\ +$\{\omega_i\}$ & set of LSP frequencies \\ $v$ & Voicing decision for the current frame \\ \hline \end{tabular} diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index 039accc..f100db3 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -43,3 +43,14 @@ year={1975}, publisher={IEEE} } + +@article{itakura1975line, + title={Line spectrum representation of linear predictor coefficients of speech signals}, + author={Itakura, Fumitada}, + journal={The Journal of the Acoustical Society of America}, + volume={57}, + number={S1}, + pages={S35--S35}, + year={1975}, + publisher={AIP Publishing} +} -- cgit v1.2.3 From 009897669310d1c74cc512b5b82cc952df078294 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Thu, 7 Dec 2023 05:52:41 +1030 Subject: building up 700C section --- doc/codec2.pdf | Bin 250500 -> 286543 bytes doc/codec2.tex | 152 ++++++++++++++++++++++++++++++++++++++++++++++++-- doc/codec2_refs.bib | 8 +++ doc/ratek_mel_fhz.png | Bin 0 -> 11685 bytes doc/warp_fhz_k.png | Bin 0 -> 9209 bytes 5 files changed, 155 insertions(+), 5 deletions(-) create mode 100644 doc/ratek_mel_fhz.png create mode 100644 doc/warp_fhz_k.png (limited to 'doc/codec2_refs.bib') diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 7fcb0ff..d9b5294 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 612c196..d034dc6 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -3,6 +3,7 @@ \usepackage{hyperref} \usepackage{tikz} \usetikzlibrary{calc,arrows,shapes,positioning} +\usepackage{tkz-euclide} \usepackage{float} \usepackage{xstring} \usepackage{catchfile} @@ -583,9 +584,9 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \subsection{LPC/LSP based modes} \label{sect:mode_lpc_lsp} -In this and the next section 
we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pairs (LSPs) to quantise and transmit the spectral magnitude information. There is a great deal of information available on these techniques so they are only briefly described here. +In this and the next section we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pairs (LSPs) to quantise and transmit the spectral magnitude information. There is a great deal of information available on these topics so they are only briefly described here. -The source-filter model of speech production was introduced above in Equation (\ref{eq:source_filter}). A relatively flat excitation source $E(z)$ excites a filter $(H(z)$ which models the magnitude spectrum. Linear Predictive Coding (LPC) defines $H(z)$ as an all pole filter: +The source-filter model of speech production was introduced above in Equation (\ref{eq:source_filter}). A relatively flat excitation source $E(z)$ excites a filter $H(z)$ which models the magnitude spectrum of the speech. Linear Predictive Coding (LPC) defines $H(z)$ as an all pole filter: \begin{equation} H(z) = \frac{G}{1-\sum_{k=1}^p a_k z^{-k}} = \frac{G}{A(z)} \end{equation} @@ -602,7 +603,7 @@ where $\omega_{2i-1}$ and $\omega_{2i}$ are the LSP frequencies, found by evalua \begin{equation} A(z) = \frac{P(z)+Q(z)}{2} \end{equation} -Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $(A(z)$ to $P(z)$ and $Q(z)$ polynomial form. We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. +Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $A(z)$ to $P(z)$ and $Q(z)$ polynomial form. We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). LPC analysis determines a set of $p=10$ LPC coefficients $\{a_k\}$ that describe a filter the spectral envelope of the current frame and the LPC energy $E=G^2$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{\omega_i\}$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. Other parameters include the pitch $\omega_0$, LPC energy $E$, and voicing $v$. The quantisation and bit packing source code for each Codec 2 mode can be found in \emph{codec2.c}. Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still required for voicing estimation (\ref{eq:voicing_snr}). 
@@ -695,8 +696,149 @@ arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \subsection{Codec 2 700C} \label{sect:mode_newamp1} -Microphone equaliser -ratek study +To efficiently transmit spectral amplitude information Codec 2 700C uses a set of algorithms collectively denoted \emph{newamp1}. One of these algorithms is the Rate K resampler which transforms the variable length vectors of spectral magnitude samples to fixed length $K$ vectors suitable for vector quantisation. + +Consider a vector $\mathbf{a}$ of $L$ harmonic spectral magnitudes in dB: +\begin{equation} +\mathbf{a} = \begin{bmatrix} 20log_{10}A_1, 20log_{10}A_2, \ldots 20log_{10}A_L \end{bmatrix} +\end{equation} +\begin{equation} +L=\left \lfloor \frac{F_s}{2F_0} \right \rfloor = \left \lfloor \frac{\pi}{\omega_0} \right \rfloor +\end{equation} +$F_0$ and $L$ are time varying as the pitch track evolves over time. For speech sampled at $F_s=8$ kHz $F_0$ is typically in the range of 50 to 400 Hz, giving $L$ in the range of 10 $\ldots$ 80. \\ + +To quantise and transmit $\mathbf{a}$, it is convenient to resample $\mathbf{a}$ to a fixed length $K$ element vector $\mathbf{b}$ using a resampling function: +\begin{equation} +\begin{split} +\mathbf{y} &= \begin{bmatrix} Y_1, Y_2, \ldots Y_L \end{bmatrix} = H(\mathbf{a}) \\ +\mathbf{b} &= \begin{bmatrix} B_1, B_2, \ldots B_K \end{bmatrix} = R(\mathbf{y}) +\end{split} +\end{equation} +Where $H$ is a filter function chosen to smooth the spectral amplitude samples $A_m$ while not significantly altering the perceptual quality of the speech; and $R$ is a resampling function. To model the response of the human ear $B_k$ are sampled on $K$ non-linearly spaced points on the frequency axis: +\begin{equation} +\begin{split} +f_k &= warp(k,K) \ \textrm{Hz} \quad k=1 \ldots K \\ +warp(1,K) &= 200 \ \textrm{Hz} \\ +warp(K,K) &= 3700 \ \textrm{Hz} +\end{split} +\end{equation} +where $warp()$ is a frequency warping function. Codec 2 700C uses $K=20$, $H=1$, and $warp()$ is defined using the Mel function \cite[p 150]{o1997human} (Figure \ref{fig:mel_fhz}) which samples the spectrum more densely at low frequencies, and less densely at high frequencies: +\begin{equation} \label{eq:mel_f} +mel(f) = 2595log_{10}(1+f/700) +\end{equation} +The inverse mapping of $f$ in Hz from $mel(f)$ is given by: +\begin{equation} \label{eq:f_mel} +f = mel^{-1}(x) = 700(10^{x/2595} - 1); +\end{equation} + +\begin{figure}[h] +\caption{Mel function} +\label{fig:mel_fhz} +\begin{center} +\includegraphics[width=8cm]{ratek_mel_fhz} +\end{center} +\end{figure} + +We wish to use $mel(f)$ to construct $warp(k,K)$, such that there are $K$ evenly spaced points on the $mel(f)$ axis (Figure \ref{fig:mel_k}). 
Solving for the equation of a straight line we can obtain $mel(f)$ as a function of $k$, and hence $warp(k,K)$ (Figure \ref{fig:warp_fhz_k}): +\begin{equation} \label{eq:mel_k} +\begin{split} +g &= \frac{mel(3700)-mel(200)}{K-1} \\ +mel(f) &= g(k-1) + mel(200) +\end{split} +\end{equation} +Substituting (\ref{eq:f_mel}) into the LHS: +\begin{equation} \label{eq:warp} +\begin{split} +2595log_{10}(1+f/700) &= g(k-1) + mel(200) \\ +f = warp(k,K) &= mel^{-1} ( g(k-1) + mel(200) ) \\ +\end{split} +\end{equation} +and the inverse warp function: +\begin{equation} \label{warp_inv} +k = warp^{-1}(f,K) = \frac{mel(f)-mel(200)}{g} + 1 +\end{equation} + +\begin{figure}[h] +\caption{Linear mapping of $mel(f)$ to Rate $K$ sample index $k$} +\vspace{5mm} +\label{fig:mel_k} +\centering +\begin{tikzpicture} +\tkzDefPoint(1,1){A} +\tkzDefPoint(5,5){B} +\draw[thick] (1,1) node [right]{(1,mel(200))} -- (5,5) node [right]{(K,mel(3700))}; +\draw[thick,->] (0,0) -- (6,0) node [below]{k}; +\draw[thick,->] (0,0) -- (0,6) node [left]{mel(f)}; +\foreach \n in {A,B} + \node at (\n)[circle,fill,inner sep=1.5pt]{}; +\end{tikzpicture} +\end{figure} + +\begin{figure}[h] +\caption{$warp(k,K)$ function for $K=20$} +\label{fig:warp_fhz_k} +\begin{center} +\includegraphics[width=8cm]{warp_fhz_k} +\end{center} +\end{figure} + +The rate $K$ vector $\mathbf{b}$ is vector quantised for transmission over the channel: +\begin{equation} +\hat{\mathbf{b}} = Q(\mathbf{b}) +\end{equation} +Codec 2 700C uses a two stage VQ with 9 bits (512 entries) per stage. The rate filtered rate $L$ vector can then be recovered by resampling $\mathbf{\hat{b}}$ using another resampling function: +\begin{equation} +\hat{\mathbf{y}} = S(\hat{\mathbf{b}}) +\end{equation} + +Figure \ref{fig:newamp1_encoder} is the Codec 2 700C encoder. Some notes on this algorithm: +\begin{enumerate} +\item The amplitudes and Vector Quantiser (VQ) entries are in dB, which is very nice to work in and matches the ears logarithmic amplitude response. +\item The mode is capable of communications quality speech and is in common use with FreeDV, but is close to the lower limits of intelligibility, and doesn't do well in some languages (problems have been reported with German and Japanese). +\item The VQ was trained on just 120 seconds of data - way too short. +\item The parameter set (pitch, voicing, log spectral magnitudes) is very similar to that used for the latest neural vocoders. +\item The input speech may be subject to arbitrary filtering, for example due to the microphone frequency response, room acoustics, and anti-aliasing filter. This filtering is fixed or slowly time varying. The filtering biases the target vectors away from the VQ training material, resulting in significant additional mean square error. The filtering does not greatly affect the input speech quality, however the VQ performance distortion increases and the output speech quality is reduced. This is exacerbated by operating in the log domain, the VQ will try to match very low level, perceptually insignificant energy near 0 and 4000 Hz. A microphone equaliser algorithm has been developed to help adjust to arbitrary microphone filtering. 
+\end{enumerate} + +\begin{figure}[h] +\caption{Codec 2 700C (newamp1) encoder} + +\label{fig:newamp1_encoder} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] + +\node [input] (rinput) {}; +\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; +\node [block, right of=z,node distance=1.5cm] (window) {Window}; +\node [block, right of=window,node distance=2.5cm] (dft) {DFT}; +\node [block, right of=dft,node distance=3cm,text width=1.5cm] (est) {Est Amp}; +\node [block, below of=window] (nlp) {NLP}; +\node [block, below of=nlp] (log) {log $\omega_0$}; +\node [block, below of=est,node distance=2cm,text width=2cm] (resample) {Resample Rate $K$}; +\node [block, right of=est,node distance=2.5cm,text width=1.5cm] (voicing) {Est Voicing}; +\node [tmp, below of=resample,node distance=1cm] (z1) {}; +\node [block, below of=dft,node distance=2cm,text width=2cm] (vq) {Decimate \& VQ}; +\node [block, below of=vq,node distance=2cm,text width=2cm] (pack) {Bit Packing}; +\node [output, right of=pack,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); +\draw [->] (z) |- (nlp); +\draw [->] (window) -- node[below] {$s_w(n)$} (dft); +\draw [->] (dft) -- node[below] {$S_\omega(k)$} (est); +\draw [->] (est) -- node[right] {$\mathbf{a}$} (resample); +\draw [->] (est) -- (voicing); +\draw [->] (resample) -- node[below] {$\mathbf{b}$} (vq); +\draw [->] (vq) -- (pack); +\draw [->] (nlp) -- (log); +\draw [->] (log) -- (pack); +\draw [->] (voicing) |- (z1) -| (pack); +\draw [->] (pack) -- (routput) node[right] {Bit Stream}; + +\end{tikzpicture} +\end{center} +\end{figure} + +TODO: Microphone equaliser. ratek study \section{Further Work} diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index f100db3..ea9ee6c 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -54,3 +54,11 @@ year={1975}, publisher={AIP Publishing} } + +@book{o1997human, + title={Speech Communication - Human and machine}, + author={O‘Shaughnessy, Douglas}, + publisher={Addison-Wesley Publishing Company}, + year={1997} +} + diff --git a/doc/ratek_mel_fhz.png b/doc/ratek_mel_fhz.png new file mode 100644 index 0000000..c51d409 Binary files /dev/null and b/doc/ratek_mel_fhz.png differ diff --git a/doc/warp_fhz_k.png b/doc/warp_fhz_k.png new file mode 100644 index 0000000..a6cbf3c Binary files /dev/null and b/doc/warp_fhz_k.png differ -- cgit v1.2.3 From 71b86a8a1167b03d650d8ec3930770d8d17a9259 Mon Sep 17 00:00:00 2001 From: David Rowe Date: Thu, 7 Dec 2023 08:00:21 +1030 Subject: mic EQ and VQ mean removal maths --- doc/codec2.pdf | Bin 286543 -> 301693 bytes doc/codec2.tex | 133 +++++++++++++++++++++++++++++----------------------- doc/codec2_refs.bib | 6 +++ 3 files changed, 80 insertions(+), 59 deletions(-) (limited to 'doc/codec2_refs.bib') diff --git a/doc/codec2.pdf b/doc/codec2.pdf index d9b5294..d91f1f8 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index d034dc6..2c5d13c 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -696,25 +696,63 @@ arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \subsection{Codec 2 700C} \label{sect:mode_newamp1} -To efficiently transmit spectral amplitude information Codec 2 700C uses a set of algorithms collectively denoted \emph{newamp1}. 
One of these algorithms is the Rate K resampler which transforms the variable length vectors of spectral magnitude samples to fixed length $K$ vectors suitable for vector quantisation. +To efficiently transmit spectral amplitude information Codec 2 700C uses a set of algorithms collectively denoted \emph{newamp1}. One of these algorithms is the Rate K resampler which transforms the variable length vectors of spectral magnitude samples to fixed length $K$ vectors suitable for vector quantisation. Figure \ref{fig:newamp1_encoder} presents the Codec 2 700C encoder. -Consider a vector $\mathbf{a}$ of $L$ harmonic spectral magnitudes in dB: +\begin{figure}[h] +\caption{Codec 2 700C (newamp1) encoder} + +\label{fig:newamp1_encoder} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] + +\node [input] (rinput) {}; +\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; +\node [block, right of=z,node distance=1.5cm] (window) {Window}; +\node [block, right of=window,node distance=2.5cm] (dft) {DFT}; +\node [block, right of=dft,node distance=3cm,text width=1.5cm] (est) {Est Amp}; +\node [block, below of=est,node distance=2cm,text width=2cm] (resample) {Resample Rate $K$}; +\node [block, below of=dft,node distance=2cm,text width=2cm] (eq) {Microphone EQ}; +\node [block, below of=eq,node distance=2cm,text width=2cm] (vq) {Decimate \& VQ}; +\node [block, below of=window] (nlp) {NLP}; +\node [block, below of=nlp] (log) {log $\omega_0$}; +\node [block, below of=resample,node distance=2cm,text width=1.5cm] (voicing) {Est Voicing}; +\node [block, below of=vq,node distance=2cm,text width=2cm] (pack) {Bit Packing}; +\node [tmp, right of=resample,node distance=2cm] (z1) {}; +\node [tmp, below of=vq,node distance=1cm] (z2) {}; +\node [output, right of=pack,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); +\draw [->] (z) |- (nlp); +\draw [->] (window) -- node[below] {$s_w(n)$} (dft); +\draw [->] (dft) -- node[below] {$S_\omega(k)$} (est); +\draw [->] (est) -- node[right] {$\mathbf{a}$} (resample); +\draw [->] (resample) -- node[below] {$\mathbf{b}$} (eq); +\draw [->] (eq) -- (vq); +\draw [->] (vq) -- (pack); +\draw [->] (est) -| (z1) |- (voicing); +\draw [->] (nlp) -- (log); +\draw [->] (log) |- (pack); +\draw [->] (voicing) |- (z2) -| (pack); +\draw [->] (pack) -- (routput) node[right] {Bit Stream}; + +\end{tikzpicture} +\end{center} +\end{figure} + +Consider a vector $\mathbf{a}$ of $L$ harmonic spectral magnitudes expressed in dB: \begin{equation} \mathbf{a} = \begin{bmatrix} 20log_{10}A_1, 20log_{10}A_2, \ldots 20log_{10}A_L \end{bmatrix} \end{equation} \begin{equation} L=\left \lfloor \frac{F_s}{2F_0} \right \rfloor = \left \lfloor \frac{\pi}{\omega_0} \right \rfloor \end{equation} -$F_0$ and $L$ are time varying as the pitch track evolves over time. For speech sampled at $F_s=8$ kHz $F_0$ is typically in the range of 50 to 400 Hz, giving $L$ in the range of 10 $\ldots$ 80. \\ +$F_0$ and $L$ are time varying as the pitch track evolves over time. For speech sampled at $F_s=8$ kHz $F_0$ is typically in the range of 50 to 400 Hz, giving $L$ in the range of 10 $\ldots$ 80. 
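As a worked example of the equation above, a low pitched male speaker with $F_0=100$ Hz gives
\begin{equation*}
L=\left \lfloor \frac{F_s}{2F_0} \right \rfloor = \left \lfloor \frac{8000}{200} \right \rfloor = 40
\end{equation*}
while a high pitched speaker at $F_0=400$ Hz gives just $L=10$, so the length of $\mathbf{a}$ can vary widely from frame to frame.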
To quantise and transmit $\mathbf{a}$, it is convenient to resample $\mathbf{a}$ to a fixed length $K$ element vector $\mathbf{b}$ using a resampling function:
 \begin{equation}
-\begin{split}
-\mathbf{y} &= \begin{bmatrix} Y_1, Y_2, \ldots Y_L \end{bmatrix} = H(\mathbf{a}) \\
-\mathbf{b} &= \begin{bmatrix} B_1, B_2, \ldots B_K \end{bmatrix} = R(\mathbf{y})
-\end{split}
+\mathbf{b} = \begin{bmatrix} B_1, B_2, \ldots B_K \end{bmatrix} = R(\mathbf{a})
 \end{equation}
-Where $H$ is a filter function chosen to smooth the spectral amplitude samples $A_m$ while not significantly altering the perceptual quality of the speech; and $R$ is a resampling function. To model the response of the human ear $B_k$ are sampled on $K$ non-linearly spaced points on the frequency axis:
+where $R$ is a resampling function. To model the response of the human ear, $B_k$ are sampled at $K$ non-linearly spaced points on the frequency axis:
 \begin{equation}
 \begin{split}
 f_k &= warp(k,K) \ \textrm{Hz} \quad k=1 \ldots K \\
@@ -722,7 +760,7 @@ warp(1,K) &= 200 \ \textrm{Hz} \\
 warp(K,K) &= 3700 \ \textrm{Hz}
 \end{split}
 \end{equation}
-where $warp()$ is a frequency warping function. Codec 2 700C uses $K=20$, $H=1$, and $warp()$ is defined using the Mel function \cite[p 150]{o1997human} (Figure \ref{fig:mel_fhz}) which samples the spectrum more densely at low frequencies, and less densely at high frequencies:
+where $warp()$ is a frequency warping function. Codec 2 700C uses $K=20$, and $warp()$ is defined using the Mel function \cite[p 150]{o1997human} (Figure \ref{fig:mel_fhz}) which samples the spectrum more densely at low frequencies, and less densely at high frequencies:
 \begin{equation} \label{eq:mel_f}
 mel(f) = 2595log_{10}(1+f/700)
 \end{equation}
@@ -782,63 +820,37 @@ k = warp^{-1}(f,K) = \frac{mel(f)-mel(200)}{g} + 1
 \end{center}
 \end{figure}
 
-The rate $K$ vector $\mathbf{b}$ is vector quantised for transmission over the channel:
+The input speech may be subject to arbitrary filtering, for example due to the microphone frequency response, room acoustics, and anti-aliasing filter. This filtering is fixed or slowly time varying. The filtering biases the target vectors away from the VQ training material, resulting in significant additional mean square error. The filtering does not greatly affect the input speech quality; however, the VQ distortion increases and the output speech quality is reduced. This is exacerbated by operating in the log domain, where the VQ will try to match very low level, perceptually insignificant energy near 0 and 4000 Hz. A microphone equaliser algorithm has been developed to help adjust to arbitrary microphone filtering.
+
+For every input frame $l$, the equaliser (EQ) updates the dimension $K$ equaliser vector $\mathbf{e}$:
 \begin{equation}
-\hat{\mathbf{b}} = Q(\mathbf{b})
+\mathbf{e}^{l+1} = \mathbf{e}^l + \beta(\mathbf{b} - \mathbf{t})
 \end{equation}
-Codec 2 700C uses a two stage VQ with 9 bits (512 entries) per stage. The rate $L$ vector can then be recovered by resampling $\mathbf{\hat{b}}$ using another resampling function:
+where $\mathbf{t}$ is a fixed target vector set to the mean of the VQ quantiser, and $\beta$ is a small adaptation constant.
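In C this update is one operation per element. A minimal editorial sketch follows; the names and the value of $\beta$ are assumptions, not taken from the reference implementation:
\begin{verbatim}
#define K    20      /* rate K vector dimension   */
#define BETA 0.01f   /* small adaptation constant */

/* b: rate K vector for this frame (dB), t: fixed target (dB),
   e: equaliser state (dB), updated once per frame             */
void eq_update(const float b[K], const float t[K], float e[K]) {
    for (int k = 0; k < K; k++)
        e[k] += BETA*(b[k] - t[k]);
}
\end{verbatim}
The equaliser state converges towards the average difference between the incoming vectors and the target, and is subtracted from each vector before quantisation (see below).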
+
+The equalised, mean removed rate $K$ vector $\mathbf{d}$ is vector quantised for transmission over the channel:
 \begin{equation}
-\hat{\mathbf{y}} = S(\hat{\mathbf{b}})
+\begin{split}
+\mathbf{c} &= \mathbf{b} - \mathbf{e} \\
+\mathbf{d} &= \mathbf{c} - \bar{\mathbf{c}} \\
+\hat{\mathbf{c}} &= VQ(\mathbf{d}) + Q(\bar{\mathbf{c}})
+\end{split}
 \end{equation}
-
-Figure \ref{fig:newamp1_encoder} is the Codec 2 700C encoder. Some notes on this algorithm:
+Codec 2 700C uses a two stage VQ with 9 bits (512 entries) per stage. Note that VQ is performed in the $log$ amplitude (dB) domain. The mean of $\mathbf{c}$ is removed prior to VQ, then scalar quantised and transmitted separately as the frame energy. The rate $L$ vector $\hat{\mathbf{y}}$ can then be recovered by resampling $\mathbf{\hat{c}}$:
+\begin{equation}
+\hat{\mathbf{y}} = S(\hat{\mathbf{c}})
+\end{equation}
+
+Some notes on the Codec 2 700C \emph{newamp1} algorithms:
 \begin{enumerate}
-\item The amplitudes and Vector Quantiser (VQ) entries are in dB, which is a convenient domain to work in and matches the ear's logarithmic amplitude response.
+\item The amplitudes and Vector Quantiser (VQ) entries are in dB, which matches the ear's logarithmic amplitude response.
 \item The mode is capable of communications quality speech and is in common use with FreeDV, but is close to the lower limits of intelligibility, and doesn't do well in some languages (problems have been reported with German and Japanese).
 \item The VQ was trained on just 120 seconds of data - way too short.
 \item The parameter set (pitch, voicing, log spectral magnitudes) is very similar to that used for the latest neural vocoders.
-\item The input speech may be subject to arbitrary filtering, for example due to the microphone frequency response, room acoustics, and anti-aliasing filter. This filtering is fixed or slowly time varying. The filtering biases the target vectors away from the VQ training material, resulting in significant additional mean square error. The filtering does not greatly affect the input speech quality; however, the VQ distortion increases and the output speech quality is reduced. This is exacerbated by operating in the log domain, where the VQ will try to match very low level, perceptually insignificant energy near 0 and 4000 Hz. A microphone equaliser algorithm has been developed to help adjust to arbitrary microphone filtering.
-\end{enumerate} - -\begin{figure}[h] -\caption{Codec 2 700C (newamp1) encoder} - -\label{fig:newamp1_encoder} -\begin{center} -\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] - -\node [input] (rinput) {}; -\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; -\node [block, right of=z,node distance=1.5cm] (window) {Window}; -\node [block, right of=window,node distance=2.5cm] (dft) {DFT}; -\node [block, right of=dft,node distance=3cm,text width=1.5cm] (est) {Est Amp}; -\node [block, below of=window] (nlp) {NLP}; -\node [block, below of=nlp] (log) {log $\omega_0$}; -\node [block, below of=est,node distance=2cm,text width=2cm] (resample) {Resample Rate $K$}; -\node [block, right of=est,node distance=2.5cm,text width=1.5cm] (voicing) {Est Voicing}; -\node [tmp, below of=resample,node distance=1cm] (z1) {}; -\node [block, below of=dft,node distance=2cm,text width=2cm] (vq) {Decimate \& VQ}; -\node [block, below of=vq,node distance=2cm,text width=2cm] (pack) {Bit Packing}; -\node [output, right of=pack,node distance=2cm] (routput) {}; - -\draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); -\draw [->] (z) |- (nlp); -\draw [->] (window) -- node[below] {$s_w(n)$} (dft); -\draw [->] (dft) -- node[below] {$S_\omega(k)$} (est); -\draw [->] (est) -- node[right] {$\mathbf{a}$} (resample); -\draw [->] (est) -- (voicing); -\draw [->] (resample) -- node[below] {$\mathbf{b}$} (vq); -\draw [->] (vq) -- (pack); -\draw [->] (nlp) -- (log); -\draw [->] (log) -- (pack); -\draw [->] (voicing) |- (z1) -| (pack); -\draw [->] (pack) -- (routput) node[right] {Bit Stream}; - -\end{tikzpicture} -\end{center} -\end{figure} +\item The Rate K algorithms were recently revisited, several improvements proposed and prototyped \cite{rowe2023ratek}. +\end{enumerate}. -TODO: Microphone equaliser. ratek study +TODO: Post filters for LPC/LSP and 700C. \section{Further Work} @@ -853,7 +865,7 @@ Summary of mysteries/interesting points drawn out above. 
 \end{enumerate}

-\section{Codec 2 Modes}
+\section{Summary of Codec 2 Modes}
 \label{sect:glossary}

 \begin{table}[H]
@@ -868,7 +880,7 @@ Mode & Frm (ms) & Bits & $A_m$ & $E$ & $\omega_0$ & $v$ & Comment \\
 1600 & 40 & 64 & 36 & 10 & 14 & 4 \\
 1400 & 40 & 56 & 36 & 16 & - & 4 \\
 1300 & 40 & 52 & 36 & 5 & 7 & 4 & Joint $\omega_0$/E VQ \\
-1200 & 48 & 40 & 27 & 16 & - & 4 & LSP VQ, Joint $\omega_0$/E VQ, 1 spare \\
+1200 & 40 & 48 & 27 & 16 & - & 4 & LSP VQ, Joint $\omega_0$/E VQ, 1 spare \\
 700C & 40 & 28 & 18 & 4 & 6 & - & VQ of log magnitudes \\
 \hline
 \end{tabular}
@@ -887,11 +899,14 @@ Acronym & Description \\ \hline
 DFT & Discrete Fourier Transform \\
 DTCF & Discrete Time Continuous Frequency Fourier Transform \\
+EQ & (microphone) Equaliser \\
 IDFT & Inverse Discrete Fourier Transform \\
 LPC & Linear Predictive Coding \\
 LSP & Line Spectrum Pair \\
 MBE & Multi-Band Excitation \\
+MSE & Mean Square Error \\
 NLP & Non Linear Pitch (algorithm) \\
+VQ & Vector Quantiser \\
 \hline
 \end{tabular}
 \caption{Glossary of Acronyms}
diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib
index ea9ee6c..9999286 100644
--- a/doc/codec2_refs.bib
+++ b/doc/codec2_refs.bib
@@ -55,6 +55,7 @@
 publisher={AIP Publishing}
 }
 
+
 @book{o1997human,
 title={Speech Communication - Human and machine},
 author={O'Shaughnessy, Douglas},
@@ -62,3 +63,8 @@
 year={1997}
 }
 
+@misc{rowe2023ratek,
+  title = {{FreeDV-015 Codec 2 Rate K Resampler}},
+  year = {2023},
+  note = {\url{https://github.com/drowe67/misc/blob/master/ratek_resampler/ratek_resampler.pdf}}
+}
--
cgit v1.2.3


From 670b278f60b796ce3717960a28985d121f8ea68b Mon Sep 17 00:00:00 2001
From: David Rowe
Date: Sat, 9 Dec 2023 08:16:53 +1030
Subject: aligning 700C figures with maths

---
 doc/codec2.pdf      | Bin 301693 -> 310563 bytes
 doc/codec2.tex      | 126 +++++++++++++++++++++++++++++++++++++--------------
 doc/codec2_refs.bib |  14 ++++++
 3 files changed, 104 insertions(+), 36 deletions(-)

(limited to 'doc/codec2_refs.bib')

diff --git a/doc/codec2.pdf b/doc/codec2.pdf
index d91f1f8..f5f2804 100644
Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ
diff --git a/doc/codec2.tex b/doc/codec2.tex
index 2c5d13c..f1ea924 100644
--- a/doc/codec2.tex
+++ b/doc/codec2.tex
@@ -307,7 +307,7 @@ Figure \ref{fig:analysis} illustrates the processing steps in the sinusoidal ana
 \end{center}
 \end{figure}
 
-For the purposes of speech analysis the time domain speech signal $s(n)$ is divided into overlapping analysis windows (frames) of $N_w=279$ samples. The centre of each analysis window is separated by $N=80$ samples, or an internal frame rate or 10ms. To analyse the $l$-th frame it is convenient to convert the fixed time reference to a sliding time reference centred on the current analysis window:
+The time domain speech signal $s(n)$ is divided into overlapping analysis windows (frames) of $N_w=279$ samples. The centre of each analysis window is separated by $N=80$ samples, or 10ms. Codec 2 operates at an internal frame rate of 100 Hz. To analyse the $l$-th frame it is convenient to convert the fixed time reference to a sliding time reference centred on the current analysis window:
 \begin{equation}
 s_w(n) = s(lN + n) w(n), \quad n = - N_{w2} ... N_{w2}
 \end{equation}
@@ -352,7 +352,7 @@ The phase is sampled at the centre of the band. For all practical Codec 2 modes
 
 Synthesis is achieved by constructing an estimate of the original speech spectrum using the sinusoidal model parameters for the current frame. This information is then transformed to the time domain using an Inverse DFT (IDFT).
To produce a continuous time domain waveform the IDFTs from adjacent frames are smoothly interpolated using a weighted overlap add procedure \cite{mcaulay1986speech}.

 \begin{figure}[h]
-\caption{Sinusoidal Synthesis. At frame $l$ the windowing function generates $2N$ samples. The first $N$ samples complete the current frame and are the synthesiser output. The second $N$ samples are stored for summing with the next frame.}
+\caption{Sinusoidal Synthesis. At frame $l$ the windowing function generates $2N$ samples. The first $N$ samples complete the current frame. The second $N$ samples are stored for summing with the next frame.}
 \label{fig:synthesis}
 \begin{center}
 \begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center]
@@ -565,7 +565,7 @@ n_0 &= -\phi_1 / \omega_0 \\
 
 For unvoiced speech $E(z)$ is a white noise signal. At each frame we sample a random number generator on the interval $-\pi ... \pi$ to obtain the excitation phase of each harmonic. We set $F_0 = 50$ Hz to use a large number of harmonics $L=4000/50=80$ for synthesis to best approximate a noise signal.
 
-An additional phase component is provided by sampling $H(z)$ at the harmonic centres. The phase spectra of $H(z)$ is derived from the filter magnitude response using minimum phase techniques. The method for deriving the phase spectra of $H(z)$ differs between Codec 2 modes and is described below in Sections \ref{sect:mode_lpc_lsp} and \ref{sect:mode_newamp1}. This component of the phase tends to disperse the pitch pulse energy in time, especially around spectral peaks (formants).
+The second phase component is provided by sampling the phase of $H(z)$ at the harmonic centres. The phase spectrum of $H(z)$ is derived from the magnitude response using minimum phase techniques. The method for deriving the phase spectrum of $H(z)$ differs between Codec 2 modes and is described below in Sections \ref{sect:mode_lpc_lsp} and \ref{sect:mode_newamp1}. This component of the phase tends to disperse the pitch pulse energy in time, especially around spectral peaks (formants).
 
 The zero phase model tends to make speech with background noise sound ``clicky". With high levels of background noise the low level inter-formant parts of the spectrum will contain noise rather than speech harmonics, so modelling them as voiced (i.e. a continuous, non-random phase track) is inaccurate. Some codecs (like MBE) have a mixed voicing model that breaks the spectrum into voiced and unvoiced regions. However a significant number of bits/frame (5-12) are required to transmit the frequency selective voicing information. Mixed excitation also requires accurate voicing estimation (parameter estimators always break occasionally under exceptional conditions).
 
@@ -645,23 +645,15 @@ Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping
 \end{center}
 \end{figure}
 
-One of the problems with quantising spectral magnitudes in sinusoidal codecs is the time varying number of harmonic magnitudes, as $L=\pi/\omega_0$, and $\omega_0$ varies from frame to frame. As we require a fixed bit rate for our uses cases, it is desirable to have a fixed number of parameters. Using a fixed order LPC model is a neat solution to this problem. Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly.
The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope.
+One of the problems with quantising spectral magnitudes in sinusoidal codecs is the time varying number of harmonic magnitudes, as $L=\pi/\omega_0$, and $\omega_0$ varies from frame to frame. As we require a fixed bit rate for our use cases, it is desirable to have a fixed number of parameters. Using a fixed order LPC model is a neat solution to this problem. Another feature of LPC modelling combined with scalar LSP quantisation is a tolerance to variations in the input frequency response (see section \ref{sect:mode_newamp1} for more information on this issue).
+
+Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly. The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope.
 
 In CELP codecs these problems can be accommodated by the (high bit rate) excitation, and some low rate codecs such as MELP supply supplementary low frequency information to ``correct" the LPC model.
 
-Before bit packing, the Codec 2 parameters are decimated in time. An update rate of 20ms is used for the highest rate modes, which drops to 40ms for Codec 2 1300, with a corresponding drop in speech quality. The number of bits used to quantise the LPC model via LSPs is also reduced in the lower bit rate modes. This has the effect of making the speech less intelligible, and can introduce annoying buzzy or clicky artefacts into the synthesised speech. Lower fidelity spectral magnitude quantisation also results in more noticeable artefacts from phase synthesis. Neverthless at 1300 bits/s the speech quality is quite usable for HF digital voice, and at 3200 bits/s comparable to closed source codecs at the same bit rate.
+Before bit packing, the Codec 2 parameters are decimated in time. An update rate of 20ms is used for the highest rate modes, which drops to 40ms for Codec 2 1300, with a corresponding drop in speech quality. The number of bits used to quantise the LPC model via LSPs is also reduced in the lower bit rate modes. This has the effect of making the speech less intelligible, and can introduce annoying buzzy or clicky artefacts into the synthesised speech. Lower fidelity spectral magnitude quantisation also results in more noticeable artefacts from phase synthesis. Nevertheless at 1300 bits/s the speech quality is quite usable for HF digital voice, and at 3200 bits/s comparable to closed source codecs at the same bit rate.
 
-Figure \ref{fig:decoder_lpc_lsp} shows the LPC/LSP mode decoder. Frames of bits received at the frame rate are unpacked and resampled to the 10ms internal frame rate using linear interpolation. The spectral magnitude information is resampled by linear interpolation of the LSP frequencies, and converted back to a quantised LPC model $\hat{H}(z)$. The harmonic magnitudes are recovered by averaging the energy of the LPC
-spectrum over the region of each harmonic:
-\begin{equation}
-\hat{A}_m = \sqrt{ \sum_{k=a_m}^{b_m-1} | \hat{H}(k) |^2 }
-\end{equation}
-where $H(k)$ is the $N_{dft}$ point DFT of the received LPC model for this frame. For phase synthesis, the phase of $H(z)$ is determined by sampling $\hat{H}(k)$ in the centre of each harmonic:
-\begin{equation}
-arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \right]
-\end{equation}
-
-\begin{figure}[h]
+\begin{figure}[H]
 \caption{LPC/LSP Modes Decoder}
 \label{fig:decoder_lpc_lsp}
 \begin{center}
@@ -675,7 +667,7 @@ arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil)
 \node [block, right of=lpc,text width=2cm] (sample) {Sample $A_m$};
 \node [block, below of=lpc,text width=2cm,node distance=2cm] (phase) {Phase Synthesis};
 \node [block, below of=phase,text width=2.5cm,node distance=2cm] (synth) {Sinusoidal\\Synthesis};
-\node [block, right of=synth,text width=2cm] (post) {Post Filter};
+\node [block, right of=phase,text width=2cm] (post) {Post Filter};
 \node [output, left of=synth,node distance=2cm] (routput) {};
 
 \draw [->] node[align=left,text width=2cm] {Bit\\Stream} (rinput) -- (unpack);
@@ -683,25 +675,45 @@ arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil)
 \draw [->] (interp) -- (lpc);
 \draw [->] (lpc) -- (sample);
 \draw [->] (sample) -- (post);
-\draw [->] (post) -- (synth);
+\draw [->] (post) |- (synth);
 \draw [->] (z1) |- (phase);
 \draw [->] (phase) -- (synth);
-\draw [->] (sample) |- (phase);
+\draw [->] (post) -- (phase);
 \draw [->] (synth) -- (routput) node[align=left,text width=1.5cm] {$\hat{s}(n)$};
 
 \end{tikzpicture}
 \end{center}
 \end{figure}
 
+Figure \ref{fig:decoder_lpc_lsp} shows the LPC/LSP mode decoder. Frames of bits received at the frame rate are unpacked and resampled to the 10ms internal frame rate using linear interpolation. The spectral magnitude information is resampled by linear interpolation of the LSP frequencies, and converted back to a quantised LPC model $\hat{H}(z)$. The harmonic magnitudes are recovered by averaging the energy of the LPC spectrum over the region of each harmonic:
+\begin{equation}
+\hat{A}_m = \sqrt{ \sum_{k=a_m}^{b_m-1} | \hat{H}(k) |^2 }
+\end{equation}
+where $\hat{H}(k)$ is the $N_{dft}$ point DFT of the received LPC model for this frame. For phase synthesis, the phase of $H(z)$ is determined by sampling $\hat{H}(k)$ in the centre of each harmonic:
+\begin{equation}
+arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \right]
+\end{equation}
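The amplitude sampling step can be sketched in a few lines of C. This is an editorial illustration rather than the reference implementation; in particular the band edge expressions for $a_m$ and $b_m$ are assumptions consistent with the rounding $\lfloor m r \rceil$ above, with $r$ mapping harmonic number to a DFT bin:
\begin{verbatim}
#include <math.h>
#include <complex.h>

#define PI 3.141592653589793f

/* H[]: Ndft point DFT of the interpolated LPC model 1/A(z).
   Recovers Am[m], m = 1...L, as the RMS energy over each band. */
void sample_Am(const float complex H[], int Ndft, float w0, int L,
               float Am[]) {
    float r = w0*Ndft/(2.0f*PI);              /* harmonic to bin   */
    for (int m = 1; m <= L; m++) {
        int am = (int)((m - 0.5f)*r + 0.5f);  /* lower edge a_m    */
        int bm = (int)((m + 0.5f)*r + 0.5f);  /* upper edge b_m    */
        float E = 0.0f;
        for (int k = am; k < bm; k++)
            E += crealf(H[k])*crealf(H[k]) + cimagf(H[k])*cimagf(H[k]);
        Am[m] = sqrtf(E);
    }
}
\end{verbatim}
The phase sample for harmonic $m$ is then simply the argument of the single bin $\hat{H}(\lfloor m r \rceil)$.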
+Prior to sampling the amplitude and phase, a frequency domain post filter is applied to the LPC power spectrum. The algorithm is based on the MBE frequency domain post filter \cite[Section 8.6, p 267]{kondoz1994digital}, which in turn is based on the frequency domain post filter from McAulay and Quatieri \cite[Section 4.3, p 148]{kleijn1995speech}. The authors report a significant improvement in speech quality from the post filter, which has also been our experience when applied to Codec 2. The post filter is given by:
+\begin{equation}
+\label{eq:lpc_lsp_pf}
+\begin{split}
+P_f(e^{j\omega}) &= g \left( R_w(e^{j \omega}) \right)^\beta \\
+R_w(e^{j\omega}) &= A(e^{j \omega / \gamma})/A(e^{j \omega})
+\end{split}
+\end{equation}
+where $g$ is a gain chosen such that the energy at the output of the post filter is the same as at the input, $\beta=0.2$, and $\gamma=0.5$.
The post filter raises the spectral peaks (formants), and pushes down the energy between formants. The $\beta$ term compensates for spectral tilt, such that $R_w$ is similar to the LPC synthesis filter $1/A(z)$, however with equal emphasis at low and high frequencies. The authors suggest the post filter reduces the noise level between formants, an explanation commonly given to post filters used for CELP codecs where significant inter-formant noise exists from the noisy excitation source. However in harmonic sinusoidal codecs there is no excitation noise between formants in $E(z)$. Our theory is that the post filter also acts to reduce the bandwidth of spectral peaks, modifying the energy distribution across the time domain pitch cycle in a way that improves intelligibility, especially for low pitched speakers.
+
+A disadvantage of the post filter is the need for experimentally derived constants. It performs a non-linear operation on the speech spectrum, and if mis-applied can worsen speech quality. As its operation is not completely understood, it represents a source of future quality improvement.
+
 \subsection{Codec 2 700C}
 \label{sect:mode_newamp1}
 
-To efficiently transmit spectral amplitude information Codec 2 700C uses a set of algorithms collectively denoted \emph{newamp1}. One of these algorithms is the Rate K resampler which transforms the variable length vectors of spectral magnitude samples to fixed length $K$ vectors suitable for vector quantisation. Figure \ref{fig:newamp1_encoder} presents the Codec 2 700C encoder.
+To efficiently transmit spectral amplitude information, Codec 2 700C uses a set of algorithms collectively denoted \emph{newamp1}. One of these algorithms is the Rate K resampler which transforms the variable length vectors of spectral magnitude samples to fixed length $K$ vectors suitable for vector quantisation. Figure \ref{fig:encoder_newamp1} presents the Codec 2 700C encoder.
 
-\begin{figure}[h]
-\caption{Codec 2 700C (newamp1) encoder}
+\begin{figure}[H]
+\caption{Codec 2 700C (newamp1) Encoder}
 
-\label{fig:newamp1_encoder}
+\label{fig:encoder_newamp1}
 \begin{center}
 \begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center]
 
 \node [input] (rinput) {};
 \node [tmp, right of=rinput,node distance=0.5cm] (z) {};
 \node [block, right of=z,node distance=1.5cm] (window) {Window};
 \node [block, right of=window,node distance=2.5cm] (dft) {DFT};
 \node [block, right of=dft,node distance=3cm,text width=1.5cm] (est) {Est Amp};
 \node [block, below of=est,node distance=2cm,text width=2cm] (resample) {Resample Rate $K$};
 \node [block, below of=dft,node distance=2cm,text width=2cm] (eq) {Microphone EQ};
 \node [block, below of=eq,node distance=2cm,text width=2cm] (vq) {Decimate \& VQ};
 \node [block, below of=window] (nlp) {NLP};
 \node [block, below of=nlp] (log) {log $\omega_0$};
 \node [block, below of=resample,node distance=2cm,text width=1.5cm] (voicing) {Est Voicing};
 \node [block, below of=vq,node distance=2cm,text width=2cm] (pack) {Bit Packing};
 \node [tmp, right of=resample,node distance=2cm] (z1) {};
 \node [tmp, below of=vq,node distance=1cm] (z2) {};
 \node [output, right of=pack,node distance=2cm] (routput) {};
 
 \draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window);
 \draw [->] (z) |- (nlp);
 \draw [->] (window) -- node[below] {$s_w(n)$} (dft);
 \draw [->] (dft) -- node[below] {$S_\omega(k)$} (est);
 \draw [->] (est) -- node[right] {$\mathbf{a}$} (resample);
 \draw [->] (resample) -- node[below] {$\mathbf{b}$} (eq);
-\draw [->] (eq) -- (vq);
+\draw [->] (eq) -- node[left] {$\mathbf{c}$} (vq);
 \draw [->] (vq) -- (pack);
 \draw [->] (est) -| (z1) |- (voicing);
 \draw [->] (nlp) -- (log);
 \draw [->] (log) |- (pack);
 \draw [->] (voicing) |- (z2) -| (pack);
 \draw [->] (pack) -- (routput) node[right] {Bit Stream};
 
 \end{tikzpicture}
 \end{center}
 \end{figure}
 
@@ -778,17 +790,19 @@ f = mel^{-1}(x) = 700(10^{x/2595} - 1);
 \end{figure}
 
 We wish to use $mel(f)$ to construct $warp(k,K)$, such that there are $K$ evenly spaced points on the $mel(f)$ axis (Figure \ref{fig:mel_k}).
Solving for the equation of a straight line we can obtain $mel(f)$ as a function of $k$, and hence $warp(k,K)$ (Figure \ref{fig:warp_fhz_k}):
-\begin{equation} \label{eq:mel_k}
+\begin{equation}
+\label{eq:mel_k}
 \begin{split}
 g &= \frac{mel(3700)-mel(200)}{K-1} \\
 mel(f) &= g(k-1) + mel(200)
 \end{split}
 \end{equation}
 Substituting (\ref{eq:mel_f}) into the LHS:
-\begin{equation} \label{eq:warp}
+\begin{equation}
+\label{eq:warp}
 \begin{split}
 2595log_{10}(1+f/700) &= g(k-1) + mel(200) \\
-f = warp(k,K) &= mel^{-1} ( g(k-1) + mel(200) ) \\
+f_k = warp(k,K) &= mel^{-1} ( g(k-1) + mel(200) ) \\
 \end{split}
 \end{equation}
 and the inverse warp function:
@@ -833,13 +847,54 @@ The equalised, mean removed rate $K$ vector $\mathbf{d}$ is vector quantised for
 \begin{split}
 \mathbf{c} &= \mathbf{b} - \mathbf{e} \\
 \mathbf{d} &= \mathbf{c} - \bar{\mathbf{c}} \\
-\hat{\mathbf{c}} &= VQ(\mathbf{d}) + Q(\bar{\mathbf{c}})
+\hat{\mathbf{c}} &= VQ(\mathbf{d}) + Q(\bar{\mathbf{c}}) \\
+ &= \hat{\mathbf{d}} + \hat{\bar{\mathbf{c}}}
 \end{split}
 \end{equation}
-Codec 2 700C uses a two stage VQ with 9 bits (512 entries) per stage. Note that VQ is performed in the $log$ amplitude (dB) domain. The mean of $\mathbf{c}$ is removed prior to VQ, then scalar quantised and transmitted separately as the frame energy. The rate $L$ vector $\hat{\mathbf{y}}$ can then be recovered by resampling $\mathbf{\hat{c}}$:
+Codec 2 700C uses a two stage VQ with 9 bits (512 entries) per stage. The \emph{mbest} multi-stage search algorithm is used to jointly search the two stages (using 5 survivors from the first stage). Note that VQ is performed in the $log$ amplitude (dB) domain. The mean of $\mathbf{c}$ is removed prior to VQ, then scalar quantised and transmitted separately as the frame energy. At the decoder, the rate $L$ vector $\hat{\mathbf{a}}$ can then be recovered by resampling $\hat{\mathbf{c}}$:
 \begin{equation}
-\hat{\mathbf{y}} = S(\hat{\mathbf{c}})
+\hat{\mathbf{a}} = S(\hat{\mathbf{c}} + \mathbf{p})
 \end{equation}
+where $\mathbf{p}$ is a post filter vector. The post filter vector is generated from the mean-removed rate $K$ vector $\hat{\mathbf{d}}$ in the $log$ frequency domain:
+\begin{equation}
+\begin{split}
+\mathbf{p} &= G + P_{gain} \left( \hat{\mathbf{d}} + \mathbf{r} \right) - \mathbf{r} \\
+\mathbf{r} &= \begin{bmatrix} R_1, R_2, \ldots R_K \end{bmatrix} \\
+ R_k &= 20log_{10}(f_k/300) \quad k=1,...,K
+\end{split}
+\end{equation}
+where $G$ is an energy normalisation term, and $1.2 < P_{gain} < 1.5$ describes the amount of post filtering applied. $G$ and $P_{gain}$ are similar to $g$ and $\beta$ in the LPC/LSP post filter (\ref{eq:lpc_lsp_pf}). The $\mathbf{r}$ term is a high pass (pre-emphasis) filter with +20 dB/decade gain after 300 Hz ($f_k$ is given in (\ref{eq:warp})). The post filtering is applied to the pre-emphasised vector, then the pre-emphasis is removed from the final result. Multiplying by $P_{gain}$ in the $log$ domain is similar to the $\beta$ power function in (\ref{eq:lpc_lsp_pf}); spectral peaks are moved up, and troughs pushed down. This filter enhances the speech quality but also introduces some artefacts.
+
+Figure \ref{fig:decoder_newamp1} is the block diagram of the decoder signal processing. Cepstral techniques are used to synthesise a phase spectrum $arg[H(e^{j \omega})]$ from $\hat{\mathbf{a}}$ using a minimum phase model.
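A minimal C sketch of the post filter vector follows, as an editorial illustration. It assumes $G$ is implemented as a simple energy normalisation in the dB domain, which is one reasonable reading of ``energy normalisation term"; the reference implementation may differ:
\begin{verbatim}
#include <math.h>

#define K 20

/* d_hat: mean removed rate K vector (dB), modified in place.
   f[k]:  warp(k+1,K) centre frequencies in Hz.
   p_gain: amount of post filtering, 1.2 ... 1.5              */
void post_filter_rateK(float d_hat[K], const float f[K], float p_gain) {
    float e_in = 0.0f, e_out = 0.0f;

    for (int k = 0; k < K; k++) {
        float r = 20.0f*log10f(f[k]/300.0f);  /* pre-emphasis (dB) */
        e_in += powf(10.0f, d_hat[k]/10.0f);
        d_hat[k] = p_gain*(d_hat[k] + r) - r;
        e_out += powf(10.0f, d_hat[k]/10.0f);
    }

    /* G: normalise so output energy matches input energy */
    float G = 10.0f*log10f(e_in/e_out);
    for (int k = 0; k < K; k++)
        d_hat[k] += G;
}
\end{verbatim}
The peaks of $\hat{\mathbf{d}}$ are raised and the troughs lowered, with the pre-emphasis term preventing the post filter from over-weighting low frequency energy.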
+
+\begin{figure}[h]
+\caption{Codec 2 700C (newamp1) Decoder}
+\label{fig:decoder_newamp1}
+\begin{center}
+\begin{tikzpicture}[auto, node distance=3cm,>=triangle 45,x=1.0cm,y=1.0cm,align=center]
+
+\node [input] (rinput) {};
+\node [block, right of=rinput,node distance=1.5cm] (unpack) {Unpack};
+\node [block, right of=unpack,node distance=2.5cm] (interp) {Interpolate};
+\node [block, right of=interp,node distance=3cm,text width=2cm] (post) {Post Filter};
+\node [block, below of=post,text width=2cm,node distance=2cm] (resample) {Resample to Rate $L$};
+\node [block, below of=resample,text width=2cm,node distance=2cm] (synth) {Sinusoidal\\Synthesis};
+\node [tmp, below of=resample,node distance=1cm] (z1) {};
+\node [block, right of=synth,text width=2cm] (phase) {Phase Synthesis};
+\node [output,left of=synth,node distance=2cm] (routput) {};
+
+\draw [->] node[align=left,text width=2cm] {Bit\\Stream} (rinput) -- (unpack);
+\draw [->] (unpack) -- (interp);
+\draw [->] (interp) -- (post);
+\draw [->] (post) -- node[left] {$\hat{\mathbf{c}}$} (resample);
+\draw [->] (resample) -- node[left] {$\hat{\mathbf{a}}$} (synth);
+\draw [->] (resample) -- (z1) -| (phase);
+\draw [->] (phase) -- (synth);
+\draw [->] (synth) -- (routput) node[align=left,text width=1.5cm] {$\hat{s}(n)$};
+
+\end{tikzpicture}
+\end{center}
+\end{figure}

 Some notes on the Codec 2 700C \emph{newamp1} algorithms:
 \begin{enumerate}
 \item The amplitudes and Vector Quantiser (VQ) entries are in dB, which matches the ear's logarithmic amplitude response.
 \item The mode is capable of communications quality speech and is in common use with FreeDV, but is close to the lower limits of intelligibility, and doesn't do well in some languages (problems have been reported with German and Japanese).
 \item The VQ was trained on just 120 seconds of data - way too short.
 \item The parameter set (pitch, voicing, log spectral magnitudes) is very similar to that used for the latest neural vocoders.
-\item The Rate K algorithms were recently revisited, several improvements proposed and prototyped \cite{rowe2023ratek}.
-\end{enumerate}.
-
-TODO: Post filters for LPC/LSP and 700C.
+\item The Rate K algorithms were recently revisited, several improvements were proposed and prototyped \cite{rowe2023ratek}.
+\end{enumerate}
 
 \section{Further Work}
 
@@ -861,7 +914,8 @@ Summary of mysteries/interesting points drawn out above.
 \item How to use Octave tools to single step through codec operation
 \item Table summarising source files with one line description
 \item Add doc license (Creative Commons?)
-\item Energy distribution theory. Need for V model, neural vocoders, non-linear function. Figures and simulation plots would be useful. Figure of phase synthesis.
+\item Energy distribution theory. Need for V model, neural vocoders, non-linear function.
+\item Figures and simulation plots would be useful to better explain algorithms.
\end{enumerate} @@ -880,7 +934,7 @@ Mode & Frm (ms) & Bits & $A_m$ & $E$ & $\omega_0$ & $v$ & Comment \\ 1600 & 40 & 64 & 36 & 10 & 14 & 4 \\ 1400 & 40 & 56 & 36 & 16 & - & 4 \\ 1300 & 40 & 52 & 36 & 5 & 7 & 4 & Joint $\omega_0$/E VQ \\ -1200 & 40 & 48 & 27 & 16 & - & 4 & LSP VQ, Joint $\omega_0$/E VQ, 1 spare \\ +1200 & 40 & 48 & 27 & 16 & - & 4 & LSP VQ, joint $\omega_0$/E VQ, 1 spare \\ 700C & 40 & 28 & 18 & 4 & 6 & - & VQ of log magnitudes \\ \hline \end{tabular} diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index 9999286..756a92b 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -68,3 +68,17 @@ year = {2023}, note = {\url{https://github.com/drowe67/misc/blob/master/ratek_resampler/ratek_resampler.pdf}} } + +@book{kondoz1994digital, + title={Digital speech: coding for low bit rate communication systems}, + author={Kondoz, Ahmet M}, + year={1994}, + publisher={John Wiley \& Sons} +} + +@book{kleijn1995speech, + title={Speech coding and synthesis}, + author={Kleijn, W Bastiaan and Paliwal, Kuldip K}, + year={1995}, + publisher={Elsevier Science Inc.} +} \ No newline at end of file -- cgit v1.2.3