From 112f3b50f0fe132b2d59f8ce8f2c76af23fc25e9 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sat, 18 Nov 2023 09:37:58 +1030 Subject: kicking off Codec 2 documentation --- doc/codec2.pdf | Bin 0 -> 83203 bytes doc/codec2.tex | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ doc/codec2_refs.bib | 24 ++++++++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 doc/codec2.pdf create mode 100644 doc/codec2.tex create mode 100644 doc/codec2_refs.bib diff --git a/doc/codec2.pdf b/doc/codec2.pdf new file mode 100644 index 0000000..21fc6cd Binary files /dev/null and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex new file mode 100644 index 0000000..0c6bcee --- /dev/null +++ b/doc/codec2.tex @@ -0,0 +1,48 @@ +\documentclass{article} +\usepackage{amsmath} +\usepackage{hyperref} +\usepackage{tikz} + +\usepackage{xstring} +\usepackage{catchfile} + +\CatchFileDef{\headfull}{../.git/HEAD}{} +\StrGobbleRight{\headfull}{1}[\head] +\StrBehind[2]{\head}{/}[\branch] +\IfFileExists{../.git/refs/heads/\branch}{% + \CatchFileDef{\commit}{../.git/refs/heads/\branch}{}}{% + \newcommand{\commit}{\dots~(in \emph{packed-refs})}} +\newcommand{\gitrevision}{% + \StrLeft{\commit}{7}% +} + +\title{Codec 2} +\author{David Rowe\\ \\ Revision: {\gitrevision} on branch: {\branch}} + +\begin{document} +\maketitle + +\section{Introduction} + +Codec 2 is an open source speech codec designed for communications quality speech between 700 and 3200 bit/s. The main application is low bandwidth HF/VHF digital radio. It fills a gap in open source voice codecs beneath 5000 bit/s and is released under the GNU Lesser General Public License (LGPL). It is written in C99 standard C. + +The Codec 2 project was started in 2009 in response to the problem of closed source, patented, proprietary voice codecs in the sub-5 kbit/s range, in particular for use in the Amateur Radio service. + +This document describes Codec 2 at two levels. Section \ref{sect:overview} is a high level overview aimed at the Radio Amateur, while Section \ref{sect:details} contains a more detailed description with math and signal processing theory. This document is not a concise algorithmic description, instead the algorithm is defined by the reference C99 source code and automated tests (ctests). + +This production of this document was kindly supported by an ARDC grant \cite{ardc2023}. As an open source project, many people have contributed to Codec 2 over the years - we deeply appreciate all of your support. 
+ +\section{Codec 2 for the Radio Amateur} +\label{sect:overview} + +\section{Signal Processing Details} +\label{sect:details} + +\section{Further Work} + + +\cite{griffin1988multiband} + +\bibliographystyle{plain} +\bibliography{codec2_refs} +\end{document} diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib new file mode 100644 index 0000000..e578439 --- /dev/null +++ b/doc/codec2_refs.bib @@ -0,0 +1,24 @@ +@article{griffin1988multiband, + title={Multiband excitation vocoder}, + author={Griffin, Daniel W and Lim, Jae S}, + journal={IEEE Transactions on acoustics, speech, and signal processing}, + volume={36}, + number={8}, + pages={1223--1235}, + year={1988}, + publisher={IEEE} +} +@book{rowe1997techniques, + title={Techniques for harmonic sinusoidal coding}, + author={Rowe, David Grant}, + year={1997}, + publisher={Citeseer}, + note = {\url{https://www.rowetel.com/downloads/1997_rowe_phd_thesis.pdf}} +} + +@misc{ardc2023, + title = {{Enhancing HF Digital Voice with FreeDV}}, + year = {2023}, + note = {\url{https://www.ardc.net/apply/grants/2023-grants/enhancing-hf-digital-voice-with-freedv/}} +} + -- cgit v1.2.3 From 9bc86bc2c7fe516703bf3d1adb939657a405e966 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 19 Nov 2023 07:14:04 +1030 Subject: building up plot support --- doc/Makefile | 15 +++++++ doc/codec2.pdf | Bin 83203 -> 93270 bytes doc/codec2.tex | 32 +++++++++++++- octave/plamp.m | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ raw/hts2a.raw | Bin 0 -> 48000 bytes 5 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 doc/Makefile create mode 100644 octave/plamp.m create mode 100644 raw/hts2a.raw diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..3729f6a --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,15 @@ +# Makefile for codec2.pdf + +# set these externally with an env variable (e.g. for GitHub action) to override +# defaults below. Need to run cmake with -DDUMP +CODEC2_SRC ?= $(HOME)/codec2 +CODEC2_BINARY ?= $(HOME)/codec2/build_linux/src + +PATH := $(PATH):$(CODEC2_BINARY) + +PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex + +$(PLOT_FILES): + echo $(PATH) + c2sim $(CODEC2_SRC)/raw/hts2a.raw --dump hts2a + DISPLAY=""; printf "plamp('hts2a',f=37,epslatex=1)\nq\n" | octave-cli -qf -p $(CODEC2_SRC)/octave diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 21fc6cd..6b59d41 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 0c6bcee..b29a87c 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -35,12 +35,42 @@ This production of this document was kindly supported by an ARDC grant \cite{ard \section{Codec 2 for the Radio Amateur} \label{sect:overview} +\subsection{Model Based Speech Coding} + +A speech codec takes speech samples from an A/D converter (e.g. 16 bit samples at an 8 kHz or 128 kbits/s) and compresses them down to a low bit rate that can be more easily sent over a narrow bandwidth channel (700 bits/s). Speech coding is the art of "what can we throw away". We need to lower the bit rate of the speech while retaining intelligible speech, and making it sound as natural as possible. + +As such low bit rates we use a speech production model. The input speech is anlaysed, and we extract model parameters, which are then sent over the channel. An example of a model based parameter is the pitch of the person speaking. We estimate the pitch of the speaker, quantise it to a 7 bit number, and send that over the channel every 20ms. 
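To make the idea of sending model parameters concrete, here is a small Octave sketch (illustrative only - this is not the pitch estimator or quantiser Codec 2 actually uses, and the function name is made up for this example). It estimates the pitch period of one frame of voiced speech by picking the peak of its autocorrelation over the expected range of human pitch, then maps the result to a 7 bit number that could be sent over the channel:

\begin{verbatim}
% Illustration only: a toy autocorrelation pitch estimate and 7 bit
% quantiser.  Codec 2 uses a different (NLP) pitch estimator.
% s: one frame of speech samples (row vector), Fs: sample rate in Hz
function [F0 index] = toy_pitch_quantiser(s, Fs=8000)
  P_min = floor(Fs/500); P_max = floor(Fs/50);  % 500 Hz down to 50 Hz
  R = zeros(1,P_max);
  for P = P_min:P_max
    R(P) = sum(s(1:end-P).*s(1+P:end));         % autocorrelation at lag P
  end
  [Rmax P_hat] = max(R(P_min:P_max));
  P_hat = P_hat + P_min - 1;                    % pitch period in samples
  F0 = Fs/P_hat;                                % fundamental frequency (Hz)
  % quantise log(F0) over the 50 to 500 Hz range to 7 bits (128 levels)
  index = round(127*(log(F0)-log(50))/(log(500)-log(50)));
  index = max(0, min(127, index));
end
\end{verbatim}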
+ +The model based approach used by Codec 2 allows high compression, with some trade offs such as noticeable artefacts in the decoded speech. Higher bit rate codecs (above 5000 bit/s), such as those use for mobile telephony or voice on the Internet, tend to pay more attention to preserving the speech waveform, or use a hybrid approach of waveform and model based techniques. + +Recently, machine learning has been applied to speech coding. This technology promises high quality, artefact free speech quality at low bit rates, but currently (2023) requires significantly more memory and CPU than traditional speech coding technology such as Codec 2. However the field is progressing rapidly, and with the progress of Moore's law will soon be a viable technology for many low bit rate speech applications. + +\subsection{Speech in Time and Frequency} + +\begin{figure} +\caption{ A 40ms segment of the word "these" from a female speaker, sampled at 8 kHz. The waveform repeats itself every 4.3ms (230 Hz), this is the "pitch period" of this segment.} +\label{fig:hts2a_time} +\begin{center} +\input hts2a_37_sn.tex +\\ +\input hts2a_37_sw.tex +\end{center} +\end{figure} + +\subsection{Sinusoidal Speech Coding} + +\subsection{Spectral Magnitude Quantisation} + +\subsection{Bit Allocation} + \section{Signal Processing Details} \label{sect:details} \section{Further Work} - +\begin{enumerate} +\item How to use tools to single step through codec operation +\end{enumerate} \cite{griffin1988multiband} \bibliographystyle{plain} diff --git a/octave/plamp.m b/octave/plamp.m new file mode 100644 index 0000000..a224a5c --- /dev/null +++ b/octave/plamp.m @@ -0,0 +1,136 @@ +% plamp.m +% Plot ampltiude modelling information from c2sim dump files. + +function plamp(samname, f, epslatex=0) + + plot_sw = 1; + + sn_name = strcat(samname,"_sn.txt"); + Sn = load(sn_name); + + sw_name = strcat(samname,"_sw.txt"); + Sw = load(sw_name); + + sw__name = strcat(samname,"_sw_.txt"); + if (file_in_path(".",sw__name)) + Sw_ = load(sw__name); + endif + + ew_name = strcat(samname,"_ew.txt"); + if (file_in_path(".",ew_name)) + Ew = load(ew_name); + endif + + rk_name = strcat(samname,"_rk.txt"); + if (file_in_path(".",rk_name)) + Rk = load(rk_name); + endif + + model_name = strcat(samname,"_model.txt"); + model = load(model_name); + + modelq_name = strcat(samname,"_qmodel.txt"); + if (file_in_path(".",modelq_name)) + modelq = load(modelq_name); + endif + + pw_name = strcat(samname,"_pw.txt"); + if (file_in_path(".",pw_name)) + Pw = load(pw_name); + endif + + lsp_name = strcat(samname,"_lsp.txt"); + if (file_in_path(".",lsp_name)) + lsp = load(lsp_name); + endif + + phase_name = strcat(samname,"_phase.txt"); + if (file_in_path(".",phase_name)) + phase = load(phase_name); + endif + + phase_name_ = strcat(samname,"_phase_.txt"); + if (file_in_path(".",phase_name_)) + phase_ = load(phase_name_); + endif + + snr_name = strcat(samname,"_snr.txt"); + if (file_in_path(".",snr_name)) + snr = load(snr_name); + endif + + if epslatex, [textfontsize linewidth] = set_fonts(); end + + k = ' '; + do + figure(1); + clf; + s = [ Sn(2*f-1,:) Sn(2*f,:) ]; + plot(s,'b'); + axis([1 length(s) -30000 30000]); + + figure(2); + Wo = model(f,1); + L = model(f,2); + Am = model(f,3:(L+2)); + plot((1:L)*Wo*4000/pi, 20*log10(Am),"+-r"); + axis([1 4000 -10 80]); + hold on; + if plot_sw + plot((0:255)*4000/256, Sw(f,:),"b"); + legend('boxoff'); + end + + hold off; grid minor; + + % print EPS file + + if epslatex + sz = "-S300,250"; + figure(1); + fn = 
sprintf("%s_%d_sn.tex",samname,f); + print(fn,"-depslatex",sz); printf("\nprinting... %s\n", fn); + + figure(2); + fn = sprintf("%s_%d_sw.tex",samname,f); + print(fn,"-depslatex",sz); printf("printing... %s\n", fn); + + restore_fonts(textfontsize,linewidth); + endif + + % interactive menu + + printf("\rframe: %d menu: n-next b-back s-plot_sw q-quit", f); + fflush(stdout); + k = kbhit(); + if k == 'n'; f = f + 1; endif + if k == 'b'; f = f - 1; endif + if k == 's' + if plot_sw; plot_sw = 0; else; plot_sw = 1; end + endif + + until (k == 'q') + printf("\n"); + +endfunction + +function [textfontsize linewidth] = set_fonts(font_size=12) + textfontsize = get(0,"defaulttextfontsize"); + linewidth = get(0,"defaultlinelinewidth"); + set(0, "defaulttextfontsize", font_size); + set(0, "defaultaxesfontsize", font_size); + set(0, "defaultlinelinewidth", 0.5); +end + +function restore_fonts(textfontsize,linewidth) + set(0, "defaulttextfontsize", textfontsize); + set(0, "defaultaxesfontsize", textfontsize); + set(0, "defaultlinelinewidth", linewidth); +end + +function print_eps_restore(fn,sz,textfontsize,linewidth) + print(fn,"-depslatex",sz); + printf("printing... %s\n", fn); + restore_fonts(textfontsize,linewidth); +end + diff --git a/raw/hts2a.raw b/raw/hts2a.raw new file mode 100644 index 0000000..6d9cf17 Binary files /dev/null and b/raw/hts2a.raw differ -- cgit v1.2.3 From cef07b4bd72a930fad74b0ef0f7bf765fd59cf28 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 19 Nov 2023 08:28:25 +1030 Subject: drafted time-freq speech section, building up sinsuoidal model figure --- doc/codec2.pdf | Bin 93270 -> 121748 bytes doc/codec2.tex | 49 +++++++++++++++++++++++++++++++++++++++++++------ octave/plamp.m | 7 ++++--- 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 6b59d41..e148998 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index b29a87c..c9cd723 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -2,6 +2,7 @@ \usepackage{amsmath} \usepackage{hyperref} \usepackage{tikz} +\usepackage{float} \usepackage{xstring} \usepackage{catchfile} @@ -37,18 +38,26 @@ This production of this document was kindly supported by an ARDC grant \cite{ard \subsection{Model Based Speech Coding} -A speech codec takes speech samples from an A/D converter (e.g. 16 bit samples at an 8 kHz or 128 kbits/s) and compresses them down to a low bit rate that can be more easily sent over a narrow bandwidth channel (700 bits/s). Speech coding is the art of "what can we throw away". We need to lower the bit rate of the speech while retaining intelligible speech, and making it sound as natural as possible. +A speech codec takes speech samples from an A/D converter (e.g. 16 bit samples at an 8 kHz or 128 kbits/s) and compresses them down to a low bit rate that can be more easily sent over a narrow bandwidth channel (e.g. 700 bits/s for HF). Speech coding is the art of "what can we throw away". We need to lower the bit rate of the speech while retaining speech you can understand, and making it sound as natural as possible. -As such low bit rates we use a speech production model. The input speech is anlaysed, and we extract model parameters, which are then sent over the channel. An example of a model based parameter is the pitch of the person speaking. We estimate the pitch of the speaker, quantise it to a 7 bit number, and send that over the channel every 20ms. +As such low bit rates we use a speech production ``model". 
The input speech is analysed, and we extract model parameters, which are then sent over the channel. An example of a model based parameter is the pitch of the person speaking. We estimate the pitch of the speaker, quantise it to a 7 bit number, and send that over the channel every 20ms. 

-The model based approach used by Codec 2 allows high compression, with some trade offs such as noticeable artefacts in the decoded speech. Higher bit rate codecs (above 5000 bit/s), such as those use for mobile telephony or voice on the Internet, tend to pay more attention to preserving the speech waveform, or use a hybrid approach of waveform and model based techniques. +The model based approach used by Codec 2 allows high compression, with some trade offs such as noticeable artefacts in the decoded speech. Higher bit rate codecs (above 5000 bit/s), such as those used for mobile telephony or voice on the Internet, tend to pay more attention to preserving the speech waveform, or use a hybrid approach of waveform and model based techniques. They sound better but require a higher bit rate. 

-Recently, machine learning has been applied to speech coding. This technology promises high quality, artefact free speech quality at low bit rates, but currently (2023) requires significantly more memory and CPU than traditional speech coding technology such as Codec 2. However the field is progressing rapidly, and with the progress of Moore's law will soon be a viable technology for many low bit rate speech applications. +Recently, machine learning has been applied to speech coding. This technology promises high quality, artefact free speech at low bit rates, but currently (2023) requires significantly more memory and CPU resources than traditional speech coding technology such as Codec 2. However the field is progressing rapidly, and as the cost of CPU and memory decreases (Moore's law) it will soon be a viable technology for many low bit rate speech applications. 

\subsection{Speech in Time and Frequency} 

+To explain how Codec 2 works, let's look at some speech. Figure \ref{fig:hts2a_time} shows a short 40ms segment of speech in the time and frequency domain. On the time plot we can see the waveform is changing slowly over time as the word is articulated. On the right hand side it also appears to repeat itself - one cycle looks very similar to the last. This cycle time is the "pitch period", which for this example is around $P=35$ samples. Given we are sampling at $F_s=8000$ Hz, the pitch period is $P/F_s=35/8000=0.0044$ seconds, or 4.4ms. 

+The pitch changes in time, and is generally higher for females and children, and lower for males. It only appears to be constant for a short snap shot (a few 10s of ms) in time. For human speech pitch can vary over a range of 50 Hz to 500 Hz. 

+Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. Turns out of the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency. 

+Note that each harmonic has it's own amplitude, that varies slowly up and down with frequency. 
The red line plots the amplitude of each harmonic. There is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. + +\begin{figure}[H] +\caption{ A 40ms segment of the word "these" from a female speaker, sampled at 8kHz. Top is a plot again time, bottom (blue) is a plot against frequency. The waveform repeats itself every 4.3ms (230 Hz), this is the "pitch period" of this segment.} \label{fig:hts2a_time} \begin{center} \input hts2a_37_sn.tex @@ -59,6 +68,34 @@ Recently, machine learning has been applied to speech coding. This technology p \subsection{Sinusoidal Speech Coding} +A sinewave will cause a spike or spectral line on a spectrum plot, so we can see each spike as a small sine wave generator. Each sine wave generator has it's own frequency (e.g. $230, 460, 690,...$ Hz), amplitude and phase. If we add all of the sine waves together we can produce the time domain signal at the top of Figure \ref{fig:hts2a_time}, and produce synthesised speech. This is called sinusoidal speech coding and is the ``model" at the heart of Codec 2. + +\begin{figure}[h] +\caption{The Sinusoidal speech model. If we sum a series of sine waves, we can generate speech.} +\label{fig:sinusoidal_model} +\begin{center} +\begin{tikzpicture} +\draw (1,2) -- (2,2); +\draw (2.5,0.75) -- (2.5,1.5); +\draw (2.5,2) circle (0.5); +\draw (2.25,2) -- (2.75,2); +\draw (2.5,1.75) -- (2.5,2.25); +\draw (3,2) -- (4,2); +\draw (4,1.5) rectangle (5,2.5); +\draw (5,2) -- (6,2) -- (6,1.5); +\draw (5.75,1.5) rectangle (6.25,0.5); +\draw (6,0.5) -- (6,0); +\draw (5.75,0) -- (6.25,0); + +\node[] at (0.5,2) {$P_{sun}$}; +\node[] at (2.5,0.5) {$P_{quiet}$}; +\node[] at (4.5,2) {$G$}; +\node[align=right] at (6.75,1) {Rx}; + +\end{tikzpicture} +\end{center} +\end{figure} + \subsection{Spectral Magnitude Quantisation} \subsection{Bit Allocation} diff --git a/octave/plamp.m b/octave/plamp.m index a224a5c..e32d102 100644 --- a/octave/plamp.m +++ b/octave/plamp.m @@ -68,7 +68,8 @@ function plamp(samname, f, epslatex=0) s = [ Sn(2*f-1,:) Sn(2*f,:) ]; plot(s,'b'); axis([1 length(s) -30000 30000]); - + xlabel('Time (samples)'); ylabel('Amplitude'); + figure(2); Wo = model(f,1); L = model(f,2); @@ -78,15 +79,15 @@ function plamp(samname, f, epslatex=0) hold on; if plot_sw plot((0:255)*4000/256, Sw(f,:),"b"); - legend('boxoff'); end + legend('boxoff'); ylabel ('Amplitude (dB)'); xlabel('Frequency (Hz)'); hold off; grid minor; % print EPS file if epslatex - sz = "-S300,250"; + sz = "-S300,200"; figure(1); fn = sprintf("%s_%d_sn.tex",samname,f); print(fn,"-depslatex",sz); printf("\nprinting... 
%s\n", fn); -- cgit v1.2.3 From ce5e8ba7169c079de023c4ce6e5be2c705ea0132 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 19 Nov 2023 09:21:18 +1030 Subject: macro for sinusoid --- doc/codec2.pdf | Bin 121748 -> 120193 bytes doc/codec2.tex | 33 +++++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index e148998..77877b1 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index c9cd723..07b77ed 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -2,6 +2,7 @@ \usepackage{amsmath} \usepackage{hyperref} \usepackage{tikz} +\usetikzlibrary{calc} \usepackage{float} \usepackage{xstring} @@ -75,22 +76,34 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see \label{fig:sinusoidal_model} \begin{center} \begin{tikzpicture} + +\newcommand{\DrawSine}[4]{% x, y, x_scale, y_scale +\begin{tikzpicture} +%\draw plot [smooth] coordinates {(#1,#2) (0.5,0.707) (1,1) (1.5,0.707) (2,0) (2.5,-0.707) (3,-1) (3.5,-0.707) (4,0)}; + +\draw plot [smooth] coordinates {(#1-2*#3, #2 ) (#1-1.5*#3,#2+0.707*#4) + (#1-1*#3, #2+1*#4) (#1-0.5*#3,#2+0.707*#4) + (#1 ,#2+0) (#1+0.5*#3,#2-0.707*#4) + (#1+1*#3,#2-1*#4) (#1+1.5*#3,#2-0.707*#4) + (#1+2*#3,#2+0)}; +\end{tikzpicture} +} + +%%\draw plot [smooth] coordinates {(0,0) (0.5,0.707) (1,1) (1.5,0.707) (2,0) (2.5,-0.707) (3,-1) (3.5,-0.707) (4,0)}; + +\DrawSine {0}{1}{0.5}{0.5} + +\draw (0,2) circle (0.5); +\draw (0,0.5) circle (0.5); +\draw (0,-1.5) circle (0.5); + \draw (1,2) -- (2,2); \draw (2.5,0.75) -- (2.5,1.5); \draw (2.5,2) circle (0.5); \draw (2.25,2) -- (2.75,2); \draw (2.5,1.75) -- (2.5,2.25); \draw (3,2) -- (4,2); -\draw (4,1.5) rectangle (5,2.5); -\draw (5,2) -- (6,2) -- (6,1.5); -\draw (5.75,1.5) rectangle (6.25,0.5); -\draw (6,0.5) -- (6,0); -\draw (5.75,0) -- (6.25,0); - -\node[] at (0.5,2) {$P_{sun}$}; -\node[] at (2.5,0.5) {$P_{quiet}$}; -\node[] at (4.5,2) {$G$}; -\node[align=right] at (6.75,1) {Rx}; + \end{tikzpicture} \end{center} -- cgit v1.2.3 From def80d4bf628e55afc31ccd8b9648c0308323b80 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 19 Nov 2023 10:29:28 +1030 Subject: building up sinusoid figure --- doc/codec2.pdf | Bin 120193 -> 120512 bytes doc/codec2.tex | 35 ++++++++++++++++++----------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 77877b1..8aa38ce 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 07b77ed..5719601 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -2,7 +2,7 @@ \usepackage{amsmath} \usepackage{hyperref} \usepackage{tikz} -\usetikzlibrary{calc} +\usetikzlibrary{calc,arrows} \usepackage{float} \usepackage{xstring} @@ -75,34 +75,35 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see \caption{The Sinusoidal speech model. 
If we sum a series of sine waves, we can generate speech.} \label{fig:sinusoidal_model} \begin{center} -\begin{tikzpicture} +\begin{tikzpicture}[>=triangle 45,x=1.0cm,y=1.0cm] -\newcommand{\DrawSine}[4]{% x, y, x_scale, y_scale -\begin{tikzpicture} -%\draw plot [smooth] coordinates {(#1,#2) (0.5,0.707) (1,1) (1.5,0.707) (2,0) (2.5,-0.707) (3,-1) (3.5,-0.707) (4,0)}; +% draws little sine wave squiggle +\newcommand{\drawSine}[4]{% x, y, x_scale, y_scale \draw plot [smooth] coordinates {(#1-2*#3, #2 ) (#1-1.5*#3,#2+0.707*#4) (#1-1*#3, #2+1*#4) (#1-0.5*#3,#2+0.707*#4) (#1 ,#2+0) (#1+0.5*#3,#2-0.707*#4) (#1+1*#3,#2-1*#4) (#1+1.5*#3,#2-0.707*#4) - (#1+2*#3,#2+0)}; -\end{tikzpicture} + (#1+2*#3,#2+0)} } -%%\draw plot [smooth] coordinates {(0,0) (0.5,0.707) (1,1) (1.5,0.707) (2,0) (2.5,-0.707) (3,-1) (3.5,-0.707) (4,0)}; - -\DrawSine {0}{1}{0.5}{0.5} - -\draw (0,2) circle (0.5); -\draw (0,0.5) circle (0.5); -\draw (0,-1.5) circle (0.5); +% three sine wave sources +\draw (0, 2.0) circle (0.5); \drawSine{0}{ 2.0}{0.2}{0.2}; +\draw (0, 0.5) circle (0.5); \drawSine{0}{ 0.5}{0.2}{0.2}; +\draw (0,-1.5) circle (0.5); \drawSine{0}{-1.5}{0.2}{0.2}; -\draw (1,2) -- (2,2); -\draw (2.5,0.75) -- (2.5,1.5); +% summer \draw (2.5,2) circle (0.5); \draw (2.25,2) -- (2.75,2); \draw (2.5,1.75) -- (2.5,2.25); -\draw (3,2) -- (4,2); + +% connecting lines +\draw [->] (0.5,2) -- (2,2); +\draw [->] (0.45,0.7) -- (2.05,1.8); +\draw [->] (0.3,-1.1) -- (2.2,1.6); + +% output +\draw [->] (3,2) -- (4,2); \end{tikzpicture} -- cgit v1.2.3 From f778670d0c711c4d72d71adcd401997cb603f7c9 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 19 Nov 2023 19:11:10 +1030 Subject: sinusoidal figure OK --- doc/codec2.pdf | Bin 120512 -> 138290 bytes doc/codec2.tex | 15 ++++++++------- octave/plamp.m | 7 +++++++ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 8aa38ce..663cf83 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 5719601..ae01ae7 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -87,10 +87,11 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see (#1+2*#3,#2+0)} } -% three sine wave sources -\draw (0, 2.0) circle (0.5); \drawSine{0}{ 2.0}{0.2}{0.2}; -\draw (0, 0.5) circle (0.5); \drawSine{0}{ 0.5}{0.2}{0.2}; -\draw (0,-1.5) circle (0.5); \drawSine{0}{-1.5}{0.2}{0.2}; +% sine wave sources +\draw (0, 2.0) circle (0.5); \drawSine{0}{ 2.0}{0.2}{0.2}; \draw (-2.0,2.0) node {$A_1, F_0$ Hz}; +\draw (0, 0.5) circle (0.5); \drawSine{0}{ 0.5}{0.2}{0.2}; \draw (-2.0,0.5) node {$A_2, 2F_0$ Hz}; +\draw (0,-2.5) circle (0.5); \drawSine{0}{-2.5}{0.2}{0.2}; \draw (-2.0,-2.5) node {$A_L, LF_0$ Hz}; +\draw [dotted,thick] (0,0) -- (0,-2); % summer \draw (2.5,2) circle (0.5); @@ -100,11 +101,11 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see % connecting lines \draw [->] (0.5,2) -- (2,2); \draw [->] (0.45,0.7) -- (2.05,1.8); -\draw [->] (0.3,-1.1) -- (2.2,1.6); +\draw [->] (0.3,-2.1) -- (2.2,1.6); -% output +% output speec \draw [->] (3,2) -- (4,2); - +\draw [xshift=4.2cm,yshift=2cm,color=blue] plot[smooth] file {hts2a_37_sn.txt}; \end{tikzpicture} \end{center} diff --git a/octave/plamp.m b/octave/plamp.m index e32d102..6f0478f 100644 --- a/octave/plamp.m +++ b/octave/plamp.m @@ -92,6 +92,13 @@ function plamp(samname, f, epslatex=0) fn = sprintf("%s_%d_sn.tex",samname,f); print(fn,"-depslatex",sz); printf("\nprinting... 
%s\n", fn); + + % file of points to plot in sinusoidal model + fn = sprintf("%s_%d_sn.txt",samname,f); + t_length = 4; s_max = 2; s=s*s_max/max(abs(s)); + N = length(s); t = (0:N-1)*t_length/N; + s_save = [t' s']; size(s_save) + save("-ascii",fn,"s_save"); printf("printing... %s\n", fn); + figure(2); fn = sprintf("%s_%d_sw.tex",samname,f); print(fn,"-depslatex",sz); printf("printing... %s\n", fn); -- cgit v1.2.3 From 24d7b22e4f4086ef64b27048cbdb5bffc6ed5bd4 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 20 Nov 2023 06:58:16 +1030 Subject: parameter updates --- doc/codec2.pdf | Bin 138290 -> 139875 bytes doc/codec2.tex | 31 ++++++++++++++++++++++--------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 663cf83..b2ad5b3 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index ae01ae7..4d2c2ac 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -51,14 +51,12 @@ 

To explain how Codec 2 works, let's look at some speech. Figure \ref{fig:hts2a_time} shows a short 40ms segment of speech in the time and frequency domain. On the time plot we can see the waveform is changing slowly over time as the word is articulated. On the right hand side it also appears to repeat itself - one cycle looks very similar to the last. This cycle time is the "pitch period", which for this example is around $P=35$ samples. Given we are sampling at $F_s=8000$ Hz, the pitch period is $P/F_s=35/8000=0.0044$ seconds, or 4.4ms. 

-The pitch changes in time, and is generally higher for females and children, and lower for males. It only appears to be constant for a short snap shot (a few 10s of ms) in time. For human speech pitch can vary over a range of 50 Hz to 500 Hz. 

-Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. Turns out of the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency. +Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency $F_0$ is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. It turns out that if the signal repeats itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency $F_0$. 

-Note that each harmonic has it's own amplitude, that varies slowly up and down with frequency. The red line plots the amplitude of each harmonic. There is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. +Note that each harmonic has its own amplitude, which varies across frequency. The red line plots the amplitude of each harmonic. In this example there is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. 

\begin{figure}[H] -\caption{ A 40ms segment of the word "these" from a female speaker, sampled at 8kHz. Top is a plot again time, bottom (blue) is a plot against frequency. 
The waveform repeats itself every 4.3ms (230 Hz), this is the "pitch period" of this segment.} +\caption{ A 40ms segment from the word "these" from a female speaker, sampled at 8kHz. Top is a plot against time, bottom (blue) is a plot against frequency. The waveform repeats itself every 4.3ms ($F_0=230$ Hz); this is the "pitch period" of this segment.} \label{fig:hts2a_time} \begin{center} \input hts2a_37_sn.tex @@ -69,10 +67,10 @@ 

\subsection{Sinusoidal Speech Coding} 

-A sinewave will cause a spike or spectral line on a spectrum plot, so we can see each spike as a small sine wave generator. Each sine wave generator has it's own frequency (e.g. $230, 460, 690,...$ Hz), amplitude and phase. If we add all of the sine waves together we can produce the time domain signal at the top of Figure \ref{fig:hts2a_time}, and produce synthesised speech. This is called sinusoidal speech coding and is the ``model" at the heart of Codec 2. +A sinewave will cause a spike or spectral line on a spectrum plot, so we can see each spike as a small sine wave generator. Each sine wave generator has its own frequency, and these frequencies are all multiples of the fundamental pitch frequency (e.g. $230, 460, 690,...$ Hz). They will also have their own amplitude and phase. If we add all the sine waves together (Figure \ref{fig:sinusoidal_model}) we can produce reasonable quality synthesised speech. This is called sinusoidal speech coding and is the speech production ``model'' at the heart of Codec 2. 

\begin{figure}[h] -\caption{The Sinusoidal speech model. If we sum a series of sine waves, we can generate speech.} +\caption{The sinusoidal speech model. If we sum a series of sine waves, we can generate a speech signal. Each sinewave has it's own amplitude ($A_1,A_2,... A_L$), frequency, and phase (not shown). We assume the frequencies are multiples of the fundamental frequency $F_0$. $L$ is the total number of sinewaves.} \label{fig:sinusoidal_model} \begin{center} \begin{tikzpicture}[>=triangle 45,x=1.0cm,y=1.0cm] @@ -87,10 +87,11 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see (#1+2*#3,#2+0)} } 

-% three sine wave sources -\draw (0, 2.0) circle (0.5); \drawSine{0}{ 2.0}{0.2}{0.2}; -\draw (0, 0.5) circle (0.5); \drawSine{0}{ 0.5}{0.2}{0.2}; -\draw (0,-1.5) circle (0.5); \drawSine{0}{-1.5}{0.2}{0.2}; +% sine wave sources +\draw (0, 2.0) circle (0.5); \drawSine{0}{ 2.0}{0.2}{0.2}; \draw (-2.0,2.0) node {$A_1, F_0$ Hz}; +\draw (0, 0.5) circle (0.5); \drawSine{0}{ 0.5}{0.2}{0.2}; \draw (-2.0,0.5) node {$A_2, 2F_0$ Hz}; +\draw (0,-2.5) circle (0.5); \drawSine{0}{-2.5}{0.2}{0.2}; \draw (-2.0,-2.5) node {$A_L, LF_0$ Hz}; +\draw [dotted,thick] (0,0) -- (0,-2); 

% summer \draw (2.5,2) circle (0.5); @@ -100,11 +101,11 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see % connecting lines \draw [->] (0.5,2) -- (2,2); \draw [->] (0.45,0.7) -- (2.05,1.8); -\draw [->] (0.3,-1.1) -- (2.2,1.6); +\draw [->] (0.3,-2.1) -- (2.2,1.6); 

-% output speec +% output speech \draw [->] (3,2) -- (4,2); - +\draw [xshift=4.2cm,yshift=2cm,color=blue] plot[smooth] file {hts2a_37_sn.txt}; \end{tikzpicture} \end{center} \end{figure} 

-The model parameters evolve over time, but can generally be considered constant for short snap shots in time (a few 10s of ms). For example pitch evolves time, moving up or down as a word is articulated. +The model parameters evolve over time, but can generally be considered constant for short snap shots in time (a few 10s of ms). For example pitch evolves time, moving up or down as a word is articulated. 

+As the model parameters change over time, we need to keep updating them. This is known as the \emph{frame rate} of the codec, which can be expressed in terms of frequency (Hz) or time (ms). For sampling model parameters Codec 2 uses a frame rate of 10ms. For transmission over the channel we reduce this to 20-40ms, in order to lower the bit rate. The trade off with a lower frame rate is reduced speech quality. 

+The parameters of the sinusoidal model are: +\begin{enumerate} +\item Frequencies of each sine wave. As they are all harmonics of $F_0$ we can just send $F_0$ to the decoder, and it can reconstruct the frequency of each harmonic as $F_0,2F_0,3F_0,...,LF_0$. We used 5-7 bits/frame to represent the $F_0$ in Codec 2. +\item The spectral magnitudes, $A_1,A_2,...,A_L$. These are really important as they convey the information the ear needs to make the speech intelligible. Most of the bits are used for spectral magnitude information. Codec 2 uses between 20 and 36 bits/frame for spectral amplitude information. +\item A voicing model. 
Speech can be approximated into voiced speech (vowels) and unvoiced speech (like consonants), or some mixture of the two. The example in Figure \ref{fig:hts2a_time} above is for voiced speech. So we need some way to tell the decoder if the speech is voiced or unvoiced, this requires just a few bits/frame. +\end{enumerate} + +\subsection{Codec 2 Block Diagram} + \subsection{Bit Allocation} \section{Signal Processing Details} \label{sect:details} +\cite{griffin1988multiband} + \section{Further Work} \begin{enumerate} +\item Using c2sim to ectract and plot model parameters \item How to use tools to single step through codec operation \end{enumerate} -\cite{griffin1988multiband} + \bibliographystyle{plain} \bibliography{codec2_refs} -- cgit v1.2.3 From 3d9443facfb06de1ee7efaecf876ff225e2fd5c1 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 20 Nov 2023 08:33:17 +1030 Subject: building up encoder block diagram --- doc/codec2.pdf | Bin 139875 -> 140472 bytes doc/codec2.tex | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index b2ad5b3..c8332bf 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 4d2c2ac..d60d557 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -2,7 +2,7 @@ \usepackage{amsmath} \usepackage{hyperref} \usepackage{tikz} -\usetikzlibrary{calc,arrows} +\usetikzlibrary{calc,arrows,shapes,positioning} \usepackage{float} \usepackage{xstring} @@ -22,6 +22,19 @@ \author{David Rowe\\ \\ Revision: {\gitrevision} on branch: {\branch}} \begin{document} + +% Tikz code used to support block diagrams +% credit: https://tex.stackexchange.com/questions/175969/block-diagrams-using-tikz + +\tikzset{ +block/.style = {draw, fill=white, rectangle, minimum height=3em, minimum width=3em}, +tmp/.style = {coordinate}, +sum/.style= {draw, fill=white, circle, node distance=1cm}, +input/.style = {coordinate}, +output/.style= {coordinate}, +pinstyle/.style = {pin edge={to-,thin,black}} +} + \maketitle \section{Introduction} @@ -117,11 +130,31 @@ The parameters of the sinusoidal model are: \begin{enumerate} \item Frequencies of each sine wave. As they are all harmonics of $F_0$ we can just send $F_0$ to the decoder, and it can reconstruct the frequency of each harmonic as $F_0,2F_0,3F_0,...,LF_0$. We used 5-7 bits/frame to represent the $F_0$ in Codec 2. \item The spectral magnitudes, $A_1,A_2,...,A_L$. These are really important as they convey the information the ear needs to make the speech intelligible. Most of the bits are used for spectral magnitude information. Codec 2 uses between 20 and 36 bits/frame for spectral amplitude information. -\item A voicing model. Speech can be approximated into voiced speech (vowels) and unvoiced speech (like consonants), or some mixture of the two. The example in Figure \ref{fig:hts2a_time} above is for voiced speech. So we need some way to tell the decoder if the speech is voiced or unvoiced, this requires just a few bits/frame. +\item A voicing model. Speech can be approximated into voiced speech (vowels) and unvoiced speech (like consonants), or some mixture of the two. The example in Figure \ref{fig:hts2a_time} above is for voiced speech. So we need some way to describe voicing to the decoder. This requires just a few bits/frame. 
\end{enumerate} \subsection{Codec 2 Block Diagram} +\begin{figure}[h] +\caption{Codec 2 Encoder.} +\label{fig:codec2_encoder} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm] + +\node [input] (rinput) {}; +\node [input, right of=rinput,node distance=1cm] (z) {}; +\node [block, right of=z,node distance=2cm] (pitch_est) {Pitch Estimator}; +\node [block, below of=pitch_est] (fft) {FFT}; +\node [block, right of=fft,node distance=3cm] (est_Am) {Estimate $A_m$}; + +\draw [->] (rinput) -- node[left,text width=2cm] {Input Speech} (pitch_est); +\draw [->] (z) |- (fft); +\draw [->] (pitch_est) -| (est_Am); +\draw [->] (fft) -- (est_Am); + +\end{tikzpicture} +\end{center} +\end{figure} \subsection{Bit Allocation} @@ -133,7 +166,7 @@ The parameters of the sinusoidal model are: \section{Further Work} \begin{enumerate} -\item Using c2sim to ectract and plot model parameters +\item Using c2sim to extract and plot model parameters \item How to use tools to single step through codec operation \end{enumerate} -- cgit v1.2.3 From 1b311ba01b1e4274d501440abacedba41a93a626 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 20 Nov 2023 09:02:42 +1030 Subject: encoder block diagram --- doc/codec2.pdf | Bin 140472 -> 141380 bytes doc/codec2.tex | 28 +++++++++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index c8332bf..8220e85 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index d60d557..7fea6d2 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -128,9 +128,10 @@ As the model parameters change over time, we need to keep updating them. This i The parameters of the sinusoidal model are: \begin{enumerate} -\item Frequencies of each sine wave. As they are all harmonics of $F_0$ we can just send $F_0$ to the decoder, and it can reconstruct the frequency of each harmonic as $F_0,2F_0,3F_0,...,LF_0$. We used 5-7 bits/frame to represent the $F_0$ in Codec 2. -\item The spectral magnitudes, $A_1,A_2,...,A_L$. These are really important as they convey the information the ear needs to make the speech intelligible. Most of the bits are used for spectral magnitude information. Codec 2 uses between 20 and 36 bits/frame for spectral amplitude information. -\item A voicing model. Speech can be approximated into voiced speech (vowels) and unvoiced speech (like consonants), or some mixture of the two. The example in Figure \ref{fig:hts2a_time} above is for voiced speech. So we need some way to describe voicing to the decoder. This requires just a few bits/frame. +\item The frequency of each sine wave. As they are all harmonics of $F_0$ we can just send $F_0$ to the decoder, and it can reconstruct the frequency of each harmonic as $F_0,2F_0,3F_0,...,LF_0$. We used 5-7 bits/frame to represent the $F_0$ in Codec 2. +\item The magnitude of each sine wave, $A_1,A_2,...,A_L$. These ``spectral magnitudes" are really important as they convey the information the ear needs to understand speech. Most of the bits are used for spectral magnitude information. Codec 2 uses between 20 and 36 bits/frame for spectral amplitude information. +\item Voicing information. Speech can be approximated into voiced speech (vowels) and unvoiced speech (like consonants), or some mixture of the two. The example in Figure \ref{fig:hts2a_time} above is for voiced speech. So we need some way to describe voicing to the decoder. This requires just a few bits/frame. 
+\item The phase of each sine wave Codec 2 discards the phases of each harmonic and reconstruct them at the decoder using an algorithm, so no bits are required for phases. This results in some drop in speech quality. \end{enumerate} \subsection{Codec 2 Block Diagram} @@ -139,18 +140,27 @@ The parameters of the sinusoidal model are: \caption{Codec 2 Encoder.} \label{fig:codec2_encoder} \begin{center} -\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm] +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm,align=center,text width=2cm] \node [input] (rinput) {}; \node [input, right of=rinput,node distance=1cm] (z) {}; -\node [block, right of=z,node distance=2cm] (pitch_est) {Pitch Estimator}; +\node [block, right of=z] (pitch_est) {Pitch Estimator}; \node [block, below of=pitch_est] (fft) {FFT}; -\node [block, right of=fft,node distance=3cm] (est_Am) {Estimate $A_m$}; +\node [block, right of=fft,node distance=3cm] (est_am) {Estimate Amplitudes}; +\node [block, below of=est_am] (est_v) {Estimate Voicing}; +\node [block, right of=est_am,node distance=3cm] (quant) {Quantise}; +\node [output, right of=quant,node distance=2cm] (routput) {}; -\draw [->] (rinput) -- node[left,text width=2cm] {Input Speech} (pitch_est); +\draw [->] node[align=left] {Input Speech} (rinput) -- (pitch_est); \draw [->] (z) |- (fft); -\draw [->] (pitch_est) -| (est_Am); -\draw [->] (fft) -- (est_Am); +\draw [->] (pitch_est) -| (est_am); +\draw [->] (fft) -- (est_am); +\draw [->] (est_am) -- (est_v); +\draw [->] (pitch_est) -| (quant); +\draw [->] (est_am) -- (quant); +\draw [->] (est_v) -| (quant); +\draw [->] (est_v) -| (quant); +\draw [->] (quant) -- (routput) node[right, align=left, text width=1.5cm] {Bit Stream}; \end{tikzpicture} \end{center} -- cgit v1.2.3 From 4d2492dcd994fc9465e0ef072976d70ca3a0f155 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Thu, 23 Nov 2023 07:04:45 +1030 Subject: building up detailed design intro --- doc/codec2.pdf | Bin 141380 -> 157596 bytes doc/codec2.tex | 137 +++++++++++++++++++++++++++++++++++++++++++++------- doc/codec2_refs.bib | 10 ++++ 3 files changed, 129 insertions(+), 18 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 8220e85..fba5127 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 7fea6d2..6bb67a0 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -43,7 +43,7 @@ Codec 2 is an open source speech codec designed for communications quality speec The Codec 2 project was started in 2009 in response to the problem of closed source, patented, proprietary voice codecs in the sub-5 kbit/s range, in particular for use in the Amateur Radio service. -This document describes Codec 2 at two levels. Section \ref{sect:overview} is a high level overview aimed at the Radio Amateur, while Section \ref{sect:details} contains a more detailed description with math and signal processing theory. This document is not a concise algorithmic description, instead the algorithm is defined by the reference C99 source code and automated tests (ctests). +This document describes Codec 2 at two levels. Section \ref{sect:overview} is a high level overview aimed at the Radio Amateur, while Section \ref{sect:details} contains a more detailed description with math and signal processing theory. 
Combined with the C source code, it is intended to give the reader enough information to understand the operation of Codec 2 in detail and embark on source code level projects, such as improvements, ports to other languages, and student or academic research projects. Issues with the current algorithms and topics for further work are also included. The production of this document was kindly supported by an ARDC grant \cite{ardc2023}. As an open source project, many people have contributed to Codec 2 over the years - we deeply appreciate all of your support. @@ -52,7 +52,7 @@ 

\subsection{Model Based Speech Coding} 

-A speech codec takes speech samples from an A/D converter (e.g. 16 bit samples at an 8 kHz or 128 kbits/s) and compresses them down to a low bit rate that can be more easily sent over a narrow bandwidth channel (e.g. 700 bits/s for HF). Speech coding is the art of "what can we throw away". We need to lower the bit rate of the speech while retaining speech you can understand, and making it sound as natural as possible. +A speech codec takes speech samples from an A/D converter (e.g. 16 bit samples at 8 kHz or 128 kbits/s) and compresses them down to a low bit rate that can be more easily sent over a narrow bandwidth channel (e.g. 700 bits/s for HF). Speech coding is the art of "what can we throw away". We need to lower the bit rate of the speech while retaining speech you can understand, and making it sound as natural as possible. 

As such low bit rates we use a speech production ``model". The input speech is analysed, and we extract model parameters, which are then sent over the channel. An example of a model based parameter is the pitch of the person speaking. We estimate the pitch of the speaker, quantise it to a 7 bit number, and send that over the channel every 20ms. @@ -75,22 +76,34 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see \caption{The sinusoidal speech model. If we sum a series of sine waves, we can generate a speech signal. Each sinewave has it's own amplitude ($A_1,A_2,... A_L$), frequency, and phase (not shown). We assume the frequencies are multiples of the fundamental frequency $F_0$. 
$L$ is the total number of sinewaves.} +\caption{The sinusoidal speech model. If we sum a series of sine waves, we can generate a speech signal. Each sinewave has its own amplitude ($A_1,A_2,... A_L$), frequency, and phase (not shown). We assume the frequencies are multiples of the fundamental frequency $F_0$. $L$ is the total number of sinewaves we can fit in 4kHz.} \label{fig:sinusoidal_model} \begin{center} \begin{tikzpicture}[>=triangle 45,x=1.0cm,y=1.0cm] @@ -114,7 +114,7 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see \draw [->] (0.45,0.7) -- (2.05,1.8); \draw [->] (0.3,-2.1) -- (2.2,1.6); 

-% output speec +% output speech \draw [->] (3,2) -- (4,2); \draw [xshift=4.2cm,yshift=2cm,color=blue] plot[smooth] file {hts2a_37_sn.txt}; 

\end{tikzpicture} \end{center} \end{figure} 

-The model parameters evolve over time, but can generally be considered constant for short snap shots in time (a few 10s of ms). For example pitch evolves time, moving up or down as a word is articulated. +The model parameters evolve over time, but can generally be considered constant for a short time window (a few 10s of ms). For example pitch evolves over time, moving up or down as a word is articulated. 

As the model parameters change over time, we need to keep updating them. This is known as the \emph{frame rate} of the codec, which can be expressed in terms of frequency (Hz) or time (ms). For sampling model parameters Codec 2 uses a frame rate of 10ms. For transmission over the channel we reduce this to 20-40ms, in order to lower the bit rate. The trade off with a lower frame rate is reduced speech quality. 

The parameters of the sinusoidal model are: \begin{enumerate} -\item The frequency of each sine wave. As they are all harmonics of $F_0$ we can just send $F_0$ to the decoder, and it can reconstruct the frequency of each harmonic as $F_0,2F_0,3F_0,...,LF_0$. We used 5-7 bits/frame to represent the $F_0$ in Codec 2. -\item The magnitude of each sine wave, $A_1,A_2,...,A_L$. These ``spectral magnitudes" are really important as they convey the information the ear needs to understand speech. Most of the bits are used for spectral magnitude information. Codec 2 uses between 20 and 36 bits/frame for spectral amplitude information. -\item Voicing information. Speech can be approximated into voiced speech (vowels) and unvoiced speech (like consonants), or some mixture of the two. The example in Figure \ref{fig:hts2a_time} above is for voiced speech. So we need some way to describe voicing to the decoder. This requires just a few bits/frame. +\item The frequency of each sine wave. As they are all harmonics of $F_0$ we can just send $F_0$ to the decoder, and it can reconstruct the frequency of each harmonic as $F_0,2F_0,3F_0,...,LF_0$. We use 5-7 bits/frame to represent $F_0$ in Codec 2. +\item The amplitude of each sine wave, $A_1,A_2,...,A_L$. These ``spectral amplitudes'' are really important as they convey the information the ear needs to understand speech. Most of the bits are used for spectral amplitude information. Codec 2 uses between 18 and 50 bits/frame for spectral amplitude information. +\item Voicing information. Speech can be approximated into voiced speech (vowels) and unvoiced speech (like consonants), or some mixture of the two. The example in Figure \ref{fig:hts2a_time} above is voiced speech. So we need some way to describe voicing to the decoder. This requires just a few bits/frame. +\item The phase of each sine wave. 
Codec 2 discards the phases of each harmonic at the encoder and reconstructs them at the decoder using an algorithm, so no bits are required for phases. This results in some drop in speech quality. \end{enumerate} 

-\subsection{Codec 2 Block Diagram} +\subsection{Codec 2 Encoder and Decoder} 

+This section explains how the Codec 2 encoder and decoder work using block diagrams. 

\begin{figure}[h] -\caption{Codec 2 Encoder.} +\caption{Codec 2 Encoder} \label{fig:codec2_encoder} \begin{center} -\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm] +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm,align=center,text width=2cm] 

\node [input] (rinput) {}; -\node [input, right of=rinput,node distance=1cm] (z) {}; -\node [block, right of=z,node distance=2cm] (pitch_est) {Pitch Estimator}; +\node [input, right of=rinput,node distance=0.5cm] (z) {}; +\node [block, right of=z,node distance=1.5cm] (pitch_est) {Pitch Estimator}; \node [block, below of=pitch_est] (fft) {FFT}; \node [block, right of=fft,node distance=3cm] (est_am) {Estimate Amplitudes}; \node [block, below of=est_am] (est_v) {Estimate Voicing}; -\node [block, right of=est_am,node distance=3cm] (quant) {Quantise}; +\node [block, right of=est_am,node distance=3cm] (quant) {Decimate Quantise}; \node [output, right of=quant,node distance=2cm] (routput) {}; 

-\draw [->] (rinput) -- node[left,text width=2cm] {Input Speech} (pitch_est); +\draw [->] node[align=left] {Input Speech} (rinput) -- (pitch_est); \draw [->] (z) |- (fft); \draw [->] (pitch_est) -| (est_am); \draw [->] (fft) -- (est_am); \draw [->] (est_am) -- (est_v); \draw [->] (pitch_est) -| (quant); \draw [->] (est_am) -- (quant); \draw [->] (est_v) -| (quant); \draw [->] (est_v) -| (quant); \draw [->] (quant) -- (routput) node[right, align=left, text width=1.5cm] {Bit Stream}; 

\end{tikzpicture} \end{center} \end{figure} 

+The encoder is presented in Figure \ref{fig:codec2_encoder}. Frames of input speech samples are passed to a Fast Fourier Transform (FFT), which converts the time domain samples to the frequency domain. The same frame of input samples is used to estimate the pitch of the current frame. We then use the pitch and frequency domain speech to estimate the amplitude of each sine wave. 

+Yet another algorithm is used to determine if the frame is voiced or unvoiced. This works by comparing the spectrum to what we would expect for voiced speech (e.g. lots of spectral lines). If the energy is more random and continuous rather than discrete lines, we consider it unvoiced. 

+Up until this point the processing happens at a 10ms frame rate. However in the next step we ``decimate'' the model parameters - this means we discard some of the model parameters to lower the frame rate, which helps us lower the bit rate. Decimating to 20ms (throwing away every 2nd set of model parameters) doesn't have much effect, but beyond that the speech quality starts to degrade. So there is a trade off between decimation rate and bit rate over the channel. 

+Once we have the desired frame rate, we ``quantise'' each model parameter. This means we use a fixed number of bits to represent it, so we can send the bits over the channel. Parameters like pitch and voicing are fairly easy, but quite a bit of DSP goes into quantising the spectral amplitudes. For the higher bit rate Codec 2 modes, we design a filter that matches the spectral amplitudes, then send a quantised version of the filter over the channel. Using the example in Figure \ref{fig:hts2a_time} - the filter would have band pass peaks at 500 and 2300 Hz. Its frequency response would follow the red line. The filter is time varying - we redesign it for every frame. 
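As a rough sketch of the decimation step (and the matching interpolation the decoder performs), the Octave fragment below drops every second 10ms pitch estimate to give a 20ms update rate, then rebuilds the missing frames by averaging their neighbours. This illustrates the principle only; the real Codec 2 code also has to decimate and interpolate the energy, voicing and spectral amplitude parameters, and the example values are invented for illustration:

\begin{verbatim}
% Illustration only: decimate a track of 10ms pitch estimates to a 20ms
% update rate, then interpolate back up to 10ms as a decoder would.
F0_10ms = [220 225 230 228 226 224];        % six 10ms frames (Hz)

% encoder side: keep every 2nd frame (20ms frame rate)
F0_tx = F0_10ms(1:2:end);

% decoder side: recreate the missing 10ms frames by linear interpolation
F0_rx = zeros(1, 2*length(F0_tx)-1);
F0_rx(1:2:end) = F0_tx;
F0_rx(2:2:end) = (F0_tx(1:end-1) + F0_tx(2:end))/2;  % average of neighbours
\end{verbatim}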
+ +You'll notice the term "estimate" being used a lot. One of the problems with model based speech coding is the algorithms we use to extract the model parameters are not perfect. Occasionally the algorithms get it wrong. Look at the red crosses on the bottom plot of Figure \ref{fig:hts2a_time}. These mark the amplitude estimate of each harmonic. If you look carefully, you'll see that above 2000 Hz, the crosses fall a little short of the exact centre of each harmonic. This is an example of a ``fine'' pitch estimator error, where the estimate is a little off the correct value. 

+Often the errors interact, for example the fine pitch error shown above will mean the amplitude estimates are a little bit off as well. Fortunately these errors tend to be temporary, and are sometimes not even noticeable to the listener - remember this codec is often used for HF/VHF radio where channel noise is part of the normal experience. 

+\begin{figure}[h] +\caption{Codec 2 Decoder} +\label{fig:codec2_decoder} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm,align=center,text width=2cm] + +\node [input] (rinput) {}; +\node [block, right of=rinput,node distance=2cm] (dequantise) {Dequantise Interpolate}; +\node [block, right of=dequantise,node distance=3cm] (recover) {Recover Amplitudes}; +\node [block, right of=recover,node distance=3cm] (synthesise) {Synthesise Speech}; +\node [block, above of=synthesise] (phase) {Synthesise Phases}; +\node [output, right of=synthesise,node distance=2cm] (routput) {}; + +\draw [->] node[align=left, text width=1.5cm] {Bit Stream} (rinput) -- (dequantise); +\draw [->] (dequantise) -- (recover); +\draw [->] (recover) -- (synthesise); +\draw [->] (recover) |- (phase); +\draw [->] (phase) -- (synthesise); +\draw [->] (synthesise) -- (routput) node[right, align=left, text width=1.5cm] {Output Speech}; + +\end{tikzpicture} +\end{center} +\end{figure} 

+Figure \ref{fig:codec2_decoder} shows the operation of the Codec 2 decoder. We take the sequence of bits received from the channel and recover the quantised model parameters: pitch, spectral amplitudes, and voicing. We then resample the model parameters back up to the 10ms frame rate using a technique called interpolation. For example, say we receive an $F_0=200$ Hz pitch value, then 20ms later $F_0=220$ Hz. We can use the average $F_0=210$ Hz for the middle 10ms frame. 

+The phases of each harmonic are generated using the other model parameters and some DSP. It turns out that if you know the amplitude spectrum, you can determine a ``reasonable'' phase spectrum using some DSP operations, which in practice is implemented with a couple of FFTs. We also use the voicing information - for unvoiced speech we use random phases (a good way to synthesise noise-like signals) - and for voiced speech we make sure the phases are chosen so the synthesised speech transitions smoothly from one frame to the next. 

+Frames of speech are synthesised using an inverse FFT. We take a blank array of FFT samples, and at intervals of $F_0$ insert samples with the amplitude and phase for each harmonic. We then inverse FFT to create a frame of time domain samples. These frames of synthesised speech samples are carefully aligned with the previous frame to ensure smooth frame-frame transitions, and output to the listener. 

\subsection{Bit Allocation} 

+Table \ref{tab:bit_allocation} presents the bit allocation for two popular Codec 2 modes. 
One additional parameter is the frame energy, which is the average level of the spectral amplitudes, or ``AF gain'' of the speech frame. 

+At very low bit rates such as 700C, we use Vector Quantisation (VQ) to represent the spectral amplitudes. We construct a table such that each row of the table has a set of spectral amplitude samples. In Codec 2 700C the table has 512 rows. During the quantisation process, we choose the table row that best matches the spectral amplitudes for this frame, then send the \emph{index} of the table row. The decoder has a similar table, so it can use the index to look up the output values. If the table is 512 rows, we can use a 9 bit number to quantise the spectral amplitudes. In Codec 2 700C, we use two tables of 512 entries each (18 bits total); the second one helps fine tune the quantisation from the first table. 

+Vector Quantisation can only represent what is present in the tables, so if it sees anything unusual (for example a different microphone frequency response or background noise), the quantisation can become very rough and speech quality poor. We train the tables at design time using a database of speech samples and a training algorithm - an early form of machine learning. 

+Codec 2 3200 uses the method of fitting a filter to the spectral amplitudes; this approach tends to be more forgiving of small variations in the input speech spectrum, but is not as efficient in terms of bit rate. 

+\begin{table}[H] +\centering +\begin{tabular}{l c c } +\hline +Parameter & 3200 & 700C \\ +\hline +Pitch $F_0$ & 7 & 5 \\ +Spectral Amplitudes $\{A_m\}$ & 50 & 18 \\ +Energy & 5 & 3 \\ +Voicing & 2 & 1 \\ +Bits/frame & 64 & 28 \\ +Frame Rate & 20ms & 40ms \\ +Bit rate & 3200 & 700 \\ +\hline +\end{tabular} +\caption{Bit allocation of the 3200 and 700C modes} +\label{tab:bit_allocation} +\end{table} 

\section{Detailed Design} \label{sect:details} 

+Codec 2 is based on sinusoidal \cite{mcaulay1986speech} and Multi-Band Excitation (MBE) \cite{griffin1988multiband} vocoders that were first developed in the late 1980s. Descendants of the MBE vocoders (IMBE, AMBE etc) have enjoyed widespread use in many applications such as VHF/UHF hand held radios and satellite communications. In the 1990s the author studied sinusoidal speech coding \cite{rowe1997techniques}, which provided the skill set and a practical, patent free baseline for starting the Codec 2 project. 

+Some features of Codec 2: +\begin{enumerate} +\item A range of modes supporting different bit rates, currently (Nov 2023): 3200, 2400, 1600, 1400, 1300, 1200, 700 bits/s. These are referred to as ``Codec 2 3200'', ``Codec 2 700C'' etc. +\item Modest CPU (a few 10s of MIPs) and memory (a few 10s of kbytes of RAM) requirements such that it can run on stm32 class microcontrollers with hardware FPU. +\item An open source reference implementation in the C language for C99/gcc compilers, and a \emph{cmake} build and test framework that runs on Linux. Also included is a cross compiled stm32 reference implementation. +\item Ports to non-C99 compilers (e.g. MSVC, some microcontrollers, native builds on Windows) are left to third party developers - we recommend the tests also be ported and pass before considering the port successful. +\item Codec 2 has been designed for digital voice over radio applications, and retains intelligible speech at a few percent bit error rate. +\item A suite of automated tests used to verify the implementation. 
+\item A pitch estimator based on a 2nd order non-linearity developed by the author. +\item A single voiced/unvoiced binary voicing model. +\item A frequency domain IFFT/overlap-add synthesis model for voiced and unvoiced speech speech. +\item For the higher bit rate modes, spectral amplitudes are represented using LPCs extracted from time domain analysis and scalar LSP quantisation. +\item For Codec 2 700C, vector quantisation of resampled spectral amplitudes in the log domain. +\item Minimal interframe prediction in order to minimise error propagation and maximise robustness to channel errors. +\item A post filter that enhances the speech quality of the baseline codec, especially for low pitched (male) speakers. +\end{enumerate} + +\subsection{Non-Linear Pitch Estimation} + +The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is described in detail in chapter 4 of \cite{rowe1997techniques}. There is nothing particularly unique about this pitch estimator or it's performance. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements. This section presents an overview of the NLP algorithm extracted from \cite{rowe1997techniques}. + + +\subsection{Sinusoidal Analysis and Synthesis} + +\subsection{LPC/LSP based modes} + +\subsection{Codec 2 700C} \section{Further Work} \begin{enumerate} -\item Using c2sim to extract and plot model parameters +\item some examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters \item How to use tools to single step through codec operation +\item table summarising source files with one line description +\item Add doc license (Creative Commons?) \end{enumerate} diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index e578439..7348902 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -22,3 +22,13 @@ note = {\url{https://www.ardc.net/apply/grants/2023-grants/enhancing-hf-digital-voice-with-freedv/}} } +@article{mcaulay1986speech, + title={Speech analysis/synthesis based on a sinusoidal representation}, + author={McAulay, Robert and Quatieri, Thomas}, + journal={IEEE Transactions on Acoustics, Speech, and Signal Processing}, + volume={34}, + number={4}, + pages={744--754}, + year={1986}, + publisher={IEEE} +} -- cgit v1.2.3 From 3dca356c4328e677d2f61c6d1c0e13a06fc8c9eb Mon Sep 17 00:00:00 2001 From: David Date: Thu, 23 Nov 2023 08:30:00 +1030 Subject: building up NLP figure --- doc/codec2.tex | 70 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/doc/codec2.tex b/doc/codec2.tex index 6bb67a0..dfe9dd3 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -29,12 +29,37 @@ \tikzset{ block/.style = {draw, fill=white, rectangle, minimum height=3em, minimum width=3em}, tmp/.style = {coordinate}, -sum/.style= {draw, fill=white, circle, node distance=1cm}, +sum/.style= {draw, fill=white, circle, node distance=1cm, minimum size=0.75cm}, +mult/.style= {draw, fill=white, circle, node distance=1cm, minimum size=0.75cm}, input/.style = {coordinate}, output/.style= {coordinate}, pinstyle/.style = {pin edge={to-,thin,black}} } +% tikz: draws a sine wave +\newcommand{\drawSine}[4]{% x, y, x_scale, y_scale + +\draw plot [smooth] coordinates {(#1-2*#3, #2 ) (#1-1.5*#3,#2+0.707*#4) + (#1-1*#3, #2+1*#4) (#1-0.5*#3,#2+0.707*#4) + (#1 ,#2+0) (#1+0.5*#3,#2-0.707*#4) + (#1+1*#3,#2-1*#4) (#1+1.5*#3,#2-0.707*#4) + (#1+2*#3,#2+0)} +} + +% tikz: draw 
a summer +\newcommand{\drawSummer}[2]{% x, y + \draw (#1,#2) circle (0.5); + \draw (#1-0.25,#2) -- (#1+0.25,#2); + \draw (#1,#2-0.25) -- (#1,#2+0.25); +} + +% tikz: draw a multiplier +\newcommand{\drawMultiplier}[2]{% x, y + \draw (#1,#2) circle (0.5); + \draw (#1-0.25,#2-0.25) -- (#1+0.25,#2+0.25); + \draw (#1-0.25,#2+0.25) -- (#1+0.25,#2-0.25); +} + \maketitle \section{Introduction} @@ -88,26 +113,13 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see \begin{center} \begin{tikzpicture}[>=triangle 45,x=1.0cm,y=1.0cm] -% draws little sine wave squiggle -\newcommand{\drawSine}[4]{% x, y, x_scale, y_scale - -\draw plot [smooth] coordinates {(#1-2*#3, #2 ) (#1-1.5*#3,#2+0.707*#4) - (#1-1*#3, #2+1*#4) (#1-0.5*#3,#2+0.707*#4) - (#1 ,#2+0) (#1+0.5*#3,#2-0.707*#4) - (#1+1*#3,#2-1*#4) (#1+1.5*#3,#2-0.707*#4) - (#1+2*#3,#2+0)} -} - % sine wave sources \draw (0, 2.0) circle (0.5); \drawSine{0}{ 2.0}{0.2}{0.2}; \draw (-2.0,2.0) node {$A_1, F_0$ Hz}; \draw (0, 0.5) circle (0.5); \drawSine{0}{ 0.5}{0.2}{0.2}; \draw (-2.0,0.5) node {$A_2, 2F_0$ Hz}; \draw (0,-2.5) circle (0.5); \drawSine{0}{-2.5}{0.2}{0.2}; \draw (-2.0,-2.5) node {$A_L, LF_0$ Hz}; \draw [dotted,thick] (0,0) -- (0,-2); -% summer -\draw (2.5,2) circle (0.5); -\draw (2.25,2) -- (2.75,2); -\draw (2.5,1.75) -- (2.5,2.25); +\drawSummer{2.5}{2}; % connecting lines \draw [->] (0.5,2) -- (2,2); @@ -265,6 +277,34 @@ Some features of Codec 2: The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is described in detail in chapter 4 of \cite{rowe1997techniques}. There is nothing particularly unique about this pitch estimator or it's performance. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements. This section presents an overview of the NLP algorithm extracted from \cite{rowe1997techniques}. 
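
To make the processing steps in Figure \ref{fig:nlp} concrete, the following is a minimal C sketch of square-law pitch estimation. It is an illustration only, not the \emph{nlp.c} implementation: the function name is invented, DC is removed by subtracting the mean rather than with a notch filter, a crude moving average stands in for the decimation filter, the DFT is evaluated directly rather than with an FFT, and the sub-multiple search and refinement stages are omitted.

\begin{verbatim}
#include <math.h>
#include <stddef.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#define FS     8000.0  /* sample rate, Hz           */
#define DEC    5       /* decimation factor         */
#define NDFT   512     /* zero padded DFT size      */
#define F0_MIN 50.0    /* pitch search range, Hz    */
#define F0_MAX 400.0

/* Coarse square-law pitch estimate over one block of n samples (C99). */
double pitch_estimate(const short speech[], size_t n)
{
    double sq[n], dec[n / DEC];
    size_t ndec = n / DEC;

    /* square law non-linearity regenerates energy at F0 */
    for (size_t i = 0; i < n; i++)
        sq[i] = (double)speech[i] * (double)speech[i];

    /* remove the large DC term created by squaring */
    double mean = 0.0;
    for (size_t i = 0; i < n; i++) mean += sq[i];
    mean /= n;
    for (size_t i = 0; i < n; i++) sq[i] -= mean;

    /* crude low pass filter and decimate by 5 */
    for (size_t i = 0; i < ndec; i++) {
        dec[i] = 0.0;
        for (size_t j = 0; j < DEC; j++) dec[i] += sq[i * DEC + j];
        dec[i] /= DEC;
    }

    /* peak pick the DFT power spectrum over the F0 search range */
    double fs_dec = FS / DEC, best_mag = -1.0, best_f0 = F0_MIN;
    int k_min = (int)(F0_MIN * NDFT / fs_dec);
    int k_max = (int)(F0_MAX * NDFT / fs_dec);
    for (int k = k_min; k <= k_max; k++) {
        double re = 0.0, im = 0.0;
        for (size_t i = 0; i < ndec; i++) {
            double phi = 2.0 * M_PI * k * (double)i / NDFT;
            re += dec[i] * cos(phi);
            im -= dec[i] * sin(phi);
        }
        double mag = re * re + im * im;
        if (mag > best_mag) { best_mag = mag; best_f0 = k * fs_dec / NDFT; }
    }
    return best_f0; /* coarse F0 candidate, Hz */
}
\end{verbatim}

Called on 40 ms (320 sample) blocks of speech this returns a coarse $F_0$ candidate; the real estimator then checks sub-multiples of the strongest peak and refines the result, as described later in this document.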
+\begin{figure}[h] +\caption{The Non-Linear Pitch (NLP) algorithm} +\label{fig:nlp} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm] + +\node [input] (rinput) {}; +\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; +\node [tmp, below of=z,node distance=1cm] (z1) {}; +\node [mult, right of=z,node distance=1.5cm] (mult1) {}; +\node [block, right of=mult1,node distance=2cm] (lpf) {Low Pass}; +\node [block, right of=lpf,node distance=3cm] (dec5) {5}; +\node [block, below of=dec5] (dft) {DFT}; +\node [block, below of=lpf] (peak) {Peak Pick}; +\node [output, left of=peak,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {Input Speech} (rinput) -- (mult1); +%\draw (z) -- (z1) +\draw [->] (z1) -| (mult1); +\draw [->] (mult1) -- (lpf); +\draw [->] (lpf) -- (dec5); +\draw [->] (dec5) -- (dft); +\draw [->] (dft) -- (peak); +\draw [->] (peak) -- (routput) node[left, align=center] {Pitch\\Candidates}; + +\end{tikzpicture} +\end{center} +\end{figure} \subsection{Sinusoidal Analysis and Synthesis} -- cgit v1.2.3 From 70bf39eb015a7bd59e4aaaff23fc19f60b33caeb Mon Sep 17 00:00:00 2001 From: David Date: Thu, 23 Nov 2023 18:48:11 +1030 Subject: inserted DC notch into NLP --- doc/codec2.pdf | Bin 157596 -> 168242 bytes doc/codec2.tex | 27 ++++++++++++++------------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index fba5127..35aace6 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index dfe9dd3..bfd8698 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -4,7 +4,6 @@ \usepackage{tikz} \usetikzlibrary{calc,arrows,shapes,positioning} \usepackage{float} - \usepackage{xstring} \usepackage{catchfile} @@ -29,8 +28,7 @@ \tikzset{ block/.style = {draw, fill=white, rectangle, minimum height=3em, minimum width=3em}, tmp/.style = {coordinate}, -sum/.style= {draw, fill=white, circle, node distance=1cm, minimum size=0.75cm}, -mult/.style= {draw, fill=white, circle, node distance=1cm, minimum size=0.75cm}, +circ/.style= {draw, fill=white, circle, node distance=1cm, minimum size=0.6cm}, input/.style = {coordinate}, output/.style= {coordinate}, pinstyle/.style = {pin edge={to-,thin,black}} @@ -68,7 +66,7 @@ Codec 2 is an open source speech codec designed for communications quality speec The Codec 2 project was started in 2009 in response to the problem of closed source, patented, proprietary voice codecs in the sub-5 kbit/s range, in particular for use in the Amateur Radio service. -This document describes Codec 2 at two levels. Section \ref{sect:overview} is a high level overview aimed at the Radio Amateur, while Section \ref{sect:details} contains a more detailed description with math and signal processing theory. Combined with the C source code, it is intended to give the reader enough information to understand the operation of Codec 2 in detail and embark on source code level projects, such as improvements, ports to other languages, student or academic research projects. Issues with the current algorithms and topics for further work are also included. +This document describes Codec 2 at two levels. Section \ref{sect:overview} is a high level description aimed at the Radio Amateur, while Section \ref{sect:details} contains a more detailed description with math and signal processing theory. 
Combined with the C source code, it is intended to give the reader enough information to understand the operation of Codec 2 in detail and embark on source code level projects, such as improvements, ports to other languages, student or academic research projects. Issues with the current algorithms and topics for further work are also included. This production of this document was kindly supported by an ARDC grant \cite{ardc2023}. As an open source project, many people have contributed to Codec 2 over the years - we deeply appreciate all of your support. @@ -254,6 +252,8 @@ Bit rate & 3200 & 700 \\ \section{Detailed Design} \label{sect:details} +\subsection{Overview} + Codec 2 is based on sinusoidal \cite{mcaulay1986speech} and Multi-Band Excitation (MBE) \cite{griffin1988multiband} vocoders that were first developed in the late 1980s. Descendants of the MBE vocoders (IMBE, AMBE etc) have enjoyed widespread use in many applications such as VHF/UHF hand held radios and satellite communications. In the 1990s the author studied sinusoidal speech coding \cite{rowe1997techniques}, which provided the skill set and a practical, patent free baseline for starting the Codec 2 project: Some features of Codec 2: @@ -275,28 +275,29 @@ Some features of Codec 2: \subsection{Non-Linear Pitch Estimation} -The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is described in detail in chapter 4 of \cite{rowe1997techniques}. There is nothing particularly unique about this pitch estimator or it's performance. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements. This section presents an overview of the NLP algorithm extracted from \cite{rowe1997techniques}. +The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is described in detail in chapter 4 of \cite{rowe1997techniques}. There is nothing particularly unique about this pitch estimator or it's performance. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements. 
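
The DC notch filter block added to Figure \ref{fig:nlp} can be realised as a single pole/zero recursion. The sketch below assumes the transfer function $H(z)=(1-z^{-1})/(1-0.95z^{-1})$ quoted for the notch filter later in this document; the function name and buffer layout are illustrative rather than copied from \emph{nlp.c}.

\begin{verbatim}
/* First order DC notch filter, H(z) = (1 - z^-1)/(1 - 0.95 z^-1).
   mem[0] holds the previous input sample, mem[1] the previous output. */
void dc_notch(float out[], const float in[], int n, float mem[2])
{
    for (int i = 0; i < n; i++) {
        out[i] = in[i] - mem[0] + 0.95f * mem[1];
        mem[0] = in[i];
        mem[1] = out[i];
    }
}
\end{verbatim}

The zero at DC removes the large constant term produced by squaring the speech, while the pole just inside the unit circle keeps the notch narrow, so low frequency fundamentals from male speakers are not attenuated along with it.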
\begin{figure}[h] \caption{The Non-Linear Pitch (NLP) algorithm} \label{fig:nlp} \begin{center} -\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm] +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] \node [input] (rinput) {}; \node [tmp, right of=rinput,node distance=0.5cm] (z) {}; \node [tmp, below of=z,node distance=1cm] (z1) {}; -\node [mult, right of=z,node distance=1.5cm] (mult1) {}; -\node [block, right of=mult1,node distance=2cm] (lpf) {Low Pass}; -\node [block, right of=lpf,node distance=3cm] (dec5) {5}; +\node [circ, right of=z,node distance=1cm] (mult) {$\times$}; +\node [block, right of=mult,node distance=2cm,text width=2cm] (notch) {DC Notch Filter}; +\node [block, right of=notch,node distance=3cm,text width=2cm] (lpf) {Low Pass Filter}; +\node [block, right of=lpf,node distance=2.5cm] (dec5) {$\downarrow 5$}; \node [block, below of=dec5] (dft) {DFT}; \node [block, below of=lpf] (peak) {Peak Pick}; \node [output, left of=peak,node distance=2cm] (routput) {}; -\draw [->] node[align=left,text width=2cm] {Input Speech} (rinput) -- (mult1); -%\draw (z) -- (z1) -\draw [->] (z1) -| (mult1); -\draw [->] (mult1) -- (lpf); +\draw [->] node[align=left,text width=2cm] {Input Speech} (rinput) -- (mult); +\draw [->] (z) -- (z1) -| (mult); +\draw [->] (mult) -- (notch); +\draw [->] (notch) -- (lpf); \draw [->] (lpf) -- (dec5); \draw [->] (dec5) -- (dft); \draw [->] (dft) -- (peak); -- cgit v1.2.3 From 04ebf698b15cd7dc09870606729185766ded1099 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Thu, 23 Nov 2023 18:57:48 +1030 Subject: Mooneer's suggestions - thanks --- doc/codec2.pdf | Bin 168242 -> 159985 bytes doc/codec2.tex | 10 +++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 35aace6..4ca8f32 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index bfd8698..22fa749 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -75,7 +75,7 @@ This production of this document was kindly supported by an ARDC grant \cite{ard \subsection{Model Based Speech Coding} -A speech codec takes speech samples from an A/D converter (e.g. 16 bit samples at 8 kHz or 128 kbits/s) and compresses them down to a low bit rate that can be more easily sent over a narrow bandwidth channel (e.g. 700 bits/s for HF). Speech coding is the art of "what can we throw away". We need to lower the bit rate of the speech while retaining speech you can understand, and making it sound as natural as possible. +A speech codec takes speech samples from an A/D converter (e.g. 16 bit samples at 8 kHz or 128 kbits/s) and compresses them down to a low bit rate that can be more easily sent over a narrow bandwidth channel (e.g. 700 bits/s for HF). Speech coding is the art of ``what can we throw away". We need to lower the bit rate of the speech while retaining speech you can understand, and making it sound as natural as possible. As such low bit rates we use a speech production ``model". The input speech is anlaysed, and we extract model parameters, which are then sent over the channel. An example of a model based parameter is the pitch of the person speaking. We estimate the pitch of the speaker, quantise it to a 7 bit number, and send that over the channel every 20ms. @@ -106,7 +106,7 @@ Note that each harmonic has it's own amplitude, that varies across frequency. 
T A sinewave will cause a spike or spectral line on a spectrum plot, so we can see each spike as a small sine wave generator. Each sine wave generator has it's own frequency that are all multiples of the fundamental pitch frequency (e.g. $230, 460, 690,...$ Hz). They will also have their own amplitude and phase. If we add all the sine waves together (Figure \ref{fig:sinusoidal_model}) we can produce reasonable quality synthesised speech. This is called sinusoidal speech coding and is the speech production ``model" at the heart of Codec 2. \begin{figure}[h] -\caption{The sinusoidal speech model. If we sum a series of sine waves, we can generate a speech signal. Each sinewave has it's own amplitude ($A_1,A_2,... A_L$), frequency, and phase (not shown). We assume the frequencies are multiples of the fundamental frequency $F_0$. $L$ is the total number of sinewaves we can fit in 4kHz.} +\caption{The sinusoidal speech model. If we sum a series of sine waves, we can generate a speech signal. Each sinewave has it's own amplitude ($A_1,A_2,... A_L$), frequency, and phase (not shown). We assume the frequencies are multiples of the fundamental frequency $F_0$. $L$ is the total number of sinewaves we can fit in 4 kHz.} \label{fig:sinusoidal_model} \begin{center} \begin{tikzpicture}[>=triangle 45,x=1.0cm,y=1.0cm] @@ -132,7 +132,7 @@ A sinewave will cause a spike or spectral line on a spectrum plot, so we can see \end{center} \end{figure} -The model parameters evolve over time, but can generally be considered constant for short time window (a few 10s of ms). For example pitch evolves over time, moving up or down as a word is articulated. +The model parameters evolve over time, but can generally be considered constant for a short time window (a few 10s of ms). For example pitch evolves over time, moving up or down as a word is articulated. As the model parameters change over time, we need to keep updating them. This is known as the \emph{frame rate} of the codec, which can be expressed in terms of frequency (Hz) or time (ms). For sampling model parameters Codec 2 uses a frame rate of 10ms. For transmission over the channel we reduce this to 20-40ms, in order to lower the bit rate. The trade off with a lower frame rate is reduced speech quality. @@ -186,7 +186,7 @@ Up until this point the processing happens at a 10ms frame rate. However in the Once we have the desired frame rate, we ``quantise"" each model parameter. This means we use a fixed number of bits to represent it, so we can send the bits over the channel. Parameters like pitch and voicing are fairly easy, but quite a bit of DSP goes into quantising the spectral amplitudes. For the higher bit rate Codec 2 modes, we design a filter that matches the spectral amplitudes, then send a quantised version of the filter over the channel. Using the example in Figure \ref{fig:hts2a_time} - the filter would have a band pass peaks at 500 and 2300 Hz. It's frequency response would follow the red line. The filter is time varying - we redesign it for every frame. -You'll notice the term "estimate" being used a lot. One of the problems with model based speech coding is the algorithms we use to extract the model parameters are not perfect. Occasionally the algorithms get it wrong. Look at the red crosses on the bottom plot of Figure \ref{fig:hts2a_time}. These mark the amplitude estimate of each harmonic. If you look carefully, you'll see that above 2000Hz, the crosses fall a little short of the exact centre of each harmonic. 
This is an example of a ``fine" pitch estimator error, a little off the correct value. +You'll notice the term ``estimate" being used a lot. One of the problems with model based speech coding is the algorithms we use to extract the model parameters are not perfect. Occasionally the algorithms get it wrong. Look at the red crosses on the bottom plot of Figure \ref{fig:hts2a_time}. These mark the amplitude estimate of each harmonic. If you look carefully, you'll see that above 2000Hz, the crosses fall a little short of the exact centre of each harmonic. This is an example of a ``fine" pitch estimator error, a little off the correct value. Often the errors interact, for example the fine pitch error shown above will mean the amplitude estimates are a little bit off as well. Fortunately these errors tend to be temporary, and are sometimes not even noticeable to the listener - remember this codec is often used for HF/VHF radio where channel noise is part of the normal experience. @@ -224,7 +224,7 @@ Frames of speech are synthesised using an inverse FFT. We take a blank array of Table \ref{tab:bit_allocation} presents the bit allocation for two popular Codec 2 modes. One additional parameter is the frame energy, this is the average level of the spectral amplitudes, or ``AF gain" of the speech frame. -At very low bit rates such as 700C, we use Vector Quantisation (VQ) to represent the spectral amplitudes. We construct a table such that each row of the table has a set of spectral amplitude samples. In Codec 2 700C the table has 512 rows. During the quantisation process, we choose the table row that best matches the spectral amplitudes for this frame, then send the \emph{index} of the table row. The decoder has a similar table, so can use the index to look up the output values. If the table is 512 rows, we can use a 9 bit number to quantise the spectral amplitudes. In Codec 2 700C, we use two tables of 512 entries each (18 bits total), the second one helps fine tune the quantisation from the first table. +At very low bit rates such as 700 bits/s, we use Vector Quantisation (VQ) to represent the spectral amplitudes. We construct a table such that each row of the table has a set of spectral amplitude samples. In Codec 2 700C the table has 512 rows. During the quantisation process, we choose the table row that best matches the spectral amplitudes for this frame, then send the \emph{index} of the table row. The decoder has a similar table, so can use the index to look up the output values. If the table is 512 rows, we can use a 9 bit number to quantise the spectral amplitudes. In Codec 2 700C, we use two tables of 512 entries each (18 bits total), the second one helps fine tune the quantisation from the first table. Vector Quantisation can only represent what is present in the tables, so if it sees anything unusual (for example a different microphone frequency response or background noise), the quantisation can become very rough and speech quality poor. We train the tables at design time using a database of speech samples and a training algorithm - an early form of machine learning. 
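
As a rough sketch of the vector quantisation step described above (and not the actual Codec 2 700C quantiser, which operates on resampled spectral amplitudes in the log domain and uses two stages), a single codebook search simply finds the row with the smallest squared error and sends its index; the function name and argument layout below are illustrative only.

\begin{verbatim}
/* Return the index of the codebook row closest (in squared error)
   to the target vector.  With rows = 512 the index fits in 9 bits. */
int vq_search(const float *codebook, int rows, int dim,
              const float *target)
{
    int   best_index = 0;
    float best_error = 1e30f;

    for (int r = 0; r < rows; r++) {
        const float *row = &codebook[r * dim];
        float e = 0.0f;
        for (int i = 0; i < dim; i++) {
            float d = target[i] - row[i];
            e += d * d;
        }
        if (e < best_error) {
            best_error = e;
            best_index = r;
        }
    }
    return best_index; /* transmitted to the decoder */
}
\end{verbatim}

A second stage can then quantise the error left over from the first, which is how the two 512 entry tables mentioned above combine to give 18 bits per frame for the spectral amplitudes.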
-- cgit v1.2.3 From ed463b078809d942a2f0119ec9392c4a43fde0b1 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Thu, 23 Nov 2023 19:21:17 +1030 Subject: moved some introductory info from DD to Intro --- doc/codec2.pdf | Bin 159985 -> 160852 bytes doc/codec2.tex | 21 +++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 4ca8f32..8d60bb2 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 22fa749..882f0b3 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -62,7 +62,16 @@ pinstyle/.style = {pin edge={to-,thin,black}} \section{Introduction} -Codec 2 is an open source speech codec designed for communications quality speech between 700 and 3200 bit/s. The main application is low bandwidth HF/VHF digital radio. It fills a gap in open source voice codecs beneath 5000 bit/s and is released under the GNU Lesser General Public License (LGPL). It is written in C99 standard C. +Codec 2 is an open source speech codec designed for communications quality speech between 700 and 3200 bit/s. The main application is low bandwidth HF/VHF digital radio. It fills a gap in open source voice codecs beneath 5000 bit/s and is released under the GNU Lesser General Public License (LGPL). + +Key feature includes: +\begin{enumerate} +\item A range of modes supporting different bit rates, currently (Nov 2023): 3200, 2400, 1600, 1400, 1300, 1200, 700C. The number is the bit rate, and the supplementary letter the version (700C replaced the earlier 700, 700A, 700B versions). These are referred to as ``Codec 2 3200", ``Codec 700C"" etc. +\item Modest CPU (a few 10s of MIPs) and memory (a few 10s of kbytes of RAM) requirements such that it can run on stm32 class microcontrollers with hardware FPU. +\item Codec 2 has been designed for digital voice over radio applications, and retains intelligible speech at a few percent bit error rate. +\item An open source reference implementation in the C language for C99/gcc compilers, and a \emph{cmake} build and test framework that runs on Linux/MinGW. Also included is a cross compiled stm32 reference implementation. +\item Ports to non-C99 compilers (e.g. MSVC, some microcontrollers, native builds on Windows) are left to third party developers - we recommend the tests also be ported and pass before considering the port successful. +\end{enumerate} The Codec 2 project was started in 2009 in response to the problem of closed source, patented, proprietary voice codecs in the sub-5 kbit/s range, in particular for use in the Amateur Radio service. @@ -70,6 +79,8 @@ This document describes Codec 2 at two levels. Section \ref{sect:overview} is a This production of this document was kindly supported by an ARDC grant \cite{ardc2023}. As an open source project, many people have contributed to Codec 2 over the years - we deeply appreciate all of your support. + + \section{Codec 2 for the Radio Amateur} \label{sect:overview} @@ -256,14 +267,8 @@ Bit rate & 3200 & 700 \\ Codec 2 is based on sinusoidal \cite{mcaulay1986speech} and Multi-Band Excitation (MBE) \cite{griffin1988multiband} vocoders that were first developed in the late 1980s. Descendants of the MBE vocoders (IMBE, AMBE etc) have enjoyed widespread use in many applications such as VHF/UHF hand held radios and satellite communications. 
In the 1990s the author studied sinusoidal speech coding \cite{rowe1997techniques}, which provided the skill set and a practical, patent free baseline for starting the Codec 2 project: -Some features of Codec 2: +Some features of the Codec 2 Design: \begin{enumerate} -\item A range of modes supporting different bit rates, currently (Nov 2023): 3200, 2400, 1600, 1400, 1300, 1200, 700 bits/s. These are referred to as ``Codec 2 3200", ``Codec 700C"" etc. -\item Modest CPU (a few 10s of MIPs) and memory (a few 10s of kbytes of RAM) requirements such that it can run on stm32 class microcontrollers with hardware FPU. -\item An open source reference implementation in the C language for C99/gcc compilers, and a \emph{cmake} build and test framework that runs on Linux. Also included is a cross compiled stm32 reference implementation. -\item Ports to non-C99 compilers (e.g. MSVC, some microcontrollers, native builds on Windows) are left to third party developers - we recommend the tests also be ported and pass before considering the port successful. -\item Codec 2 has been designed for digital voice over radio applications, and retains intelligible speech at a few percent bit error rate. -\item A suite of automated tests used to verify the implementation. \item A pitch estimator based on a 2nd order non-linearity developed by the author. \item A single voiced/unvoiced binary voicing model. \item A frequency domain IFFT/overlap-add synthesis model for voiced and unvoiced speech speech. -- cgit v1.2.3 From 17a30f0d6a0a87c626cee72d151477ea5ed71d6b Mon Sep 17 00:00:00 2001 From: drowe67 Date: Fri, 24 Nov 2023 07:07:06 +1030 Subject: first draft of NLP section, Glossary --- doc/codec2.pdf | Bin 160852 -> 187419 bytes doc/codec2.tex | 69 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 59 insertions(+), 10 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 8d60bb2..8e81f73 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 882f0b3..cb03c38 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -66,7 +66,7 @@ Codec 2 is an open source speech codec designed for communications quality speec Key feature includes: \begin{enumerate} -\item A range of modes supporting different bit rates, currently (Nov 2023): 3200, 2400, 1600, 1400, 1300, 1200, 700C. The number is the bit rate, and the supplementary letter the version (700C replaced the earlier 700, 700A, 700B versions). These are referred to as ``Codec 2 3200", ``Codec 700C"" etc. +\item A range of modes supporting different bit rates, currently (Nov 2023): 3200, 2400, 1600, 1400, 1300, 1200, 700C. The number is the bit rate, and the supplementary letter the version (700C replaced the earlier 700, 700A, 700B versions). These are referred to as ``Codec 2 3200", ``Codec 2 700C"" etc. \item Modest CPU (a few 10s of MIPs) and memory (a few 10s of kbytes of RAM) requirements such that it can run on stm32 class microcontrollers with hardware FPU. \item Codec 2 has been designed for digital voice over radio applications, and retains intelligible speech at a few percent bit error rate. \item An open source reference implementation in the C language for C99/gcc compilers, and a \emph{cmake} build and test framework that runs on Linux/MinGW. Also included is a cross compiled stm32 reference implementation. @@ -79,8 +79,6 @@ This document describes Codec 2 at two levels. 
Section \ref{sect:overview} is a This production of this document was kindly supported by an ARDC grant \cite{ardc2023}. As an open source project, many people have contributed to Codec 2 over the years - we deeply appreciate all of your support. - - \section{Codec 2 for the Radio Amateur} \label{sect:overview} @@ -271,16 +269,29 @@ Some features of the Codec 2 Design: \begin{enumerate} \item A pitch estimator based on a 2nd order non-linearity developed by the author. \item A single voiced/unvoiced binary voicing model. -\item A frequency domain IFFT/overlap-add synthesis model for voiced and unvoiced speech speech. +\item A frequency domain IFFT/overlap-add synthesis model for voiced and unvoiced speech. \item For the higher bit rate modes, spectral amplitudes are represented using LPCs extracted from time domain analysis and scalar LSP quantisation. \item For Codec 2 700C, vector quantisation of resampled spectral amplitudes in the log domain. \item Minimal interframe prediction in order to minimise error propagation and maximise robustness to channel errors. \item A post filter that enhances the speech quality of the baseline codec, especially for low pitched (male) speakers. \end{enumerate} +\subsection{Naming Conventions} + +In Codec 2, signals are frequently moved between the time and frequency domain. In the source code and this document, time domain signals generally have the subscript $n$, and frequency domain signals the subscript $\omega$, for example $S_n$ and $S_\omega$ represent the same speech expressed in the time and frequency domain. Section \ref{sect:glossary} contains a glossary of symbols. + \subsection{Non-Linear Pitch Estimation} -The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is described in detail in chapter 4 of \cite{rowe1997techniques}. There is nothing particularly unique about this pitch estimator or it's performance. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements. +The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is described in detail in chapter 4 of \cite{rowe1997techniques}, and portions of this description are reproduced here. The post processing algorithm used for pitch estimation in Codec 2 is different from \cite{rowe1997techniques} and is described here. The C code \emph{nlp.c} is a useful reference for the fine details of the implementation, and the Octave script \emph{plnlp.m} can by used to plot the internal states and single step through speech, illustrating the operation of the algorithm. + +The core pitch detector is based on a square law non-linearity, that is applied directly to the input speech signal. Given speech is composed of harmonics separated by $F_0$ the non-linearity generates intermodulation products at $F_0$, even if the fundamental is absent from the input signal due to high pass filtering. + +Figure \ref{fig:nlp} illustrates the algorithm. The fundamental frequency $F_0$ is estimated in the range of 50-400 Hz. The algorithm is designed to take blocks of $M = 320$ samples at a sample rate of 8 kHz (40 ms time window). This block length ensures at least two pitch periods lie within the analysis window at the lowest fundamental frequency. + +The speech signal is first squared then notch filtered to remove the DC component from the squared time domain signal. 
This prevents the large amplitude DC term from interfering with the somewhat smaller amplitude term at the fundamental. This is particularly important for male speakers, who may have low frequency fundamentals close to DC. The notch filter is applied in the time domain and has the experimentally derived transfer function: +\begin{equation} +H(z) = \frac{1-z^{-1}}{1-0.95z^{-1}} +\end{equation} \begin{figure}[h] \caption{The Non-Linear Pitch (NLP) algorithm} @@ -297,7 +308,9 @@ The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is d \node [block, right of=lpf,node distance=2.5cm] (dec5) {$\downarrow 5$}; \node [block, below of=dec5] (dft) {DFT}; \node [block, below of=lpf] (peak) {Peak Pick}; -\node [output, left of=peak,node distance=2cm] (routput) {}; +\node [block, below of=notch,text width=2cm] (search) {Sub \\Multiple Search}; +\node [block, left of=search,node distance=3cm] (refine) {Refinement}; +\node [output, left of=refine,node distance=2cm] (routput) {}; \draw [->] node[align=left,text width=2cm] {Input Speech} (rinput) -- (mult); \draw [->] (z) -- (z1) -| (mult); @@ -306,12 +319,26 @@ The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is d \draw [->] (lpf) -- (dec5); \draw [->] (dec5) -- (dft); \draw [->] (dft) -- (peak); -\draw [->] (peak) -- (routput) node[left, align=center] {Pitch\\Candidates}; +\draw [->] (peak) -- (search); +\draw [->] (search) -- (refine); +\draw [->] (refine) -- (routput) node[left, align=center] {$F_0$}; \end{tikzpicture} \end{center} \end{figure} +Before transforming the squared signal to the frequency domain, the signal is low pass filtered and decimated by a factor of 5. This operation is performed to limit the bandwidth of the squared signal to the approximate range of the fundamental frequency. All energy in the squared signal above 400 Hz is superfluous and would lower the resolution of the frequency domain peak picking stage. The low pass filter used for decimation is an FIR type with 48 taps and a cut off frequency of 600 Hz. The decimated signal is then windowed and the $N_{dft} = 512$ point DFT power spectrum $F_\omega(k)$ is computed by zero padding the decimated signal, where $k$ is the DFT bin. + +The DFT power spectrum of the squared signal $F_\omega(k)$ generally contains several local maxima. In most cases, the global maxima will correspond to $F_0$, however occasionally the global maxima $|F_\omega(k_{max})|$ corresponds to a spurious peak or multiple of $F_0$ . Thus it is not appropriate to simply choose the global maxima as the fundamental estimate for this frame. Instead, we look at submultiples of the global maxima frequency $k_{max}/2, k_{max}/3,... k_{min}$ for local maxima. If local maxima exists and is above an experimentally derived threshold we choose the submultiple as the $F_0$ estimate. The threshold is biased down for $F_0$ candidates near the previous frames $F_0$ estimate, a form of backwards pitch tracking. + +The accuracy of the pitch estimate in then refined by maximising the function: +\begin{equation} +E(\omega_0)=\sum_{m=1}^L|S_{\omega}(b m \omega_0)|^2 +\end{equation} +where the $\omega_0=2 \pi F_0 /F_s$ is the normalised angular fundamental frequency in radians/sample, $b$ is a constant that maps a frequency in radians/sample to a DFT bin, and $S_\omega$ is the DFT of the speech spectrum for the current frame. This function will be maximised when $mF_0$ samples the peak of each harmonic, corresponding with an accurate pitch estimate. 
It is evaluated in a small range about the coarse $F_0$ estimate. + +There is nothing particularly unique about this pitch estimator or it's performance. There are occasional artefacts in the synthesised speech that can be traced to ``gross" and ``fine" pitch estimator errors. In the real world no pitch estimator is perfect, partially because the model assumptions around pitch break down (e.g. in transition regions or unvoiced speech). The NLP algorithm could benefit from additional review, tuning and better pitch tracking. However it appears sufficient for the use case of a communications quality speech codec, and is a minor source of artefacts in the synthesised speech. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements. + \subsection{Sinusoidal Analysis and Synthesis} \subsection{LPC/LSP based modes} @@ -321,12 +348,34 @@ The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is d \section{Further Work} \begin{enumerate} -\item some examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters -\item How to use tools to single step through codec operation -\item table summarising source files with one line description +\item Some worked examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters +\item How to use Octave tools to single step through codec operation +\item Table summarising source files with one line description \item Add doc license (Creative Commons?) \end{enumerate} +\section{Glossary} +\label{sect:glossary} + +\begin{table}[H] +\label{tab:symbol_glossary} +\centering +\begin{tabular}{l l l } +\hline +Symbol & Description & Units \\ +\hline +$b$ & Constant that maps a frequency in radians to a DFT bin \\ +$\{A_m\}$ & Set of spectral amplitudes $m=1,...L$ & dB \\ +$F_0$ & Fundamental frequency (pitch) & Hz \\ +$F_s$ & Sample rate (usually 8 kHz) & Hz \\ +$F_\omega(k)$ & DFT of squared speech signal in NLP pitch estimator \\ +$L$ & Number of harmonics & \\ +$P$ & Pitch period & ms or samples \\ +$\omega_0$ & Fundamental frequency (pitch) & radians/sample \\ +\hline +\end{tabular} +\caption{Glossary of Symbols} +\end{table} \bibliographystyle{plain} \bibliography{codec2_refs} -- cgit v1.2.3 From f95b5902bb05c5c7055bc94e178c4c9b9ed26146 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Fri, 24 Nov 2023 07:47:23 +1030 Subject: sinusoidal encoder block diagram --- doc/codec2.pdf | Bin 187419 -> 189214 bytes doc/codec2.tex | 32 +++++++++++++++++++++++++------- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 8e81f73..0371ab1 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index cb03c38..8cd0b04 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -51,13 +51,6 @@ pinstyle/.style = {pin edge={to-,thin,black}} \draw (#1,#2-0.25) -- (#1,#2+0.25); } -% tikz: draw a multiplier -\newcommand{\drawMultiplier}[2]{% x, y - \draw (#1,#2) circle (0.5); - \draw (#1-0.25,#2-0.25) -- (#1+0.25,#2+0.25); - \draw (#1-0.25,#2+0.25) -- (#1+0.25,#2-0.25); -} - \maketitle \section{Introduction} @@ -341,6 +334,31 @@ There is nothing particularly unique about this pitch estimator or it's performa \subsection{Sinusoidal Analysis and Synthesis} +\begin{figure}[h] +\caption{Block Diagram of the Sinusoidal Encoder} +\label{fig:encoder} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 
45,x=1.0cm,y=1.0cm, align=center] + +\node [input] (rinput) {}; +\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; +\node [block, right of=z,node distance=1.5cm] (window) {Window}; +\node [block, right of=window,node distance=2.5cm] (dft) {DFT}; +\node [block, right of=dft,node distance=3cm,text width=2cm] (est) {Est Amp and Phase}; +\node [block, below of=window] (nlp) {NLP}; +\node [output, right of=est,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); +\draw [->] (z) |- (nlp); +\draw [->] (window) -- node[below] {$s_w(n)$} (dft); +\draw [->] (dft) -- node[below] {$S_\omega(k)$} (est); +\draw [->] (nlp) -| node[below] {$\omega_0$} (est) ; +\draw [->] (est) -- (routput) node[right] {$\{A_m\}$ \\ $\{\theta_m\}$}; + +\end{tikzpicture} +\end{center} +\end{figure} + \subsection{LPC/LSP based modes} \subsection{Codec 2 700C} -- cgit v1.2.3 From 97b20b412041e4b10550480f5a21c7347c77bd3d Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sat, 25 Nov 2023 06:35:24 +1030 Subject: drafted sinusoidal analysis section --- doc/codec2.pdf | Bin 189214 -> 212388 bytes doc/codec2.tex | 99 ++++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 67 insertions(+), 32 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 0371ab1..cbb0f04 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 8cd0b04..eb9dd71 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -59,10 +59,10 @@ Codec 2 is an open source speech codec designed for communications quality speec Key feature includes: \begin{enumerate} -\item A range of modes supporting different bit rates, currently (Nov 2023): 3200, 2400, 1600, 1400, 1300, 1200, 700C. The number is the bit rate, and the supplementary letter the version (700C replaced the earlier 700, 700A, 700B versions). These are referred to as ``Codec 2 3200", ``Codec 2 700C"" etc. +\item A range of modes supporting different bit rates, currently (Nov 2023): 3200, 2400, 1600, 1400, 1300, 1200, 700C. The number is the bit rate, and the supplementary letter the version (700C replaced the earlier 700, 700A, 700B versions). These are referred to as ``Codec 2 3200", ``Codec 2 700C" etc. \item Modest CPU (a few 10s of MIPs) and memory (a few 10s of kbytes of RAM) requirements such that it can run on stm32 class microcontrollers with hardware FPU. \item Codec 2 has been designed for digital voice over radio applications, and retains intelligible speech at a few percent bit error rate. -\item An open source reference implementation in the C language for C99/gcc compilers, and a \emph{cmake} build and test framework that runs on Linux/MinGW. Also included is a cross compiled stm32 reference implementation. +\item An open source reference implementation in the C language for C99/gcc compilers, and a \emph{cmake} build and test framework that runs on Linux. Also included is a cross compiled stm32 reference implementation. \item Ports to non-C99 compilers (e.g. MSVC, some microcontrollers, native builds on Windows) are left to third party developers - we recommend the tests also be ported and pass before considering the port successful. \end{enumerate} @@ -93,7 +93,7 @@ Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} freq Note that each harmonic has it's own amplitude, that varies across frequency. The red line plots the amplitude of each harmonic. 
In this example there is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. -\begin{figure}[H] +\begin{figure} \caption{ A 40ms segment from the word "these" from a female speaker, sampled at 8kHz. Top is a plot again time, bottom (blue) is a plot against frequency. The waveform repeats itself every 4.3ms ($F_0=230$ Hz), this is the "pitch period" of this segment.} \label{fig:hts2a_time} \begin{center} @@ -269,9 +269,68 @@ Some features of the Codec 2 Design: \item A post filter that enhances the speech quality of the baseline codec, especially for low pitched (male) speakers. \end{enumerate} -\subsection{Naming Conventions} +\subsection{Sinusoidal Analysis and Synthesis} + +Both voiced and unvoiced speech is represented using a harmonic sinusoidal model: +\begin{equation} +\hat{s}(n) = \sum_{m=1}^L A_m cos(\omega_0 m n + \theta_m) +\end{equation} +where the parameters $A_m, \theta_m, m=1...L$ represent the magnitude and phases of each sinusoid, $\omega_0$ is the fundamental frequency in radians/sample, and $L=\lfloor \pi/\omega_0 \rfloor$ is the number of harmonics. + +Figure \ref{fig:analysis} illustrates the processing steps in the sinusoidal analysis system at the core of the Codec 2 encoder. + +\begin{figure}[h] +\caption{Sinusoidal Analysis} +\label{fig:analysis} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] + +\node [input] (rinput) {}; +\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; +\node [block, right of=z,node distance=1.5cm] (window) {Window}; +\node [block, right of=window,node distance=2.5cm] (dft) {DFT}; +\node [block, right of=dft,node distance=3cm,text width=2cm] (est) {Est Amp and Phase}; +\node [block, below of=window] (nlp) {NLP}; +\node [output, right of=est,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); +\draw [->] (z) |- (nlp); +\draw [->] (window) -- node[below] {$s_w(n)$} (dft); +\draw [->] (dft) -- node[below] {$S_\omega(k)$} (est); +\draw [->] (nlp) -| node[below] {$\omega_0$} (est) ; +\draw [->] (est) -- (routput) node[right] {$\{A_m\}$ \\ $\{\theta_m\}$}; + +\end{tikzpicture} +\end{center} +\end{figure} + +For the purposes of speech analysis the time domain speech signal $s(n)$ is divided into overlapping analysis windows (frames) of $N_w=279$ samples. The centre of each analysis window is separated by $N=80$ samples, or an internal frame rate or 10ms. To analyse the $l$-th frame it is convenient to convert the fixed time reference to a sliding time reference centred on the current analysis window: +\begin{equation} +s_w(n) = s(lN + n) w(n), \quad n = - N_{w2} ... 
N_{w2} +\end{equation} +where $w(n)$ is a tapered even window of $N_w$ ($N_w$ odd) samples with: +\begin{equation} +N_{w2} = \left \lfloor \frac{N_w}{2} \right \rfloor +\end{equation} +A suitable window function is a shifted Hanning window: +\begin{equation} +w(n) = \frac{1}{2} - \frac{1}{2} cos \left(\frac{2 \pi (n- N_{w2})}{N_w-1} \right) +\end{equation} +To analyse $s(n)$ in the frequency domain the $N_{dft}$ point Discrete Fourier Transform (DFT) can be computed: +\begin{equation} +S_w(k) = \sum_{n=-N_{w2}}^{N_{w2}} s_w(n) e^{-j 2 \pi k n / N_{dft}} +\end{equation} +The magnitude and phase of each harmonic is given by: +\begin{equation} +\begin{split} +A_m &= \sqrt{\sum_{k=a_m}^{b_m-1} |S_w(k)|^2 } \\ +\theta_m &= arg \left( S_w(m \omega_0 N_{dft} / 2 \pi) \right) \\ +a_m &= \left \lfloor \frac{(m - 0.5)\omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor \\ +b_m &= \left \lfloor \frac{(m + 0.5)\omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor +\end{split} +\end{equation} +The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th sinusoid. The magnitude $A_m$ is the RMS level of the energy in the band containing the harmonic. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. The phase is sampled at the centre of the band. For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. -In Codec 2, signals are frequently moved between the time and frequency domain. In the source code and this document, time domain signals generally have the subscript $n$, and frequency domain signals the subscript $\omega$, for example $S_n$ and $S_\omega$ represent the same speech expressed in the time and frequency domain. Section \ref{sect:glossary} contains a glossary of symbols. \subsection{Non-Linear Pitch Estimation} @@ -326,38 +385,14 @@ The DFT power spectrum of the squared signal $F_\omega(k)$ generally contains se The accuracy of the pitch estimate in then refined by maximising the function: \begin{equation} -E(\omega_0)=\sum_{m=1}^L|S_{\omega}(b m \omega_0)|^2 +E(\omega_0)=\sum_{m=1}^L|S_w(b m \omega_0)|^2 \end{equation} -where the $\omega_0=2 \pi F_0 /F_s$ is the normalised angular fundamental frequency in radians/sample, $b$ is a constant that maps a frequency in radians/sample to a DFT bin, and $S_\omega$ is the DFT of the speech spectrum for the current frame. This function will be maximised when $mF_0$ samples the peak of each harmonic, corresponding with an accurate pitch estimate. It is evaluated in a small range about the coarse $F_0$ estimate. +where the $\omega_0=2 \pi F_0 /F_s$ is the normalised angular fundamental frequency in radians/sample, $b$ is a constant that maps a frequency in radians/sample to a DFT bin, and $S_\omega$ is the DFT of the speech spectrum for the current frame. This function will be maximised when $mF_0$ aligns with the peak of each harmonic, corresponding with an accurate pitch estimate. It is evaluated in a small range about the coarse $F_0$ estimate. There is nothing particularly unique about this pitch estimator or it's performance. There are occasional artefacts in the synthesised speech that can be traced to ``gross" and ``fine" pitch estimator errors. In the real world no pitch estimator is perfect, partially because the model assumptions around pitch break down (e.g. 
in transition regions or unvoiced speech). The NLP algorithm could benefit from additional review, tuning and better pitch tracking. However it appears sufficient for the use case of a communications quality speech codec, and is a minor source of artefacts in the synthesised speech. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements. -\subsection{Sinusoidal Analysis and Synthesis} - -\begin{figure}[h] -\caption{Block Diagram of the Sinusoidal Encoder} -\label{fig:encoder} -\begin{center} -\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] -\node [input] (rinput) {}; -\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; -\node [block, right of=z,node distance=1.5cm] (window) {Window}; -\node [block, right of=window,node distance=2.5cm] (dft) {DFT}; -\node [block, right of=dft,node distance=3cm,text width=2cm] (est) {Est Amp and Phase}; -\node [block, below of=window] (nlp) {NLP}; -\node [output, right of=est,node distance=2cm] (routput) {}; - -\draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); -\draw [->] (z) |- (nlp); -\draw [->] (window) -- node[below] {$s_w(n)$} (dft); -\draw [->] (dft) -- node[below] {$S_\omega(k)$} (est); -\draw [->] (nlp) -| node[below] {$\omega_0$} (est) ; -\draw [->] (est) -- (routput) node[right] {$\{A_m\}$ \\ $\{\theta_m\}$}; - -\end{tikzpicture} -\end{center} -\end{figure} +\subsection{Voicing Estimation} \subsection{LPC/LSP based modes} -- cgit v1.2.3 From 899fce85d1a30f528f39c719f51b7adf19728fd6 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sat, 25 Nov 2023 09:48:43 +1030 Subject: building up synthesis section --- doc/codec2.pdf | Bin 212388 -> 216831 bytes doc/codec2.tex | 54 +++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index cbb0f04..8878fbb 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index eb9dd71..376289b 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -269,7 +269,7 @@ Some features of the Codec 2 Design: \item A post filter that enhances the speech quality of the baseline codec, especially for low pitched (male) speakers. \end{enumerate} -\subsection{Sinusoidal Analysis and Synthesis} +\subsection{Sinusoidal Analysis} Both voiced and unvoiced speech is represented using a harmonic sinusoidal model: \begin{equation} @@ -277,7 +277,7 @@ Both voiced and unvoiced speech is represented using a harmonic sinusoidal model \end{equation} where the parameters $A_m, \theta_m, m=1...L$ represent the magnitude and phases of each sinusoid, $\omega_0$ is the fundamental frequency in radians/sample, and $L=\lfloor \pi/\omega_0 \rfloor$ is the number of harmonics. -Figure \ref{fig:analysis} illustrates the processing steps in the sinusoidal analysis system at the core of the Codec 2 encoder. +Figure \ref{fig:analysis} illustrates the processing steps in the sinusoidal analysis system at the core of the Codec 2 encoder. This algorithms described in this section is based on the work in \cite{rowe1997techniques}, with some changes in notation. 
\begin{figure}[h] \caption{Sinusoidal Analysis} @@ -312,10 +312,14 @@ where $w(n)$ is a tapered even window of $N_w$ ($N_w$ odd) samples with: \begin{equation} N_{w2} = \left \lfloor \frac{N_w}{2} \right \rfloor \end{equation} -A suitable window function is a shifted Hanning window: +A suitable window function is a shifted Hann window: \begin{equation} w(n) = \frac{1}{2} - \frac{1}{2} cos \left(\frac{2 \pi (n- N_{w2})}{N_w-1} \right) \end{equation} +where the energy in the window is normalised such that: +\begin{equation} +\sum_{n=0}^{N_w-1}w^2(n) = \frac{1}{N_{dft}} +\end{equation} To analyse $s(n)$ in the frequency domain the $N_{dft}$ point Discrete Fourier Transform (DFT) can be computed: \begin{equation} S_w(k) = \sum_{n=-N_{w2}}^{N_{w2}} s_w(n) e^{-j 2 \pi k n / N_{dft}} @@ -329,8 +333,34 @@ a_m &= \left \lfloor \frac{(m - 0.5)\omega_0 N_{dft}}{2 \pi} + 0.5 \right \ b_m &= \left \lfloor \frac{(m + 0.5)\omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor \end{split} \end{equation} -The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th sinusoid. The magnitude $A_m$ is the RMS level of the energy in the band containing the harmonic. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. The phase is sampled at the centre of the band. For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. +The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic. The magnitude $A_m$ is the RMS level of the energy in the band containing the harmonic. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. The phase is sampled at the centre of the band. For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. + +\subsection{Sinusoidal Synthesis} + +Synthesis is achieved by constructing an estimate of the original speech spectrum using the sinusoidal model parameters for the current frame. This information is then transformed to the time domain using an Inverse DFT (IDFT). To produce a continuous time domain waveform the IDFTs from adjacent frames are smoothly interpolated using a weighted overlap add procedure \cite{mcaulay1986speech}. +The synthetic speech spectrum is constructed using the sinusoidal model parameters by populating a DFT array $\hat{S}_w(k)$ with weighted impulses at the harmonic centres: +\begin{equation} +\begin{split} +\hat{S}_w(k) &= \begin{cases} + A_m e^{\theta_m}, & m=1..L \\ + 0, & otherwise + \end{cases} \\ +k &= \left \lfloor \frac{m \omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor +\end{split} +\end{equation} + +As we wish to synthesise a real time domain signal, $S_w(k)$ is defined to be conjugate symmetric: +\begin{equation} +%\hat{S}_w(N_{dft} − k) = \hat{S}_w^{*}(k), \quad k = 1,.. N_{dft}/2-1 +\hat{S}_w(N_{dft}-k) = \hat{S}_w^{*}(k), \quad k = 1,.. N_{dft}/2-1 +\end{equation} +where $\hat{S}_w^*(k)$ is the complex conjugate of $\hat{S}_w(k)$. 
This signal is converted to the time domain +using the IDFT: +\begin{equation} +s_w(k) = \frac{1}{N_{dft}}\sum_{k=0}^{N_{dft}-1} \hat{S}_w(k) e^{j 2 \pi k n / N_{dft}} +\end{equation} +We introduce the notation $s_w^l(n)$ to denote the synthesised speech for the $l$-th frame. To reconstruct a continuous synthesised speech waveform, we need to smoothly connect adjacent synthesised frames of speech. This is performed by windowing each frame, then shifting and superimposing adjacent frames using an overlap add algorithm. \subsection{Non-Linear Pitch Estimation} @@ -379,9 +409,9 @@ H(z) = \frac{1-z^{-1}}{1-0.95z^{-1}} \end{center} \end{figure} -Before transforming the squared signal to the frequency domain, the signal is low pass filtered and decimated by a factor of 5. This operation is performed to limit the bandwidth of the squared signal to the approximate range of the fundamental frequency. All energy in the squared signal above 400 Hz is superfluous and would lower the resolution of the frequency domain peak picking stage. The low pass filter used for decimation is an FIR type with 48 taps and a cut off frequency of 600 Hz. The decimated signal is then windowed and the $N_{dft} = 512$ point DFT power spectrum $F_\omega(k)$ is computed by zero padding the decimated signal, where $k$ is the DFT bin. +Before transforming the squared signal to the frequency domain, the signal is low pass filtered and decimated by a factor of 5. This operation is performed to limit the bandwidth of the squared signal to the approximate range of the fundamental frequency. All energy in the squared signal above 400 Hz is superfluous and would lower the resolution of the frequency domain peak picking stage. The low pass filter used for decimation is an FIR type with 48 taps and a cut off frequency of 600 Hz. The decimated signal is then windowed and the $N_{dft} = 512$ point DFT power spectrum $F_w(k)$ is computed by zero padding the decimated signal, where $k$ is the DFT bin. -The DFT power spectrum of the squared signal $F_\omega(k)$ generally contains several local maxima. In most cases, the global maxima will correspond to $F_0$, however occasionally the global maxima $|F_\omega(k_{max})|$ corresponds to a spurious peak or multiple of $F_0$ . Thus it is not appropriate to simply choose the global maxima as the fundamental estimate for this frame. Instead, we look at submultiples of the global maxima frequency $k_{max}/2, k_{max}/3,... k_{min}$ for local maxima. If local maxima exists and is above an experimentally derived threshold we choose the submultiple as the $F_0$ estimate. The threshold is biased down for $F_0$ candidates near the previous frames $F_0$ estimate, a form of backwards pitch tracking. +The DFT power spectrum of the squared signal $F_w(k)$ generally contains several local maxima. In most cases, the global maxima will correspond to $F_0$, however occasionally the global maxima $|F_w(k_{max})|$ corresponds to a spurious peak or multiple of $F_0$. Thus it is not appropriate to simply choose the global maxima as the fundamental estimate for this frame. Instead, we look at submultiples of the global maxima frequency $k_{max}/2, k_{max}/3,... k_{min}$ for local maxima. If local maxima exists and is above an experimentally derived threshold we choose the submultiple as the $F_0$ estimate. The threshold is biased down for $F_0$ candidates near the previous frames $F_0$ estimate, a form of backwards pitch tracking. 
The accuracy of the pitch estimate in then refined by maximising the function: \begin{equation} @@ -417,14 +447,20 @@ There is nothing particularly unique about this pitch estimator or it's performa \hline Symbol & Description & Units \\ \hline +$a_m$ & lower DFT index of current band \\ +$b_m$ & upper DFT index of current band \\ $b$ & Constant that maps a frequency in radians to a DFT bin \\ -$\{A_m\}$ & Set of spectral amplitudes $m=1,...L$ & dB \\ +$\{A_m\}$ & Set of harmonic magnitudes $m=1,...L$ & dB \\ $F_0$ & Fundamental frequency (pitch) & Hz \\ $F_s$ & Sample rate (usually 8 kHz) & Hz \\ -$F_\omega(k)$ & DFT of squared speech signal in NLP pitch estimator \\ -$L$ & Number of harmonics & \\ +$F_w(k)$ & DFT of squared speech signal in NLP pitch estimator \\ +$L$ & Number of harmonics \\ $P$ & Pitch period & ms or samples \\ +$\{\theta_m\}$ & Set of harmonic phases $m=1,...L$ & dB \\ $\omega_0$ & Fundamental frequency (pitch) & radians/sample \\ +$s(n)$ & Input speech \\ +$s_w(n)$ & Time domain windowed input speech \\ +$S_w(k)$ & Frequency domain windowed input speech \\ \hline \end{tabular} \caption{Glossary of Symbols} -- cgit v1.2.3 From 0b6a2074eb3b1a240ff01e4074b62dd15f1c8734 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sat, 25 Nov 2023 20:44:13 +1030 Subject: first pass of synthesis section --- doc/codec2.pdf | Bin 216831 -> 219739 bytes doc/codec2.tex | 28 +++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 8878fbb..3f1202a 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 376289b..45a1f45 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -352,15 +352,37 @@ k &= \left \lfloor \frac{m \omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor As we wish to synthesise a real time domain signal, $S_w(k)$ is defined to be conjugate symmetric: \begin{equation} -%\hat{S}_w(N_{dft} − k) = \hat{S}_w^{*}(k), \quad k = 1,.. N_{dft}/2-1 \hat{S}_w(N_{dft}-k) = \hat{S}_w^{*}(k), \quad k = 1,.. N_{dft}/2-1 \end{equation} where $\hat{S}_w^*(k)$ is the complex conjugate of $\hat{S}_w(k)$. This signal is converted to the time domain using the IDFT: \begin{equation} -s_w(k) = \frac{1}{N_{dft}}\sum_{k=0}^{N_{dft}-1} \hat{S}_w(k) e^{j 2 \pi k n / N_{dft}} +\label{eq:synth_idft} +\hat{s}_l(n) = \frac{1}{N_{dft}}\sum_{k=0}^{N_{dft}-1} \hat{S}_w(k) e^{j 2 \pi k n / N_{dft}} \end{equation} -We introduce the notation $s_w^l(n)$ to denote the synthesised speech for the $l$-th frame. To reconstruct a continuous synthesised speech waveform, we need to smoothly connect adjacent synthesised frames of speech. This is performed by windowing each frame, then shifting and superimposing adjacent frames using an overlap add algorithm. +Where $N_{dft} > 2N$, to support the overlap add procedure below. + +We introduce the notation $\hat{s}_l(n)$ to denote the synthesised speech for the $l$-th frame. To reconstruct a continuous synthesised speech waveform, we need to smoothly connect adjacent synthesised frames of speech. This is performed by windowing each frame of synthesised speech, then shifting and superimposing adjacent frames using an overlap add algorithm. A triangular window is defined by: +\begin{equation} +t(n) = \begin{cases} + n/N, & 0 \le n < N \\ + 1 - (n-N)/N, & N \le n < 2N \\ + 0, & otherwise + \end{cases} +\end{equation} +The frame size, $N=80$, is the same as the encoder. 
The shape and overlap of the synthesis window is not important, as long as sections separated by the frame size (frame to frame shift) sum to 1: +\begin{equation} +t(n) + t(N-1) = 1 +\end{equation} +The continuous synthesised speech signal $\hat{s}(n)$ for the $l$-th frame is obtained using: +\begin{equation} +\hat{s}(n+lN) = \begin{cases} + \hat{s}(n+(l-1)N) + \hat{s}_l(N_{dft}-N+1+n)t(n), & n=0,1,...,N-2 \\ + \hat{s}_l(n - N - 1)t(n) & n=N-1,..,2N-1 + \end{cases} +\end{equation} + +From the $N_{dft}$ samples produced by the IDFT (\ref{eq:synth_idft}), after windowing we have $2N$ output samples. The first $N$ output samples $n=0,...N-1$ complete the current frame $l$ and are output from the synthesiser. However we must also compute the contribution to the next frame $n = N,N+1,...,2N-1$. These are stored, and added to samples from the next synthesised frame. \subsection{Non-Linear Pitch Estimation} -- cgit v1.2.3 From 125a16926a6a6eef4205e378677efa7a1784ee89 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 26 Nov 2023 08:07:27 +1030 Subject: sinusoidal synthesiser figure --- doc/codec2.pdf | Bin 219739 -> 221074 bytes doc/codec2.tex | 38 +++++++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 3f1202a..60c0d08 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 45a1f45..a64e0b8 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -87,14 +87,14 @@ Recently, machine learning has been applied to speech coding. This technology p \subsection{Speech in Time and Frequency} -To explain how Codec 2 works, lets look at some speech. Figure \ref{fig:hts2a_time} shows a short 40ms segment of speech in the time and frequency domain. On the time plot we can see the waveform is changing slowly over time as the word is articulated. On the right hand side it also appears to repeat itself - one cycle looks very similar to the last. This cycle time is the "pitch period", which for this example is around $P=35$ samples. Given we are sampling at $F_s=8000$ Hz, the pitch period is $P/F_s=35/8000=0.0044$ seconds, or 4.4ms. +To explain how Codec 2 works, lets look at some speech. Figure \ref{fig:hts2a_time} shows a short 40ms segment of speech in the time and frequency domain. On the time plot we can see the waveform is changing slowly over time as the word is articulated. On the right hand side it also appears to repeat itself - one cycle looks very similar to the last. This cycle time is the ``pitch period", which for this example is around $P=35$ samples. Given we are sampling at $F_s=8000$ Hz, the pitch period is $P/F_s=35/8000=0.0044$ seconds, or 4.4ms. Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency $F_0$ is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. Turns out of the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency $F_0$. Note that each harmonic has it's own amplitude, that varies across frequency. The red line plots the amplitude of each harmonic. In this example there is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. 
\begin{figure} -\caption{ A 40ms segment from the word "these" from a female speaker, sampled at 8kHz. Top is a plot again time, bottom (blue) is a plot against frequency. The waveform repeats itself every 4.3ms ($F_0=230$ Hz), this is the "pitch period" of this segment.} +\caption{ A 40ms segment from the word ``these" from a female speaker, sampled at 8kHz. Top is a plot again time, bottom (blue) is a plot against frequency. The waveform repeats itself every 4.3ms ($F_0=230$ Hz), this is the ``pitch period" of this segment.} \label{fig:hts2a_time} \begin{center} \input hts2a_37_sn.tex @@ -186,7 +186,7 @@ Yet another algorithm is used to determine if the frame is voiced or unvoiced. Up until this point the processing happens at a 10ms frame rate. However in the next step we ``decimate`` the model parameters - this means we discard some of the model parameters to lower the frame rate, which helps us lower the bit rate. Decimating to 20ms (throwing away every 2nd set of model parameters) doesn't have much effect, but beyond that the speech quality starts to degrade. So there is a trade off between decimation rate and bit rate over the channel. -Once we have the desired frame rate, we ``quantise"" each model parameter. This means we use a fixed number of bits to represent it, so we can send the bits over the channel. Parameters like pitch and voicing are fairly easy, but quite a bit of DSP goes into quantising the spectral amplitudes. For the higher bit rate Codec 2 modes, we design a filter that matches the spectral amplitudes, then send a quantised version of the filter over the channel. Using the example in Figure \ref{fig:hts2a_time} - the filter would have a band pass peaks at 500 and 2300 Hz. It's frequency response would follow the red line. The filter is time varying - we redesign it for every frame. +Once we have the desired frame rate, we ``quantise" each model parameter. This means we use a fixed number of bits to represent it, so we can send the bits over the channel. Parameters like pitch and voicing are fairly easy, but quite a bit of DSP goes into quantising the spectral amplitudes. For the higher bit rate Codec 2 modes, we design a filter that matches the spectral amplitudes, then send a quantised version of the filter over the channel. Using the example in Figure \ref{fig:hts2a_time} - the filter would have a band pass peaks at 500 and 2300 Hz. It's frequency response would follow the red line. The filter is time varying - we redesign it for every frame. You'll notice the term ``estimate" being used a lot. One of the problems with model based speech coding is the algorithms we use to extract the model parameters are not perfect. Occasionally the algorithms get it wrong. Look at the red crosses on the bottom plot of Figure \ref{fig:hts2a_time}. These mark the amplitude estimate of each harmonic. If you look carefully, you'll see that above 2000Hz, the crosses fall a little short of the exact centre of each harmonic. This is an example of a ``fine" pitch estimator error, a little off the correct value. @@ -339,11 +339,39 @@ The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th har Synthesis is achieved by constructing an estimate of the original speech spectrum using the sinusoidal model parameters for the current frame. This information is then transformed to the time domain using an Inverse DFT (IDFT). 
To produce a continuous time domain waveform the IDFTs from adjacent frames are smoothly interpolated using a weighted overlap add procedure \cite{mcaulay1986speech}. +\begin{figure}[h] +\caption{Sinusoidal Synthesis. At frame $l$ we have $2N$ samples from the windowing function. The first $N$ complete the current frame and are the synthesiser output. The second $N$ are stored for summing with the next frame.} +\label{fig:synthesis} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] + +\node [input] (rinput) {}; +\node [block, right of=rinput,node distance=1.5cm,text width=1.5cm] (construct) {Construct $S_w(k)$}; +\node [block, right of=construct,node distance=2cm] (idft) {IDFT}; +\node [block, right of=idft,node distance=2.5cm,text width=1.5cm] (window) {Window $t(n)$}; +\node [circ, right of=window,node distance=3cm] (sum) {$+$}; +\node [block, below of=sum,text width=1.5cm] (delay) {1 frame delay}; +\node [output, right of=sum,node distance=1cm] (routput) {}; + +\draw [->] node[left of=rinput,node distance=0.5cm] {$\omega_0$\\$\{A_m\}$\\$\{\theta_m\}$} (rinput) -- (construct); +\draw [->] (construct) --(idft); +\draw [->] (idft) -- node[below] {$\hat{s}_l(n)$} (window); +\draw [->] (window) -- node[above of=window, node distance=1cm] + {$\begin{aligned} n =& 0,..,\\ & N-1 \end{aligned}$} (sum); +\draw [->] (window) |- (delay) node[left of=delay,below, node distance=2cm] + {$\begin{aligned} n =& N,...,\\ & 2N-1 \end{aligned}$}; +\draw [->] (delay) -- (sum); +\draw [->] (sum) -- (routput) node[right] {$\hat{s}(n+lN)$}; + +\end{tikzpicture} +\end{center} +\end{figure} + The synthetic speech spectrum is constructed using the sinusoidal model parameters by populating a DFT array $\hat{S}_w(k)$ with weighted impulses at the harmonic centres: \begin{equation} \begin{split} \hat{S}_w(k) &= \begin{cases} - A_m e^{\theta_m}, & m=1..L \\ + A_m e^{j\theta_m}, & m=1..L \\ 0, & otherwise \end{cases} \\ k &= \left \lfloor \frac{m \omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor @@ -372,7 +400,7 @@ t(n) = \begin{cases} \end{equation} The frame size, $N=80$, is the same as the encoder. The shape and overlap of the synthesis window is not important, as long as sections separated by the frame size (frame to frame shift) sum to 1: \begin{equation} -t(n) + t(N-1) = 1 +t(n) + t(N-n) = 1 \end{equation} The continuous synthesised speech signal $\hat{s}(n)$ for the $l$-th frame is obtained using: \begin{equation} -- cgit v1.2.3 From b3ed5776c5a3a6bc7dea36508b6a4da0a2108558 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 26 Nov 2023 09:23:45 +1030 Subject: rough draft of phase synthesis copied from source --- doc/codec2.pdf | Bin 221074 -> 225778 bytes doc/codec2.tex | 52 ++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 60c0d08..7dda932 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index a64e0b8..9298b87 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -263,8 +263,9 @@ Some features of the Codec 2 Design: \item A pitch estimator based on a 2nd order non-linearity developed by the author. \item A single voiced/unvoiced binary voicing model. \item A frequency domain IFFT/overlap-add synthesis model for voiced and unvoiced speech. -\item For the higher bit rate modes, spectral amplitudes are represented using LPCs extracted from time domain analysis and scalar LSP quantisation. 
-\item For Codec 2 700C, vector quantisation of resampled spectral amplitudes in the log domain. +\item Phases are not transmitted, they are synthesised at the decoder from the magnitude spectrum and voicing decision. +\item For the higher bit rate modes (1200 to 3200 bits/s), spectral magnitudes are represented using LPCs extracted from time domain analysis and scalar LSP quantisation. +\item For Codec 2 700C, vector quantisation of resampled spectral magnitudes in the log domain. \item Minimal interframe prediction in order to minimise error propagation and maximise robustness to channel errors. \item A post filter that enhances the speech quality of the baseline codec, especially for low pitched (male) speakers. \end{enumerate} @@ -328,7 +329,7 @@ The magnitude and phase of each harmonic is given by: \begin{equation} \begin{split} A_m &= \sqrt{\sum_{k=a_m}^{b_m-1} |S_w(k)|^2 } \\ -\theta_m &= arg \left( S_w(m \omega_0 N_{dft} / 2 \pi) \right) \\ +\theta_m &= arg \left[ S_w(m \omega_0 N_{dft} / 2 \pi) \right)] \\ a_m &= \left \lfloor \frac{(m - 0.5)\omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor \\ b_m &= \left \lfloor \frac{(m + 0.5)\omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor \end{split} @@ -340,7 +341,7 @@ The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th har Synthesis is achieved by constructing an estimate of the original speech spectrum using the sinusoidal model parameters for the current frame. This information is then transformed to the time domain using an Inverse DFT (IDFT). To produce a continuous time domain waveform the IDFTs from adjacent frames are smoothly interpolated using a weighted overlap add procedure \cite{mcaulay1986speech}. \begin{figure}[h] -\caption{Sinusoidal Synthesis. At frame $l$ we have $2N$ samples from the windowing function. The first $N$ complete the current frame and are the synthesiser output. The second $N$ are stored for summing with the next frame.} +\caption{Sinusoidal Synthesis. At frame $l$ the windowing function generates $2N$ samples. The first $N$ complete the current frame and are the synthesiser output. The second $N$ are stored for summing with the next frame.} \label{fig:synthesis} \begin{center} \begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] @@ -356,10 +357,10 @@ Synthesis is achieved by constructing an estimate of the original speech spectru \draw [->] node[left of=rinput,node distance=0.5cm] {$\omega_0$\\$\{A_m\}$\\$\{\theta_m\}$} (rinput) -- (construct); \draw [->] (construct) --(idft); \draw [->] (idft) -- node[below] {$\hat{s}_l(n)$} (window); -\draw [->] (window) -- node[above of=window, node distance=1cm] - {$\begin{aligned} n =& 0,..,\\ & N-1 \end{aligned}$} (sum); +\draw [->] (window) -- node[above of=window, node distance=0.75cm] + {$\begin{aligned} n =& 0,..,\\[-0.5ex] & N-1 \end{aligned}$} (sum); \draw [->] (window) |- (delay) node[left of=delay,below, node distance=2cm] - {$\begin{aligned} n =& N,...,\\ & 2N-1 \end{aligned}$}; + {$\begin{aligned} n =& N,...,\\[-0.5ex] & 2N-1 \end{aligned}$}; \draw [->] (delay) -- (sum); \draw [->] (sum) -- (routput) node[right] {$\hat{s}(n+lN)$}; @@ -471,11 +472,46 @@ where the $\omega_0=2 \pi F_0 /F_s$ is the normalised angular fundamental freque There is nothing particularly unique about this pitch estimator or it's performance. There are occasional artefacts in the synthesised speech that can be traced to ``gross" and ``fine" pitch estimator errors. 
In the real world no pitch estimator is perfect, partially because the model assumptions around pitch break down (e.g. in transition regions or unvoiced speech). The NLP algorithm could benefit from additional review, tuning and better pitch tracking. However it appears sufficient for the use case of a communications quality speech codec, and is a minor source of artefacts in the synthesised speech. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements. +\subsection{Voicing Estimation and Phase Synthesis} -\subsection{Voicing Estimation} +TODO: Clean up. Introduce continuous time index, perhaps l-th frame. Expressions for phase spectra as cascade of two systems. Hilbert transform, might need to study this. Figures and simulation plots would be useful. Voicing decision algorithm. Figure of phase synthesis. + +In Codec 2 the harmonic phases $\theta_m$ are not transmitted to the decoder, instead they are synthesised at the decoder using a rules based algorithm and information from the remaining model parameters. The phase of each harmonic is modelled as the phase of a synthesis filter excited by an impulse train. We create the excitation pulse train using $\omega_0$, a binary voicing decision $v$ and a rules based algorithm. + +Consider a pulse train with a pulse starting time $n=0$, with pulses repeated at a rate of $\omega_0$. A pulse train in the time domain is equivalent to harmonics in the frequency domain. We can construct an excitation pulse train using a sum of sinusoids: +\begin{equation} +e(n) = \sum_{m-1}^L cos(m \omega_0 n) +\end{equation} +The phase of each excitation harmonic is: +\begin{equation} +\phi_m = m \omega_0 +\end{equation} +As we don't transmit the pulse position for this model, we need to synthesise it. The excitation pulses occur at a rate of $\omega_0$ (one for each pitch period). The phase of the first harmonic advances by $N \phi_0$ radians over a synthesis frame of $N$ samples. For example if $\omega_0 = \pi /20$ (200 Hz), then over a (10ms $N=80$) sample frame, the phase of the first harmonic would advance $(\pi/20)*80 = 4 \pi$ radians or two complete cycles. + +We generate the excitation phase of the fundamental (first harmonic): +\begin{equation} +\phi_1 = \omega_0 N +\end{equation} +We then relate the phase of the m-th excitation harmonic to the phase of the fundamental as: +\begin{equation} +\phi_m = m\phi_m +\end{equation} +This phase spectra then gets passed through the LPC synthesis filter to determine the final harmonic phase. + +Comparing to speech synthesised using original phases: +\begin{enumerate} +\item Through headphones speech synthesised with this model is not as good. Through a loudspeaker it is very close to original phases. +\item If there are voicing errors, the speech can sound clicky or staticy. If V speech is mistakenly declared UV, this model tends to synthesise impulses or clicks, as there is usually very little shift or dispersion through the LPC synthesis filter. +\item When combined with LPC amplitude modelling there is an additional drop in quality. I am not sure why, theory is interformant energy is raised making any phase errors more obvious. +\item This synthesis model is effectively the same as a simple LPC-10 vocoders, and yet sounds much better. Why? Conventional wisdom (AMBE, MELP) says mixed voicing is required for high quality speech. 
+\item I am pretty sure the Lincoln Lab sinusoidal coding guys (like xMBE also from MIT) first described this zero phase model, I need to look up the paper. +\item Note that this approach could cause some discontinuities in the phase at the edge of synthesis frames, as no attempt is made to make sure that the phase tracks are continuous (the excitation phases are continuous, but not the final phases after filtering by the LPC spectra). +\end{enumerate} \subsection{LPC/LSP based modes} +Block diagram of LPC/LSP mode encoder and decoder. Walk through operation + \subsection{Codec 2 700C} \section{Further Work} -- cgit v1.2.3 From 12bbb03f0fb89102dc1cfe196e96b0fed72c885f Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 27 Nov 2023 21:26:40 +1030 Subject: first draft of voicing estimation --- doc/codec2.pdf | Bin 225778 -> 231649 bytes doc/codec2.tex | 51 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 7dda932..a366711 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 9298b87..2bf1d51 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -327,6 +327,7 @@ S_w(k) = \sum_{n=-N_{w2}}^{N_{w2}} s_w(n) e^{-j 2 \pi k n / N_{dft}} \end{equation} The magnitude and phase of each harmonic is given by: \begin{equation} +\label{eq:mag_est} \begin{split} A_m &= \sqrt{\sum_{k=a_m}^{b_m-1} |S_w(k)|^2 } \\ \theta_m &= arg \left[ S_w(m \omega_0 N_{dft} / 2 \pi) \right)] \\ @@ -472,11 +473,50 @@ where the $\omega_0=2 \pi F_0 /F_s$ is the normalised angular fundamental freque There is nothing particularly unique about this pitch estimator or it's performance. There are occasional artefacts in the synthesised speech that can be traced to ``gross" and ``fine" pitch estimator errors. In the real world no pitch estimator is perfect, partially because the model assumptions around pitch break down (e.g. in transition regions or unvoiced speech). The NLP algorithm could benefit from additional review, tuning and better pitch tracking. However it appears sufficient for the use case of a communications quality speech codec, and is a minor source of artefacts in the synthesised speech. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements. -\subsection{Voicing Estimation and Phase Synthesis} +\subsection{Voicing Estimation} -TODO: Clean up. Introduce continuous time index, perhaps l-th frame. Expressions for phase spectra as cascade of two systems. Hilbert transform, might need to study this. Figures and simulation plots would be useful. Voicing decision algorithm. Figure of phase synthesis. +In Codec 2 the harmonic phases $\theta_m$ are not transmitted to the decoder, instead they are synthesised at the decoder using a rules based algorithm and information from the remaining model parameters, $\{A_m\}$, $\omega_0$, and $v$, the voicing decision for the current frame. + +Voicing is determined using a variation of the MBE voicing algorithm \cite{griffin1988multiband}. Voiced speech consists of a harmonic series of frequency domain impulses, separated by $\omega_0$. When we multiply a segment of the inout speech samples by the window function $w(n)$, we convolve the frequency domain impulses with $W(k)$, the DFT of the $(w)$. Thus for the $m$-th voiced harmonic, we expect to see the shape of the window function $W(k)$ in the band $Sw(k), k=a_m,...,b_m$. 
The MBE voicing algorithm starts with the assumption that the band is voiced, and measures the error between $S_w(k)$ and the ideal voiced harmonic $\hat{S}_w(k)$. -In Codec 2 the harmonic phases $\theta_m$ are not transmitted to the decoder, instead they are synthesised at the decoder using a rules based algorithm and information from the remaining model parameters. The phase of each harmonic is modelled as the phase of a synthesis filter excited by an impulse train. We create the excitation pulse train using $\omega_0$, a binary voicing decision $v$ and a rules based algorithm. +For each band we first estimate the complex harmonic amplitude (magnitude and phase) using \cite{griffin1988multiband}: +\begin{equation} +B_m = \frac{\sum_{k=a_m}^{b_m} S_w(k) W^* (k - \lfloor mr \rceil)}{|\sum_{k=a_m}^{b_m} W (k - \lfloor mr \rceil)|^2} +\end{equation} +where $r= \omega_0 N_{dft}/2 \pi$ is a constant that maps the $m$-th harmonic to a DFT bin, and $ \lfloor x \rceil$ is the rounding operator. As $w(n)$ is a real and even, $W(k)$ is real and even so we can write: +\begin{equation} +\label{eq:est_amp_mbe} +B_m = \frac{\sum_{k=a_m}^{b_m} S_w(k) W (k + \lfloor mr \rceil)}{\sum_{k=a_m}^{b_m} |W (k + \lfloor mr \rceil)|^2} +\end{equation} +Note this procedure is different to the $A_m$ magnitude estimation procedure in (\ref{eq:mag_est}), and is only used locally for the MBE voicing estimation procedure. The MBE amplitude estimation (\ref{eq:est_amp_mbe}) assumes the energy in the band of $S_w(k)$ is from the DFT of a sine wave in that band, and unlike (\ref{eq:mag_est}) is complex valued. + +The synthesised frequency domain speech for this band is defined as: +\begin{equation} +\hat{S}_w(k) = B_m W(k - \lfloor mr \rceil), \quad k=a_m,...,b_m-1 +\end{equation} +The error between the input and synthesised speech in this band is then: +\begin{equation} +\begin{split} +E_m &= \sum_{k=a_m}^{b_m-1} |S_w(k) - \hat{S}_w(k)|^2 \\ + &=\sum_{k=a_m}^{b_m-1} |S_w(k) - B_m W(k + \lfloor mr \rceil)|^2 +\end{split} +\end{equation} +A Signal to Noise Ratio (SNR) ratio is defined as: +\begin{equation} +SNR = \sum_{m=1}^{m_{1000}} \frac{A^2_m}{E_m} +\end{equation} +where $m_{1000}= \lfloor L/4 \rceil$ is the band at approximately 1000 Hz. If the energy in the bands up to 1000 Hz is a good match to a harmonic series of sinusoids then $\hat{S}_w(k) \approx S_w(k)$ and $E_m$ will be small compared to the energy in the band leading to a high SNR. Voicing is declared using the following rule: +\begin{equation} +v = \begin{cases} + 1, & SNR > 6 dB \\ + 0, & otherwise + \end{cases} +\end{equation} +The voicing decision is post processed by several experimentally defined rules applied to $v$ to prevent some of the common voicing errors, see the C source code in \emph{sine.c} for details. + +\subsection{Phase Synthesis} + +The phase of each harmonic is modelled as the phase of a synthesis filter excited by an impulse train. We create the excitation pulse train using $\omega_0$, a binary voicing decision $v$ and a rules based algorithm. Consider a pulse train with a pulse starting time $n=0$, with pulses repeated at a rate of $\omega_0$. A pulse train in the time domain is equivalent to harmonics in the frequency domain. 
We can construct an excitation pulse train using a sum of sinusoids: \begin{equation} @@ -508,9 +548,11 @@ Comparing to speech synthesised using original phases: \item Note that this approach could cause some discontinuities in the phase at the edge of synthesis frames, as no attempt is made to make sure that the phase tracks are continuous (the excitation phases are continuous, but not the final phases after filtering by the LPC spectra). \end{enumerate} +TODO: Clean up. Introduce continuous time index, perhaps l-th frame. Expressions for phase spectra as cascade of two systems. Hilbert transform, might need to study this. Figures and simulation plots would be useful. Voicing decision algorithm. Figure of phase synthesis. + \subsection{LPC/LSP based modes} -Block diagram of LPC/LSP mode encoder and decoder. Walk through operation +Block diagram of LPC/LSP mode encoder and decoder. Walk through operation. Decimation and interpolation. \subsection{Codec 2 700C} @@ -547,6 +589,7 @@ $\omega_0$ & Fundamental frequency (pitch) & radians/sample \\ $s(n)$ & Input speech \\ $s_w(n)$ & Time domain windowed input speech \\ $S_w(k)$ & Frequency domain windowed input speech \\ +$v$ & Voicing decision for the current frame \\ \hline \end{tabular} \caption{Glossary of Symbols} -- cgit v1.2.3 From 9a182563d0613cf1b08f70266ab112f3576b753b Mon Sep 17 00:00:00 2001 From: drowe67 Date: Tue, 28 Nov 2023 07:03:01 +1030 Subject: make notation more consistent across sections --- doc/codec2.pdf | Bin 231649 -> 231965 bytes doc/codec2.tex | 57 +++++++++++++++++++++++++++++++++++---------------------- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index a366711..e577925 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 2bf1d51..336ae2c 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -330,19 +330,20 @@ The magnitude and phase of each harmonic is given by: \label{eq:mag_est} \begin{split} A_m &= \sqrt{\sum_{k=a_m}^{b_m-1} |S_w(k)|^2 } \\ -\theta_m &= arg \left[ S_w(m \omega_0 N_{dft} / 2 \pi) \right)] \\ -a_m &= \left \lfloor \frac{(m - 0.5)\omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor \\ -b_m &= \left \lfloor \frac{(m + 0.5)\omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor +\theta_m &= arg \left[ S_w(\lfloor m r \rceil \right] \\ +a_m &= \lfloor (m - 0.5)r \rceil \\ +b_m &= \lfloor (m + 0.5)r \rceil \\ +r &= \frac{\omega_0 N_{dft}}{2 \pi} \end{split} \end{equation} -The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic. The magnitude $A_m$ is the RMS level of the energy in the band containing the harmonic. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. The phase is sampled at the centre of the band. For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. +The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic; $r$ is a constant that maps the harmonic number $m$ to the nearest DFT index, and $\lfloor x \rceil$ is the rounding operator. The magnitude $A_m$ is the RMS level of the energy in the band containing the harmonic. 
This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. The phase is sampled at the centre of the band. For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. \subsection{Sinusoidal Synthesis} Synthesis is achieved by constructing an estimate of the original speech spectrum using the sinusoidal model parameters for the current frame. This information is then transformed to the time domain using an Inverse DFT (IDFT). To produce a continuous time domain waveform the IDFTs from adjacent frames are smoothly interpolated using a weighted overlap add procedure \cite{mcaulay1986speech}. \begin{figure}[h] -\caption{Sinusoidal Synthesis. At frame $l$ the windowing function generates $2N$ samples. The first $N$ complete the current frame and are the synthesiser output. The second $N$ are stored for summing with the next frame.} +\caption{Sinusoidal Synthesis. At frame $l$ the windowing function generates $2N$ samples. The first $N$ samples complete the current frame and are the synthesiser output. The second $N$ samples are stored for summing with the next frame.} \label{fig:synthesis} \begin{center} \begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] @@ -371,13 +372,10 @@ Synthesis is achieved by constructing an estimate of the original speech spectru The synthetic speech spectrum is constructed using the sinusoidal model parameters by populating a DFT array $\hat{S}_w(k)$ with weighted impulses at the harmonic centres: \begin{equation} -\begin{split} -\hat{S}_w(k) &= \begin{cases} - A_m e^{j\theta_m}, & m=1..L \\ +\hat{S}_w(k) = \begin{cases} + A_m e^{j\theta_m}, & k = \lfloor m r \rceil, m=1..L \\ 0, & otherwise - \end{cases} \\ -k &= \left \lfloor \frac{m \omega_0 N_{dft}}{2 \pi} + 0.5 \right \rfloor -\end{split} + \end{cases} \end{equation} As we wish to synthesise a real time domain signal, $S_w(k)$ is defined to be conjugate symmetric: @@ -467,17 +465,15 @@ The DFT power spectrum of the squared signal $F_w(k)$ generally contains several The accuracy of the pitch estimate in then refined by maximising the function: \begin{equation} -E(\omega_0)=\sum_{m=1}^L|S_w(b m \omega_0)|^2 +E(\omega_0)=\sum_{m=1}^L|S_w(\lfloor r m \rceil)|^2 \end{equation} -where the $\omega_0=2 \pi F_0 /F_s$ is the normalised angular fundamental frequency in radians/sample, $b$ is a constant that maps a frequency in radians/sample to a DFT bin, and $S_\omega$ is the DFT of the speech spectrum for the current frame. This function will be maximised when $mF_0$ aligns with the peak of each harmonic, corresponding with an accurate pitch estimate. It is evaluated in a small range about the coarse $F_0$ estimate. +where $r=\omega_0 N_{dft}/2 \pi$ maps the harmonic number $m$ to a DFT bin. This function will be maximised when $m \omega_0$ aligns with the peak of each harmonic, corresponding with an accurate pitch estimate. It is evaluated in a small range about the coarse $F_0$ estimate. There is nothing particularly unique about this pitch estimator or it's performance. There are occasional artefacts in the synthesised speech that can be traced to ``gross" and ``fine" pitch estimator errors. In the real world no pitch estimator is perfect, partially because the model assumptions around pitch break down (e.g. 
in transition regions or unvoiced speech). The NLP algorithm could benefit from additional review, tuning and better pitch tracking. However it appears sufficient for the use case of a communications quality speech codec, and is a minor source of artefacts in the synthesised speech. Other pitch estimators could also be used, provided they have practical, real world implementations that offer comparable performance and CPU/memory requirements.
 
 \subsection{Voicing Estimation}
 
-In Codec 2 the harmonic phases $\theta_m$ are not transmitted to the decoder, instead they are synthesised at the decoder using a rules based algorithm and information from the remaining model parameters, $\{A_m\}$, $\omega_0$, and $v$, the voicing decision for the current frame.
-
-Voicing is determined using a variation of the MBE voicing algorithm \cite{griffin1988multiband}. Voiced speech consists of a harmonic series of frequency domain impulses, separated by $\omega_0$. When we multiply a segment of the inout speech samples by the window function $w(n)$, we convolve the frequency domain impulses with $W(k)$, the DFT of the $(w)$. Thus for the $m$-th voiced harmonic, we expect to see the shape of the window function $W(k)$ in the band $Sw(k), k=a_m,...,b_m$. The MBE voicing algorithm starts with the assumption that the band is voiced, and measures the error between $S_w(k)$ and the ideal voiced harmonic $\hat{S}_w(k)$.
+Voicing is determined using a variation of the MBE voicing algorithm \cite{griffin1988multiband}. Voiced speech consists of a harmonic series of frequency domain impulses, separated by $\omega_0$. When we multiply a segment of the input speech samples by the window function $w(n)$, we convolve the frequency domain impulses with $W(k)$, the DFT of $w(n)$. Thus for the $m$-th voiced harmonic, we expect to see a copy of the window function $W(k)$ in the band $S_w(k), k=a_m,...,b_m$. The MBE voicing algorithm starts with the assumption that the band is voiced, and measures the error between $S_w(k)$ and the ideal voiced harmonic $\hat{S}_w(k)$.
 
 For each band we first estimate the complex harmonic amplitude (magnitude and phase) using \cite{griffin1988multiband}:
 \begin{equation}
 B_m = \frac{\sum_{k=a_m}^{b_m} S_w(k) W^* (k - \lfloor mr \rceil)}{|\sum_{k=a_m}^{b_m} W (k - \lfloor mr \rceil)|^2}
@@ -488,11 +484,11 @@ where $r= \omega_0 N_{dft}/2 \pi$ is a constant that maps the $m$-th harmonic to
 \label{eq:est_amp_mbe}
 B_m = \frac{\sum_{k=a_m}^{b_m} S_w(k) W (k + \lfloor mr \rceil)}{\sum_{k=a_m}^{b_m} |W (k + \lfloor mr \rceil)|^2}
 \end{equation}
-Note this procedure is different to the $A_m$ magnitude estimation procedure in (\ref{eq:mag_est}), and is only used locally for the MBE voicing estimation procedure. The MBE amplitude estimation (\ref{eq:est_amp_mbe}) assumes the energy in the band of $S_w(k)$ is from the DFT of a sine wave in that band, and unlike (\ref{eq:mag_est}) is complex valued.
+Note this procedure is different to the $A_m$ magnitude estimation procedure in (\ref{eq:mag_est}), and is only used locally for the MBE voicing estimation procedure. Unlike (\ref{eq:mag_est}), the MBE amplitude estimation (\ref{eq:est_amp_mbe}) assumes the energy in the band of $S_w(k)$ is from the DFT of a sine wave, and $B_m$ is complex valued.
 
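For reference, (\ref{eq:est_amp_mbe}) can be written in a few lines of numpy (a sketch only, assuming $S_w(k)$ and $W(k)$ are available as length $N_{dft}$ arrays; it is not the reference C implementation):
\begin{verbatim}
# Sketch of the per-band complex amplitude estimate (illustrative only).
import numpy as np

def estimate_Bm(Sw, W, Wo, m, Ndft=512):
    # complex amplitude of the m-th band, following the equation above
    r = Wo * Ndft / (2 * np.pi)
    a_m = int(round((m - 0.5) * r))
    b_m = int(round((m + 0.5) * r))
    off = int(round(m * r))
    k = np.arange(a_m, b_m + 1)
    Wk = W[(k + off) % Ndft]      # W(k + round(m*r)), DFT treated as periodic
    return np.sum(Sw[k] * Wk) / np.sum(np.abs(Wk) ** 2)
\end{verbatim}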
The synthesised frequency domain speech for this band is defined as: \begin{equation} -\hat{S}_w(k) = B_m W(k - \lfloor mr \rceil), \quad k=a_m,...,b_m-1 +\hat{S}_w(k) = B_m W(k + \lfloor mr \rceil), \quad k=a_m,...,b_m-1 \end{equation} The error between the input and synthesised speech in this band is then: \begin{equation} @@ -505,17 +501,19 @@ A Signal to Noise Ratio (SNR) ratio is defined as: \begin{equation} SNR = \sum_{m=1}^{m_{1000}} \frac{A^2_m}{E_m} \end{equation} -where $m_{1000}= \lfloor L/4 \rceil$ is the band at approximately 1000 Hz. If the energy in the bands up to 1000 Hz is a good match to a harmonic series of sinusoids then $\hat{S}_w(k) \approx S_w(k)$ and $E_m$ will be small compared to the energy in the band leading to a high SNR. Voicing is declared using the following rule: +where $m_{1000}= \lfloor L/4 \rceil$ is the band closest to 1000 Hz. If the energy in the bands up to 1000 Hz is a good match to a harmonic series of sinusoids then $\hat{S}_w(k) \approx S_w(k)$ and $E_m$ will be small compared to the energy in the band resulting in a high SNR. Voicing is declared using the following rule: \begin{equation} v = \begin{cases} 1, & SNR > 6 dB \\ 0, & otherwise \end{cases} \end{equation} -The voicing decision is post processed by several experimentally defined rules applied to $v$ to prevent some of the common voicing errors, see the C source code in \emph{sine.c} for details. +The voicing decision is post processed by several experimentally derived rules to prevent common voicing errors, see the C source code in \emph{sine.c} for details. \subsection{Phase Synthesis} +In Codec 2 the harmonic phases $\theta_m$ are not transmitted to the decoder, instead they are synthesised at the decoder using a rules based algorithm and information from the remaining model parameters, $\{A_m\}$, $\omega_0$, and $v$, the voicing decision for the current frame. + The phase of each harmonic is modelled as the phase of a synthesis filter excited by an impulse train. We create the excitation pulse train using $\omega_0$, a binary voicing decision $v$ and a rules based algorithm. Consider a pulse train with a pulse starting time $n=0$, with pulses repeated at a rate of $\omega_0$. A pulse train in the time domain is equivalent to harmonics in the frequency domain. We can construct an excitation pulse train using a sum of sinusoids: @@ -566,7 +564,22 @@ Block diagram of LPC/LSP mode encoder and decoder. Walk through operation. 
Dec \end{enumerate} \section{Glossary} + \label{sect:glossary} +\begin{table}[H] +\label{tab:acronyms} +\centering +\begin{tabular}{l l l } +\hline +Acronym & Description \\ +\hline +DFT & Discrete Fourier Transform \\ +IDFT & Inverse Discrete Fourier Transform \\ +NLP & Non Linear Pitch (algorithm) \\ +\hline +\end{tabular} +\caption{Glossary of Acronyms} +\end{table} \begin{table}[H] \label{tab:symbol_glossary} @@ -577,7 +590,6 @@ Symbol & Description & Units \\ \hline $a_m$ & lower DFT index of current band \\ $b_m$ & upper DFT index of current band \\ -$b$ & Constant that maps a frequency in radians to a DFT bin \\ $\{A_m\}$ & Set of harmonic magnitudes $m=1,...L$ & dB \\ $F_0$ & Fundamental frequency (pitch) & Hz \\ $F_s$ & Sample rate (usually 8 kHz) & Hz \\ @@ -585,10 +597,11 @@ $F_w(k)$ & DFT of squared speech signal in NLP pitch estimator \\ $L$ & Number of harmonics \\ $P$ & Pitch period & ms or samples \\ $\{\theta_m\}$ & Set of harmonic phases $m=1,...L$ & dB \\ -$\omega_0$ & Fundamental frequency (pitch) & radians/sample \\ +$r$ & Constant that maps a frequency in radians to a DFT index \\ $s(n)$ & Input speech \\ $s_w(n)$ & Time domain windowed input speech \\ $S_w(k)$ & Frequency domain windowed input speech \\ +$\omega_0$ & Fundamental frequency (pitch) & radians/sample \\ $v$ & Voicing decision for the current frame \\ \hline \end{tabular} -- cgit v1.2.3 From ba7321c6f0b2fbd9d08eeda7b03b7a31c0aa878c Mon Sep 17 00:00:00 2001 From: drowe67 Date: Tue, 28 Nov 2023 20:37:41 +1030 Subject: draft of phase synthesis section --- doc/codec2.pdf | Bin 231965 -> 236214 bytes doc/codec2.tex | 68 ++++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index e577925..7f81e46 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 336ae2c..f8d0a5d 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -336,7 +336,7 @@ b_m &= \lfloor (m + 0.5)r \rceil \\ r &= \frac{\omega_0 N_{dft}}{2 \pi} \end{split} \end{equation} -The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic; $r$ is a constant that maps the harmonic number $m$ to the nearest DFT index, and $\lfloor x \rceil$ is the rounding operator. The magnitude $A_m$ is the RMS level of the energy in the band containing the harmonic. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. The phase is sampled at the centre of the band. For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. +The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic; $r$ maps the harmonic number $m$ to the nearest DFT index, and $\lfloor x \rceil$ is the rounding operator. The magnitude $A_m$ is the RMS level of the energy in the band containing the harmonic. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. The phase is sampled at the centre of the band. For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. 
However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. \subsection{Sinusoidal Synthesis} @@ -512,47 +512,68 @@ The voicing decision is post processed by several experimentally derived rules t \subsection{Phase Synthesis} -In Codec 2 the harmonic phases $\theta_m$ are not transmitted to the decoder, instead they are synthesised at the decoder using a rules based algorithm and information from the remaining model parameters, $\{A_m\}$, $\omega_0$, and $v$, the voicing decision for the current frame. - -The phase of each harmonic is modelled as the phase of a synthesis filter excited by an impulse train. We create the excitation pulse train using $\omega_0$, a binary voicing decision $v$ and a rules based algorithm. +In Codec 2 the harmonic phases $\{\theta_m\}$ are not transmitted to the decoder, instead they are synthesised at the decoder using a rules based algorithm and information from the remaining model parameters, $\{A_m\}$, $\omega_0$, and $v$. Consider the source-filter model of speech production: +\begin{equation} +\hat{S}(z)=E(z)H(z) +\end{equation} +where $E(z)$ is an excitation signal with a relatively flat spectrum, and $H(z)$ is a synthesis filter that shapes the magnitude spectrum. The phase of each harmonic is the sum of the excitation and synthesis filter phase: +\begin{equation} +\begin{split} +arg \left[ \hat{S}(e^{j \omega_0 m}) \right] &= arg \left[ E(e^{j \omega_0 m}) H(e^{j \omega_0 m}) \right] \\ +\hat{theta}_m &= arg \left[ E(e^{j \omega_0 m}) \right] + arg \left[ H(e^{j \omega_0 m}) \right] \\ +&= \phi_m + arg \left[ H(e^{j \omega_0 m}) \right] +\end{split} +\end{equation} -Consider a pulse train with a pulse starting time $n=0$, with pulses repeated at a rate of $\omega_0$. A pulse train in the time domain is equivalent to harmonics in the frequency domain. We can construct an excitation pulse train using a sum of sinusoids: +For voiced speech $E(z)$ is an impulse train (period $P$ in the time domain and $\omega_0$ in the frequency domain). We can construct a time domain excitation pulse train using a sum of sinusoids: \begin{equation} -e(n) = \sum_{m-1}^L cos(m \omega_0 n) +e(n) = \sum_{m-1}^L e^{j m \omega_0 (n - n_0)} \end{equation} -The phase of each excitation harmonic is: +Where $n_0$ is a time shift that represents the pulse position relative to the centre of the synthesis frame $n=0$. By finding the DTCF transform of $e(n)$ we can determine the phase of each excitation harmonic: \begin{equation} -\phi_m = m \omega_0 +\phi_m = - m \omega_0 n_0 \end{equation} -As we don't transmit the pulse position for this model, we need to synthesise it. The excitation pulses occur at a rate of $\omega_0$ (one for each pitch period). The phase of the first harmonic advances by $N \phi_0$ radians over a synthesis frame of $N$ samples. For example if $\omega_0 = \pi /20$ (200 Hz), then over a (10ms $N=80$) sample frame, the phase of the first harmonic would advance $(\pi/20)*80 = 4 \pi$ radians or two complete cycles. +As we don't transmit any phase information the pulse position $n_0$ is unknown. Fortunately the ears is insensitive to the absolute position of pitch pulses in voiced speech, as long as they evolve smoothly over time (discontinuities in phase are a characteristic of unvoiced speech). -We generate the excitation phase of the fundamental (first harmonic): +The excitation pulses occur at a rate of $\omega_0$ (one for each pitch period). 
The phase of the first harmonic advances by $N \phi_0$ radians over a synthesis frame of $N$ samples. For example if $\omega_0 = \pi /20$ (200 Hz), then over a (10ms $N=80$) sample frame, the phase of the first harmonic would advance $(\pi/20)*80 = 4 \pi$ radians or two complete cycles. + +We therefore derive $n_0$ from the excitation phase of the fundamental, which we treat as a timing reference. Each frame we advance the phase of the fundamental: \begin{equation} -\phi_1 = \omega_0 N +\phi_1^l = \phi_1^{l-1} + N\omega_0 \end{equation} -We then relate the phase of the m-th excitation harmonic to the phase of the fundamental as: +Given $\phi_1$ we can compute $n_0$ and the excitation phase of the other harmonics: \begin{equation} -\phi_m = m\phi_m +\begin{split} +n_0 &= -\phi_1 / \omega_0 \\ +\phi_m &= - m \omega_0 n_0 \\ + &= m \phi_1, \quad m=2,...,L +\end{split} \end{equation} -This phase spectra then gets passed through the LPC synthesis filter to determine the final harmonic phase. -Comparing to speech synthesised using original phases: +For unvoiced speech $E(z)$ is a white noise signal. At each frame we sample a random number generator on the interval $-/pi ... /pi$ to obtain the excitation phase of each harmonic. We set $\omega_0 = F0_min$ to use a large number of harmonics to synthesise to approximate a noise signal. + +An additional phase component is provided by sampling $H(z)$ at the harmonic centres. The phase spectra of $H(z)$ is derived from the filter magnitude response described by $\{A_m\}$ available at the decoder using minimum phase techniques. The method for deriving the phase differs between Codec 2 modes and is described below in Sections \ref{sect:mode_lpc_lsp} and \ref{sect:mode_newamp1}. This component of the phase tends to disperse the pitch pulse energy in time, especially around spectral peaks (formants) where ``ringing" occurs. + +TODO: phase postfilter + +Comparing to speech synthesised using original phases $\{\theta_m\}$ the following observations have been made: \begin{enumerate} -\item Through headphones speech synthesised with this model is not as good. Through a loudspeaker it is very close to original phases. -\item If there are voicing errors, the speech can sound clicky or staticy. If V speech is mistakenly declared UV, this model tends to synthesise impulses or clicks, as there is usually very little shift or dispersion through the LPC synthesis filter. -\item When combined with LPC amplitude modelling there is an additional drop in quality. I am not sure why, theory is interformant energy is raised making any phase errors more obvious. -\item This synthesis model is effectively the same as a simple LPC-10 vocoders, and yet sounds much better. Why? Conventional wisdom (AMBE, MELP) says mixed voicing is required for high quality speech. -\item I am pretty sure the Lincoln Lab sinusoidal coding guys (like xMBE also from MIT) first described this zero phase model, I need to look up the paper. -\item Note that this approach could cause some discontinuities in the phase at the edge of synthesis frames, as no attempt is made to make sure that the phase tracks are continuous (the excitation phases are continuous, but not the final phases after filtering by the LPC spectra). +\item Through headphones speech synthesised with this model drops in quality. Through a small loudspeaker it is very close to original phases. +\item If there are voicing errors, the speech can sound clicky or staticy. 
If voiced speech is mistakenly declared unvoiced, this model tends to synthesise annoying impulses or clicks, as for voiced speech $H(z)$ is relatively flat (broad, high frequency formants), so there is very little dispersion of the excitation impulses through $H(z)$. +\item When combined with amplitude modelling or quantisation, such that $H(z)$ is derived from $\{\hat{A}_m\}$ there is an additional drop in quality. +\item This synthesis model is effectively the same as a simple LPC-10 vocoders, and yet (especially when $H(z)$ is derived from $\{A_m\}$) sounds much better. Conventional wisdom (AMBE, MELP) says mixed voicing is required for high quality speech. +\item If $H(z)$ is changing rapidly between frames, it's phase contribution may also change rapidly. This approach could cause some discontinuities in the phase at the edge of synthesis frames, as no attempt is made to make sure that the phase tracks are continuous (the excitation phases are continuous, but not the final phases after filtering by $H(z)$). \end{enumerate} -TODO: Clean up. Introduce continuous time index, perhaps l-th frame. Expressions for phase spectra as cascade of two systems. Hilbert transform, might need to study this. Figures and simulation plots would be useful. Voicing decision algorithm. Figure of phase synthesis. +TODO: Energy distribution theory. Need to V model, neural vocoders, non-linear function. Figures and simulation plots would be useful. Figure of phase synthesis. \subsection{LPC/LSP based modes} +\label{sect:mode_lpc_lsp} Block diagram of LPC/LSP mode encoder and decoder. Walk through operation. Decimation and interpolation. \subsection{Codec 2 700C} +\label{sect:mode_newamp1} \section{Further Work} @@ -575,6 +596,7 @@ Acronym & Description \\ \hline DFT & Discrete Fourier Transform \\ IDFT & Inverse Discrete Fourier Transform \\ +MBE & Multi-Band Excitation \\ NLP & Non Linear Pitch (algorithm) \\ \hline \end{tabular} @@ -597,7 +619,7 @@ $F_w(k)$ & DFT of squared speech signal in NLP pitch estimator \\ $L$ & Number of harmonics \\ $P$ & Pitch period & ms or samples \\ $\{\theta_m\}$ & Set of harmonic phases $m=1,...L$ & dB \\ -$r$ & Constant that maps a frequency in radians to a DFT index \\ +$r$ & Maps a harmonic number $m$ to a DFT index \\ $s(n)$ & Input speech \\ $s_w(n)$ & Time domain windowed input speech \\ $S_w(k)$ & Frequency domain windowed input speech \\ -- cgit v1.2.3 From fbbea0946111c90e3c9ab90ac641cb2d5b8b4bc0 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Wed, 29 Nov 2023 07:35:19 +1030 Subject: phase synthesis edits --- doc/codec2.pdf | Bin 236214 -> 236412 bytes doc/codec2.tex | 35 +++++++++++++++++++---------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 7f81e46..dbb7d25 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index f8d0a5d..bc2096f 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -512,7 +512,9 @@ The voicing decision is post processed by several experimentally derived rules t \subsection{Phase Synthesis} -In Codec 2 the harmonic phases $\{\theta_m\}$ are not transmitted to the decoder, instead they are synthesised at the decoder using a rules based algorithm and information from the remaining model parameters, $\{A_m\}$, $\omega_0$, and $v$. 
Consider the source-filter model of speech production: +In Codec 2 the harmonic phases $\{\theta_m\}$ are not transmitted, instead they are synthesised at the decoder from the remaining model parameters, $\{A_m\}$, $\omega_0$, and $v$. The phase model described in this section is referred to as ``zero order" or \emph{phase0} in the source code, as it requires zero model parameters to be transmitted over the channel. + +Consider the source-filter model of speech production: \begin{equation} \hat{S}(z)=E(z)H(z) \end{equation} @@ -520,24 +522,22 @@ where $E(z)$ is an excitation signal with a relatively flat spectrum, and $H(z)$ \begin{equation} \begin{split} arg \left[ \hat{S}(e^{j \omega_0 m}) \right] &= arg \left[ E(e^{j \omega_0 m}) H(e^{j \omega_0 m}) \right] \\ -\hat{theta}_m &= arg \left[ E(e^{j \omega_0 m}) \right] + arg \left[ H(e^{j \omega_0 m}) \right] \\ +\hat{\theta}_m &= arg \left[ E(e^{j \omega_0 m}) \right] + arg \left[ H(e^{j \omega_0 m}) \right] \\ &= \phi_m + arg \left[ H(e^{j \omega_0 m}) \right] \end{split} \end{equation} -For voiced speech $E(z)$ is an impulse train (period $P$ in the time domain and $\omega_0$ in the frequency domain). We can construct a time domain excitation pulse train using a sum of sinusoids: +For voiced speech $E(z)$ is an impulse train (in both the time and frequency domain). We can construct a time domain excitation pulse train using a sum of sinusoids: \begin{equation} -e(n) = \sum_{m-1}^L e^{j m \omega_0 (n - n_0)} +e(n) = \sum_{m-1}^L cos( m \omega_0 (n - n_0)) \end{equation} Where $n_0$ is a time shift that represents the pulse position relative to the centre of the synthesis frame $n=0$. By finding the DTCF transform of $e(n)$ we can determine the phase of each excitation harmonic: \begin{equation} \phi_m = - m \omega_0 n_0 \end{equation} -As we don't transmit any phase information the pulse position $n_0$ is unknown. Fortunately the ears is insensitive to the absolute position of pitch pulses in voiced speech, as long as they evolve smoothly over time (discontinuities in phase are a characteristic of unvoiced speech). - -The excitation pulses occur at a rate of $\omega_0$ (one for each pitch period). The phase of the first harmonic advances by $N \phi_0$ radians over a synthesis frame of $N$ samples. For example if $\omega_0 = \pi /20$ (200 Hz), then over a (10ms $N=80$) sample frame, the phase of the first harmonic would advance $(\pi/20)*80 = 4 \pi$ radians or two complete cycles. +As we don't transmit any phase information the pulse position $n_0$ is unknown at the decoder. Fortunately the ear is insensitive to the absolute position of pitch pulses in voiced speech, as long as they evolve smoothly over time (discontinuities in phase are a characteristic of unvoiced speech). -We therefore derive $n_0$ from the excitation phase of the fundamental, which we treat as a timing reference. Each frame we advance the phase of the fundamental: +The excitation pulses occur at a rate of $\omega_0$ (one for each pitch period). The phase of the first harmonic advances by $N \phi_1$ radians over a synthesis frame of $N$ samples. For example if $\omega_1 = \pi /20$ (200 Hz), then over a (10ms $N=80$) sample frame, the phase of the first harmonic would advance $(\pi/20)80 = 4 \pi$ radians or two complete cycles. We therefore derive $n_0$ from the excitation phase of the fundamental, which we treat as a timing reference. 
Each frame we advance the phase of the fundamental: \begin{equation} \phi_1^l = \phi_1^{l-1} + N\omega_0 \end{equation} @@ -546,27 +546,27 @@ Given $\phi_1$ we can compute $n_0$ and the excitation phase of the other harmon \begin{split} n_0 &= -\phi_1 / \omega_0 \\ \phi_m &= - m \omega_0 n_0 \\ - &= m \phi_1, \quad m=2,...,L + &= m \phi_1 \quad \quad m=2,...,L \end{split} \end{equation} -For unvoiced speech $E(z)$ is a white noise signal. At each frame we sample a random number generator on the interval $-/pi ... /pi$ to obtain the excitation phase of each harmonic. We set $\omega_0 = F0_min$ to use a large number of harmonics to synthesise to approximate a noise signal. +For unvoiced speech $E(z)$ is a white noise signal. At each frame we sample a random number generator on the interval $-\pi ... \pi$ to obtain the excitation phase of each harmonic. We set $F_0 = 50$ Hz to use a large number of harmonics $L=4000/50=80$ for synthesis to best approximate a noise signal. -An additional phase component is provided by sampling $H(z)$ at the harmonic centres. The phase spectra of $H(z)$ is derived from the filter magnitude response described by $\{A_m\}$ available at the decoder using minimum phase techniques. The method for deriving the phase differs between Codec 2 modes and is described below in Sections \ref{sect:mode_lpc_lsp} and \ref{sect:mode_newamp1}. This component of the phase tends to disperse the pitch pulse energy in time, especially around spectral peaks (formants) where ``ringing" occurs. +An additional phase component is provided by sampling $H(z)$ at the harmonic centres. The phase spectra of $H(z)$ is derived from the filter magnitude response using minimum phase techniques. The method for deriving the phase spectra of $H(z)$ differs between Codec 2 modes and is described below in Sections \ref{sect:mode_lpc_lsp} and \ref{sect:mode_newamp1}. This component of the phase tends to disperse the pitch pulse energy in time, especially around spectral peaks (formants). -TODO: phase postfilter +The zero phase model tends to make speech with background noise sound "clicky". With high levels of background noise the low level inter-formant parts of the spectrum will contain noise rather than speech harmonics, so modelling them as voiced (i.e. a continuous, non-random phase track) is inaccurate. Some codecs (like MBE) have a mixed voicing model that breaks the spectrum into voiced and unvoiced regions. However (5-12) bits/frame (5-12) are required to transmit the frequency selective voicing information. Mixed excitation also requires accurate voicing estimation (parameter estimators always break occasionally under exceptional conditions). + +In our case we use a post processing approach which requires no additional bits to be transmitted. The decoder measures the average level of the background noise during unvoiced frames. If a harmonic is less than this level it is made unvoiced by randomising it's phases. Comparing to speech synthesised using original phases $\{\theta_m\}$ the following observations have been made: \begin{enumerate} \item Through headphones speech synthesised with this model drops in quality. Through a small loudspeaker it is very close to original phases. \item If there are voicing errors, the speech can sound clicky or staticy. 
If voiced speech is mistakenly declared unvoiced, this model tends to synthesise annoying impulses or clicks, as for voiced speech $H(z)$ is relatively flat (broad, high frequency formants), so there is very little dispersion of the excitation impulses through $H(z)$. \item When combined with amplitude modelling or quantisation, such that $H(z)$ is derived from $\{\hat{A}_m\}$ there is an additional drop in quality. -\item This synthesis model is effectively the same as a simple LPC-10 vocoders, and yet (especially when $H(z)$ is derived from $\{A_m\}$) sounds much better. Conventional wisdom (AMBE, MELP) says mixed voicing is required for high quality speech. +\item This synthesis model (e.g. a pulse train exciting a LPC filter) is effectively the same as a simple LPC-10 vocoders, and yet (especially when $H(z)$ is derived from unquantised $\{A_m\}$) sounds much better. Conventional wisdom (AMBE, MELP) says mixed voicing is required for high quality speech. \item If $H(z)$ is changing rapidly between frames, it's phase contribution may also change rapidly. This approach could cause some discontinuities in the phase at the edge of synthesis frames, as no attempt is made to make sure that the phase tracks are continuous (the excitation phases are continuous, but not the final phases after filtering by $H(z)$). \end{enumerate} -TODO: Energy distribution theory. Need to V model, neural vocoders, non-linear function. Figures and simulation plots would be useful. Figure of phase synthesis. - \subsection{LPC/LSP based modes} \label{sect:mode_lpc_lsp} @@ -578,10 +578,11 @@ Block diagram of LPC/LSP mode encoder and decoder. Walk through operation. Dec \section{Further Work} \begin{enumerate} -\item Some worked examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters +\item Some worked examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters. Listen to various phases of quantisation. \item How to use Octave tools to single step through codec operation \item Table summarising source files with one line description \item Add doc license (Creative Commons?) +\item Energy distribution theory. Need for V model, neural vocoders, non-linear function. Figures and simulation plots would be useful. Figure of phase synthesis. \end{enumerate} \section{Glossary} @@ -595,6 +596,7 @@ Block diagram of LPC/LSP mode encoder and decoder. Walk through operation. 
Dec Acronym & Description \\ \hline DFT & Discrete Fourier Transform \\ +DTCF & Discrete Time Continuous Frequency Fourier Transform \\ IDFT & Inverse Discrete Fourier Transform \\ MBE & Multi-Band Excitation \\ NLP & Non Linear Pitch (algorithm) \\ @@ -623,6 +625,7 @@ $r$ & Maps a harmonic number $m$ to a DFT index \\ $s(n)$ & Input speech \\ $s_w(n)$ & Time domain windowed input speech \\ $S_w(k)$ & Frequency domain windowed input speech \\ +$\phi_m$ & Phase of excitation harmonic \\ $\omega_0$ & Fundamental frequency (pitch) & radians/sample \\ $v$ & Voicing decision for the current frame \\ \hline -- cgit v1.2.3 From f3b4305e8711209f0925f5bd383f867365c92f35 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Thu, 30 Nov 2023 06:39:36 +1030 Subject: phase model edits and LPC/LSP encoder block diagram --- doc/codec2.pdf | Bin 236412 -> 237912 bytes doc/codec2.tex | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index dbb7d25..b1a1a34 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index bc2096f..6047a3c 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -6,6 +6,7 @@ \usepackage{float} \usepackage{xstring} \usepackage{catchfile} +\usepackage{siunitx} \CatchFileDef{\headfull}{../.git/HEAD}{} \StrGobbleRight{\headfull}{1}[\head] @@ -151,7 +152,7 @@ The parameters of the sinusoidal model are: This section explains how the Codec 2 encoder and decoder works using block diagrams. \begin{figure}[h] -\caption{Codec 2 Encoder} +\caption{Codec 2 Encoder.} \label{fig:codec2_encoder} \begin{center} \begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm,align=center,text width=2cm] @@ -328,15 +329,22 @@ S_w(k) = \sum_{n=-N_{w2}}^{N_{w2}} s_w(n) e^{-j 2 \pi k n / N_{dft}} The magnitude and phase of each harmonic is given by: \begin{equation} \label{eq:mag_est} +A_m = \sqrt{\sum_{k=a_m}^{b_m-1} |S_w(k)|^2 } +\end{equation} +\begin{equation} +\theta_m = arg \left[ S_w(\lfloor m r \rceil \right] +\end{equation} +where: +\begin{equation} \begin{split} -A_m &= \sqrt{\sum_{k=a_m}^{b_m-1} |S_w(k)|^2 } \\ -\theta_m &= arg \left[ S_w(\lfloor m r \rceil \right] \\ a_m &= \lfloor (m - 0.5)r \rceil \\ b_m &= \lfloor (m + 0.5)r \rceil \\ r &= \frac{\omega_0 N_{dft}}{2 \pi} \end{split} \end{equation} -The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic; $r$ maps the harmonic number $m$ to the nearest DFT index, and $\lfloor x \rceil$ is the rounding operator. The magnitude $A_m$ is the RMS level of the energy in the band containing the harmonic. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. The phase is sampled at the centre of the band. For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. +The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic; $r$ maps the harmonic number $m$ to the nearest DFT index, and $\lfloor x \rceil$ is the rounding operator. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. + +The phase is sampled at the centre of the band. 
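As a concrete illustration of the estimator above, the band energy summation can be written in a few lines of C. This is a simplified sketch with illustrative array names (it assumes the complex DFT of the windowed speech is already available), not the reference C99 implementation.

\begin{verbatim}
/* Sketch of the harmonic magnitude estimate: sum the energy of the DFT
 * bins in the band around each harmonic m, A_m = sqrt(sum |Sw(k)|^2).
 * Illustrative only -- names and framing are assumptions.
 */
#include <math.h>

#define NDFT 512
#define PI   3.1415927f

void est_harmonic_mags(const float Sw_re[], const float Sw_im[],
                       float w0, int L, float A[])
{
    float r = w0 * NDFT / (2.0f * PI);                 /* harmonic -> DFT bin */
    for (int m = 1; m <= L; m++) {
        int a_m = (int)floorf((m - 0.5f) * r + 0.5f);  /* lower band edge */
        int b_m = (int)floorf((m + 0.5f) * r + 0.5f);  /* upper band edge */
        float e = 0.0f;
        for (int k = a_m; k < b_m; k++)
            e += Sw_re[k] * Sw_re[k] + Sw_im[k] * Sw_im[k];
        A[m] = sqrtf(e);                               /* energy in m-th band */
    }
}
\end{verbatim}

Summing the energy across the whole band, rather than reading a single bin at the harmonic centre, is what makes the estimate tolerant of small errors in the $F_0$ estimate.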
For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. \subsection{Sinusoidal Synthesis} @@ -413,6 +421,7 @@ The continuous synthesised speech signal $\hat{s}(n)$ for the $l$-th frame is ob From the $N_{dft}$ samples produced by the IDFT (\ref{eq:synth_idft}), after windowing we have $2N$ output samples. The first $N$ output samples $n=0,...N-1$ complete the current frame $l$ and are output from the synthesiser. However we must also compute the contribution to the next frame $n = N,N+1,...,2N-1$. These are stored, and added to samples from the next synthesised frame. \subsection{Non-Linear Pitch Estimation} +\label{sect:nlp} The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is described in detail in chapter 4 of \cite{rowe1997techniques}, and portions of this description are reproduced here. The post processing algorithm used for pitch estimation in Codec 2 is different from \cite{rowe1997techniques} and is described here. The C code \emph{nlp.c} is a useful reference for the fine details of the implementation, and the Octave script \emph{plnlp.m} can by used to plot the internal states and single step through speech, illustrating the operation of the algorithm. @@ -501,10 +510,10 @@ A Signal to Noise Ratio (SNR) ratio is defined as: \begin{equation} SNR = \sum_{m=1}^{m_{1000}} \frac{A^2_m}{E_m} \end{equation} -where $m_{1000}= \lfloor L/4 \rceil$ is the band closest to 1000 Hz. If the energy in the bands up to 1000 Hz is a good match to a harmonic series of sinusoids then $\hat{S}_w(k) \approx S_w(k)$ and $E_m$ will be small compared to the energy in the band resulting in a high SNR. Voicing is declared using the following rule: +where $m_{1000}= \lfloor L/4 \rceil$ is the band closest to 1000 Hz, and $\{A_m\}$ are computed from (\ref{eq:mag_est}). If the energy in the bands up to 1000 Hz is a good match to a harmonic series of sinusoids then $\hat{S}_w(k) \approx S_w(k)$ and $E_m$ will be small compared to the energy in the band resulting in a high SNR. Voicing is declared using the following rule: \begin{equation} v = \begin{cases} - 1, & SNR > 6 dB \\ + 1, & SNR > 6 \si{dB} \\ 0, & otherwise \end{cases} \end{equation} @@ -556,7 +565,7 @@ An additional phase component is provided by sampling $H(z)$ at the harmonic cen The zero phase model tends to make speech with background noise sound "clicky". With high levels of background noise the low level inter-formant parts of the spectrum will contain noise rather than speech harmonics, so modelling them as voiced (i.e. a continuous, non-random phase track) is inaccurate. Some codecs (like MBE) have a mixed voicing model that breaks the spectrum into voiced and unvoiced regions. However (5-12) bits/frame (5-12) are required to transmit the frequency selective voicing information. Mixed excitation also requires accurate voicing estimation (parameter estimators always break occasionally under exceptional conditions). -In our case we use a post processing approach which requires no additional bits to be transmitted. The decoder measures the average level of the background noise during unvoiced frames. If a harmonic is less than this level it is made unvoiced by randomising it's phases. +In our case we use a post processing approach which requires no additional bits to be transmitted. 
The decoder measures the average level of the background noise during unvoiced frames. If a harmonic is less than this level it is made unvoiced by randomising it's phases. See the C source code for implementation details. Comparing to speech synthesised using original phases $\{\theta_m\}$ the following observations have been made: \begin{enumerate} @@ -565,11 +574,47 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \item When combined with amplitude modelling or quantisation, such that $H(z)$ is derived from $\{\hat{A}_m\}$ there is an additional drop in quality. \item This synthesis model (e.g. a pulse train exciting a LPC filter) is effectively the same as a simple LPC-10 vocoders, and yet (especially when $H(z)$ is derived from unquantised $\{A_m\}$) sounds much better. Conventional wisdom (AMBE, MELP) says mixed voicing is required for high quality speech. \item If $H(z)$ is changing rapidly between frames, it's phase contribution may also change rapidly. This approach could cause some discontinuities in the phase at the edge of synthesis frames, as no attempt is made to make sure that the phase tracks are continuous (the excitation phases are continuous, but not the final phases after filtering by $H(z)$). +\item The recent crop of neural vocoders produce high quality speech using a similar parameters set, and notably without transmitting phase information. Although many of these vocoders operate in the time domain, this approach can be interpreted as implementing a function $\{ \hat{\theta}_m\} = F(\omega_0, \{Am\},v)$. This validates the general approach used here, and as future work Codec 2 may benefit from being augmented by machine learning. \end{enumerate} \subsection{LPC/LSP based modes} \label{sect:mode_lpc_lsp} +\begin{figure}[h] +\caption{LPC/LSP Modes Encoder} +\label{fig:encoder_lpc_lsp} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] + +\node [input] (rinput) {}; +\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; +\node [block, right of=z,node distance=1.5cm] (window) {Window}; +\node [tmp, right of=window,node distance=1cm] (z1) {}; +\node [block, right of=z1,node distance=1.5cm] (dft) {DFT}; +\node [block, above of=dft,text width=2cm] (lpc) {LPC Analysis}; +\node [block, right of=lpc,node distance=3cm,text width=2cm] (lsp) {LSP Quantisation}; +\node [block, below of=dft,text width=2cm] (est) {Est Amp}; +\node [block, right of=est,node distance=3cm,text width=2cm] (voicing) {Est Voicing}; +\node [block, below of=window] (nlp) {NLP}; +\node [block, below of=lsp,text width=2cm] (pack) {Bit Packing}; +\node [output, right of=pack,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); +\draw [->] (z) |- (nlp); +\draw [->] (window) -- (dft); +\draw [->] (z1) |- (lpc); +\draw [->] (lpc) -- (lsp); +\draw [->] (lsp) -- (pack); +\draw [->] (dft) -- (est); +\draw [->] (nlp) -- (est); +\draw [->] (est) -- (voicing); +\draw [->] (voicing) -- (pack); +\draw [->] (pack) -- (routput) node[right,align=left,text width=1.5cm] {Bit Stream}; + +\end{tikzpicture} +\end{center} +\end{figure} + Block diagram of LPC/LSP mode encoder and decoder. Walk through operation. Decimation and interpolation. 
\subsection{Codec 2 700C} -- cgit v1.2.3 From 067eaa7998240509c89e32b849654e8f347bed24 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Fri, 1 Dec 2023 11:32:39 +1030 Subject: LPC/LSP enocder description, decoder block diagram --- doc/codec2.pdf | Bin 237912 -> 243348 bytes doc/codec2.tex | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++-- doc/codec2_refs.bib | 11 ++++++++++ 3 files changed, 70 insertions(+), 2 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index b1a1a34..e718b7f 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 6047a3c..62176e7 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -474,6 +474,7 @@ The DFT power spectrum of the squared signal $F_w(k)$ generally contains several The accuracy of the pitch estimate in then refined by maximising the function: \begin{equation} +\label{eq:pitch_refinement} E(\omega_0)=\sum_{m=1}^L|S_w(\lfloor r m \rceil)|^2 \end{equation} where $r=\omega_0 N_{dft}/2 \pi$ maps the harmonic number $m$ to a DFT bin. This function will be maximised when $m \omega_0$ aligns with the peak of each harmonic, corresponding with an accurate pitch estimate. It is evaluated in a small range about the coarse $F_0$ estimate. @@ -508,6 +509,7 @@ E_m &= \sum_{k=a_m}^{b_m-1} |S_w(k) - \hat{S}_w(k)|^2 \\ \end{equation} A Signal to Noise Ratio (SNR) ratio is defined as: \begin{equation} +\label{eq:voicing_snr} SNR = \sum_{m=1}^{m_{1000}} \frac{A^2_m}{E_m} \end{equation} where $m_{1000}= \lfloor L/4 \rceil$ is the band closest to 1000 Hz, and $\{A_m\}$ are computed from (\ref{eq:mag_est}). If the energy in the bands up to 1000 Hz is a good match to a harmonic series of sinusoids then $\hat{S}_w(k) \approx S_w(k)$ and $E_m$ will be small compared to the energy in the band resulting in a high SNR. Voicing is declared using the following rule: @@ -580,6 +582,12 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \subsection{LPC/LSP based modes} \label{sect:mode_lpc_lsp} +In this and the next section we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pair (LSP) model to quantise and transmit the spectral magnitude information over the channel. There is a great deal of material on the topics of linear prediction and LSPs, so they will not be explained here. An excellent reference for LPCs is \cite{makhoul1975linear}. + +Figure \ref{fig:encoder_lpc_lsp} presents the encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). The LPC analysis extracts $p=10$ LPC coefficients $\{a_k\}, k=1..10$ and the LPC energy $E$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{f_k\}, k=1..10$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. + +Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still required for voicing estimation (\ref{eq:voicing_snr}). 
+ \begin{figure}[h] \caption{LPC/LSP Modes Encoder} \label{fig:encoder_lpc_lsp} @@ -593,10 +601,12 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \node [block, right of=z1,node distance=1.5cm] (dft) {DFT}; \node [block, above of=dft,text width=2cm] (lpc) {LPC Analysis}; \node [block, right of=lpc,node distance=3cm,text width=2cm] (lsp) {LSP Quantisation}; +\node [tmp, right of=nlp,node distance=1cm] (z2) {}; +\node [tmp, above of=z2,node distance=1cm] (z3) {}; \node [block, below of=dft,text width=2cm] (est) {Est Amp}; \node [block, right of=est,node distance=3cm,text width=2cm] (voicing) {Est Voicing}; \node [block, below of=window] (nlp) {NLP}; -\node [block, below of=lsp,text width=2cm] (pack) {Bit Packing}; +\node [block, below of=lsp,text width=2.5cm] (pack) {Decimation \&\\Bit Packing}; \node [output, right of=pack,node distance=2cm] (routput) {}; \draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); @@ -607,6 +617,7 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \draw [->] (lsp) -- (pack); \draw [->] (dft) -- (est); \draw [->] (nlp) -- (est); +\draw [->] (z2) -- (z3) -| (pack); \draw [->] (est) -- (voicing); \draw [->] (voicing) -- (pack); \draw [->] (pack) -- (routput) node[right,align=left,text width=1.5cm] {Bit Stream}; @@ -615,13 +626,59 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \end{center} \end{figure} -Block diagram of LPC/LSP mode encoder and decoder. Walk through operation. Decimation and interpolation. +One of the problems with quantising spectral magnitudes in sinusoidal codecs is the time varying number of harmonic magnitudes, as $L=\pi/\omega_0$, and $\omega_0$ varies from frame to frame. As we require a fixed bit rate for our uses cases, it is desirable to have a fixed number of parameters. Using a fixed order LPC model is a neat solution to this problem. Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly. The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope. + +In CELP codecs these problems can be accommodated by the (high bit rate) excitation, and some low rate codecs such as MELP supply supplementary low frequency information to ``correct" the LPC model. + +Before bit packing, the Codec 2 parameters are decimated in time. An update rate of 20ms is used for the highest rate modes, which drops to 40ms for Codec 2 1300, with a corresponding drop in speech quality. The number of bits used to quantise the LPC model via LSPs is also reduced in the lower bit rate modes. This has the effect of making the speech less intelligible, and can introduce annoying buzzy or clicky artefacts into the synthesised speech. Lower fidelity spectral magnitude quantisation also results in more noticeable artefacts from phase synthesis. Neverthless at 1300 bits/s the speech quality is quite usable for HF digital voice, and at 3200 bits/s comparable to closed source codecs at the same bit rate. + +TODO: table of LPC/LSP modes, frame rate. Perhaps make this a table covering all modes. 
+ +\begin{figure}[h] +\caption{LPC/LSP Modes Decoder} +\label{fig:decoder_lpc_lsp} +\begin{center} +\begin{tikzpicture}[auto, node distance=3cm,>=triangle 45,x=1.0cm,y=1.0cm,align=center] + +\node [input] (rinput) {}; +\node [block, right of=rinput,node distance=1.5cm] (unpack) {Unpack}; +\node [block, right of=unpack,node distance=2.5cm] (interp) {Interpolate}; +\node [block, right of=interp,text width=2cm] (lpc) {LSP to LPC}; +\node [block, right of=lpc,text width=2cm] (sample) {Sample $A_m$}; +\node [block, below of=sample,text width=2cm,node distance=2cm] (post) {Post Filter}; +\node [block, left of=post,text width=2.5cm] (synth) {Sinusoidal\\Synthesis}; +\node [output, left of=synth,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {Bit\\Stream} (rinput) -- (unpack); +\draw [->] (unpack) -- (interp); +\draw [->] (interp) -- (lpc); +\draw [->] (lpc) -- (sample); +\draw [->] (sample) -- (post); +\draw [->] (post) -- (synth); +\draw [->] (synth) -- (routput) node[align=left,text width=1.5cm] {$\hat{s}(n)$}; +%\draw [->] (dft) -- (est); +%\draw [->] (nlp) -- (est); +%\draw [->] (z2) -- (z3) -| (pack); +%\draw [->] (est) -- (voicing); +%\draw [->] (voicing) -- (pack); +%\draw [->] (pack) -- (routput) node[right,align=left,text width=1.5cm] {Bit Stream}; + +\end{tikzpicture} +\end{center} +\end{figure} + +TODO expression for linear interpolation. Interpolation in LSP domain. Ear protection. \subsection{Codec 2 700C} \label{sect:mode_newamp1} +Microphone equaliser +ratek study + \section{Further Work} +Summary of mysteries/interesting points drawn out above. + \begin{enumerate} \item Some worked examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters. Listen to various phases of quantisation. \item How to use Octave tools to single step through codec operation diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index 7348902..039accc 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -32,3 +32,14 @@ year={1986}, publisher={IEEE} } + +@article{makhoul1975linear, + title={Linear prediction: A tutorial review}, + author={Makhoul, John}, + journal={Proceedings of the IEEE}, + volume={63}, + number={4}, + pages={561--580}, + year={1975}, + publisher={IEEE} +} -- cgit v1.2.3 From 43defe5bbed510a8fdbb7c8f0fba155d3238b084 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 3 Dec 2023 07:11:51 +1030 Subject: decoder description, mode table --- doc/codec2.pdf | Bin 243348 -> 250500 bytes doc/codec2.tex | 80 ++++++++++++++++++++++++++++++++++++++++++---------- doc/codec2_refs.bib | 11 ++++++++ 3 files changed, 76 insertions(+), 15 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index e718b7f..7fcb0ff 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 62176e7..612c196 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -527,6 +527,7 @@ In Codec 2 the harmonic phases $\{\theta_m\}$ are not transmitted, instead they Consider the source-filter model of speech production: \begin{equation} +\label{eq:source_filter} \hat{S}(z)=E(z)H(z) \end{equation} where $E(z)$ is an excitation signal with a relatively flat spectrum, and $H(z)$ is a synthesis filter that shapes the magnitude spectrum. 
The phase of each harmonic is the sum of the excitation and synthesis filter phase: @@ -582,11 +583,28 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \subsection{LPC/LSP based modes} \label{sect:mode_lpc_lsp} -In this and the next section we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pair (LSP) model to quantise and transmit the spectral magnitude information over the channel. There is a great deal of material on the topics of linear prediction and LSPs, so they will not be explained here. An excellent reference for LPCs is \cite{makhoul1975linear}. +In this and the next section we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pairs (LSPs) to quantise and transmit the spectral magnitude information. There is a great deal of information available on these techniques so they are only briefly described here. -Figure \ref{fig:encoder_lpc_lsp} presents the encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). The LPC analysis extracts $p=10$ LPC coefficients $\{a_k\}, k=1..10$ and the LPC energy $E$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{f_k\}, k=1..10$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. +The source-filter model of speech production was introduced above in Equation (\ref{eq:source_filter}). A relatively flat excitation source $E(z)$ excites a filter $(H(z)$ which models the magnitude spectrum. Linear Predictive Coding (LPC) defines $H(z)$ as an all pole filter: +\begin{equation} +H(z) = \frac{G}{1-\sum_{k=1}^p a_k z^{-k}} = \frac{G}{A(z)} +\end{equation} +where $\{a_k\}, k=1..10$ is a set of p linear prediction coefficients that characterise the filter's frequency response and G is a scalar gain factor. An excellent reference for LPC is \cite{makhoul1975linear}. + +To be useful in low bit rate speech coding it is necessary to quantise and transmit the LPC coefficients using a small number of bits. Direct quantisation of these LPC coefficients is inappropriate due to their large dynamic range (8-10 bits/coefficient). Thus for transmission purposes, especially at low bit rates, other forms such as the Line Spectral Pair (LSP) \cite{itakura1975line} frequencies are used to represent the LPC parameters. The LSP frequencies can be derived by decomposing the $p$-th order polynomial $A(z)$, into symmetric and anti-symmetric polynomials $P(z)$ and $Q(z)$, shown here in factored form: +\begin{equation} +\begin{split} +P(z) &= (1+z^{-1}) \prod_{i=1}^{p/2} (1 - 2cos(\omega_{2i-1} z^{-1} + z^{-2} ) \\ +Q(z) &= (1-z^{-1}) \prod_{i=1}^{p/2} (1 - 2cos(\omega_{2i} z^{-1} + z^{-2} ) +\end{split} +\end{equation} +where $\omega_{2i-1}$ and $\omega_{2i}$ are the LSP frequencies, found by evaluating the polynomials on the unit circle. The LSP frequencies are interlaced with each other, where $0<\omega_1 < \omega_2 <,..., < \omega_p < \pi$. The separation of adjacent LSP frequencies is related to the bandwidth of spectral peaks in $H(z)=G/A(z)$. A small separation indicates a narrow bandwidth. 
$A(z)$ may be reconstructed from $P(z)$ and $Q(z)$ using: +\begin{equation} +A(z) = \frac{P(z)+Q(z)}{2} +\end{equation} +Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $(A(z)$ to $P(z)$ and $Q(z)$ polynomial form. We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. -Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still required for voicing estimation (\ref{eq:voicing_snr}). +Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). LPC analysis determines a set of $p=10$ LPC coefficients $\{a_k\}$ that describe a filter the spectral envelope of the current frame and the LPC energy $E=G^2$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{\omega_i\}$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. Other parameters include the pitch $\omega_0$, LPC energy $E$, and voicing $v$. The quantisation and bit packing source code for each Codec 2 mode can be found in \emph{codec2.c}. Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still required for voicing estimation (\ref{eq:voicing_snr}). \begin{figure}[h] \caption{LPC/LSP Modes Encoder} @@ -632,7 +650,15 @@ In CELP codecs these problems can be accommodated by the (high bit rate) excitat Before bit packing, the Codec 2 parameters are decimated in time. An update rate of 20ms is used for the highest rate modes, which drops to 40ms for Codec 2 1300, with a corresponding drop in speech quality. The number of bits used to quantise the LPC model via LSPs is also reduced in the lower bit rate modes. This has the effect of making the speech less intelligible, and can introduce annoying buzzy or clicky artefacts into the synthesised speech. Lower fidelity spectral magnitude quantisation also results in more noticeable artefacts from phase synthesis. Neverthless at 1300 bits/s the speech quality is quite usable for HF digital voice, and at 3200 bits/s comparable to closed source codecs at the same bit rate. -TODO: table of LPC/LSP modes, frame rate. Perhaps make this a table covering all modes. +Figure \ref{fig:decoder_lpc_lsp} shows the LPC/LSP mode decoder. Frames of bits received at the frame rate are unpacked and resampled to the 10ms internal frame rate using linear interpolation. The spectral magnitude information is resampled by linear interpolation of the LSP frequencies, and converted back to a quantised LPC model $\hat{H}(z)$. The harmonic magnitudes are recovered by averaging the energy of the LPC +spectrum over the region of each harmonic: +\begin{equation} +\hat{A}_m = \sqrt{ \sum_{k=a_m}^{b_m-1} | \hat{H}(k) |^2 } +\end{equation} +where $H(k)$ is the $N_{dft}$ point DFT of the received LPC model for this frame. For phase synthesis, the phase of $H(z)$ is determined by sampling $\hat{H}(k)$ in the centre of each harmonic: +\begin{equation} +arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \right] +\end{equation} \begin{figure}[h] \caption{LPC/LSP Modes Decoder} @@ -644,9 +670,11 @@ TODO: table of LPC/LSP modes, frame rate. 
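As an aside, the interlacing property described above gives a cheap sanity check on decoded LSPs: if quantisation noise pushes two LSP frequencies together or out of order, the reconstructed $A(z)$ can become unstable or develop very sharp resonances. A minimal sketch of such a check is shown below; it is illustrative only (the function name and minimum separation are assumptions), not the reference lsp.c code.

\begin{verbatim}
/* Sketch: enforce 0 < w[0] < w[1] < ... < w[p-1] < pi on decoded LSPs,
 * so the reconstructed LPC synthesis filter remains stable.
 * Illustrative only -- not the reference implementation.
 */
#define PI 3.1415927f

void lsp_order_check(float w[], int p, float min_sep)
{
    for (int i = 1; i < p; i++)
        if (w[i] < w[i - 1] + min_sep)
            w[i] = w[i - 1] + min_sep;   /* push overlapping pairs apart */
    if (w[p - 1] > PI - min_sep)
        w[p - 1] = PI - min_sep;
}
\end{verbatim}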
Perhaps make this a table covering al \node [block, right of=rinput,node distance=1.5cm] (unpack) {Unpack}; \node [block, right of=unpack,node distance=2.5cm] (interp) {Interpolate}; \node [block, right of=interp,text width=2cm] (lpc) {LSP to LPC}; +\node [tmp, right of=interp,node distance=1.25cm] (z1) {}; \node [block, right of=lpc,text width=2cm] (sample) {Sample $A_m$}; -\node [block, below of=sample,text width=2cm,node distance=2cm] (post) {Post Filter}; -\node [block, left of=post,text width=2.5cm] (synth) {Sinusoidal\\Synthesis}; +\node [block, below of=lpc,text width=2cm,node distance=2cm] (phase) {Phase Synthesis}; +\node [block, below of=phase,text width=2.5cm,node distance=2cm] (synth) {Sinusoidal\\Synthesis}; +\node [block, right of=synth,text width=2cm] (post) {Post Filter}; \node [output, left of=synth,node distance=2cm] (routput) {}; \draw [->] node[align=left,text width=2cm] {Bit\\Stream} (rinput) -- (unpack); @@ -655,20 +683,15 @@ TODO: table of LPC/LSP modes, frame rate. Perhaps make this a table covering al \draw [->] (lpc) -- (sample); \draw [->] (sample) -- (post); \draw [->] (post) -- (synth); +\draw [->] (z1) |- (phase); +\draw [->] (phase) -- (synth); +\draw [->] (sample) |- (phase); \draw [->] (synth) -- (routput) node[align=left,text width=1.5cm] {$\hat{s}(n)$}; -%\draw [->] (dft) -- (est); -%\draw [->] (nlp) -- (est); -%\draw [->] (z2) -- (z3) -| (pack); -%\draw [->] (est) -- (voicing); -%\draw [->] (voicing) -- (pack); -%\draw [->] (pack) -- (routput) node[right,align=left,text width=1.5cm] {Bit Stream}; \end{tikzpicture} \end{center} \end{figure} -TODO expression for linear interpolation. Interpolation in LSP domain. Ear protection. - \subsection{Codec 2 700C} \label{sect:mode_newamp1} @@ -687,9 +710,32 @@ Summary of mysteries/interesting points drawn out above. \item Energy distribution theory. Need for V model, neural vocoders, non-linear function. Figures and simulation plots would be useful. Figure of phase synthesis. 
\end{enumerate} -\section{Glossary} +\section{Codec 2 Modes} \label{sect:glossary} + +\begin{table}[H] +\label{tab:codec2_modes} +\centering +\begin{tabular}{p{0.75cm}|p{0.75cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{5cm}} +\hline +Mode & Frm (ms) & Bits & $A_m$ & $E$ & $\omega_0$ & $v$ & Comment \\ +\hline +3200 & 20 & 64 & 50 & 5 & 7 & 2 & LSP differences \\ +2400 & 20 & 50 & 36 & 8 & - & 2 & Joint $\omega_0$/E VQ, 2 spare bits \\ +1600 & 40 & 64 & 36 & 10 & 14 & 4 \\ +1400 & 40 & 56 & 36 & 16 & - & 4 \\ +1300 & 40 & 52 & 36 & 5 & 7 & 4 & Joint $\omega_0$/E VQ \\ +1200 & 48 & 40 & 27 & 16 & - & 4 & LSP VQ, Joint $\omega_0$/E VQ, 1 spare \\ +700C & 40 & 28 & 18 & 4 & 6 & - & VQ of log magnitudes \\ +\hline +\end{tabular} +\caption{Codec 2 Modes} +\end{table} + +\section{Glossary} +\label{sect:glossary} + \begin{table}[H] \label{tab:acronyms} \centering @@ -700,6 +746,8 @@ Acronym & Description \\ DFT & Discrete Fourier Transform \\ DTCF & Discrete Time Continuous Frequency Fourier Transform \\ IDFT & Inverse Discrete Fourier Transform \\ +LPC & Linear Predictive Coding \\ +LSP & Line Spectrum Pair \\ MBE & Multi-Band Excitation \\ NLP & Non Linear Pitch (algorithm) \\ \hline @@ -714,6 +762,7 @@ NLP & Non Linear Pitch (algorithm) \\ \hline Symbol & Description & Units \\ \hline +$A(z)$ & LPC (analysis) filter \\ $a_m$ & lower DFT index of current band \\ $b_m$ & upper DFT index of current band \\ $\{A_m\}$ & Set of harmonic magnitudes $m=1,...L$ & dB \\ @@ -729,6 +778,7 @@ $s_w(n)$ & Time domain windowed input speech \\ $S_w(k)$ & Frequency domain windowed input speech \\ $\phi_m$ & Phase of excitation harmonic \\ $\omega_0$ & Fundamental frequency (pitch) & radians/sample \\ +$\{\omega_i\}$ & set of LSP frequencies \\ $v$ & Voicing decision for the current frame \\ \hline \end{tabular} diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index 039accc..f100db3 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -43,3 +43,14 @@ year={1975}, publisher={IEEE} } + +@article{itakura1975line, + title={Line spectrum representation of linear predictor coefficients of speech signals}, + author={Itakura, Fumitada}, + journal={The Journal of the Acoustical Society of America}, + volume={57}, + number={S1}, + pages={S35--S35}, + year={1975}, + publisher={AIP Publishing} +} -- cgit v1.2.3 From 009897669310d1c74cc512b5b82cc952df078294 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Thu, 7 Dec 2023 05:52:41 +1030 Subject: building up 700C section --- doc/codec2.pdf | Bin 250500 -> 286543 bytes doc/codec2.tex | 152 ++++++++++++++++++++++++++++++++++++++++++++++++-- doc/codec2_refs.bib | 8 +++ doc/ratek_mel_fhz.png | Bin 0 -> 11685 bytes doc/warp_fhz_k.png | Bin 0 -> 9209 bytes 5 files changed, 155 insertions(+), 5 deletions(-) create mode 100644 doc/ratek_mel_fhz.png create mode 100644 doc/warp_fhz_k.png diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 7fcb0ff..d9b5294 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 612c196..d034dc6 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -3,6 +3,7 @@ \usepackage{hyperref} \usepackage{tikz} \usetikzlibrary{calc,arrows,shapes,positioning} +\usepackage{tkz-euclide} \usepackage{float} \usepackage{xstring} \usepackage{catchfile} @@ -583,9 +584,9 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi \subsection{LPC/LSP based modes} \label{sect:mode_lpc_lsp} -In this and the next section we explain how the codec building blocks above are assembled to 
create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pairs (LSPs) to quantise and transmit the spectral magnitude information. There is a great deal of information available on these techniques so they are only briefly described here. +In this and the next section we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pairs (LSPs) to quantise and transmit the spectral magnitude information. There is a great deal of information available on these topics so they are only briefly described here. -The source-filter model of speech production was introduced above in Equation (\ref{eq:source_filter}). A relatively flat excitation source $E(z)$ excites a filter $(H(z)$ which models the magnitude spectrum. Linear Predictive Coding (LPC) defines $H(z)$ as an all pole filter: +The source-filter model of speech production was introduced above in Equation (\ref{eq:source_filter}). A relatively flat excitation source $E(z)$ excites a filter $H(z)$ which models the magnitude spectrum of the speech. Linear Predictive Coding (LPC) defines $H(z)$ as an all pole filter: \begin{equation} H(z) = \frac{G}{1-\sum_{k=1}^p a_k z^{-k}} = \frac{G}{A(z)} \end{equation} @@ -602,7 +603,7 @@ where $\omega_{2i-1}$ and $\omega_{2i}$ are the LSP frequencies, found by evalua \begin{equation} A(z) = \frac{P(z)+Q(z)}{2} \end{equation} -Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $(A(z)$ to $P(z)$ and $Q(z)$ polynomial form. We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. +Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $A(z)$ to $P(z)$ and $Q(z)$ polynomial form. We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). LPC analysis determines a set of $p=10$ LPC coefficients $\{a_k\}$ that describe a filter the spectral envelope of the current frame and the LPC energy $E=G^2$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{\omega_i\}$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. Other parameters include the pitch $\omega_0$, LPC energy $E$, and voicing $v$. The quantisation and bit packing source code for each Codec 2 mode can be found in \emph{codec2.c}. Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still required for voicing estimation (\ref{eq:voicing_snr}). 
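To make the role of the all-pole model concrete, the sketch below samples $|H(e^{j\omega})|=G/|A(e^{j\omega})|$ at the harmonic centres, which is essentially the operation the decoder relies on to recover spectral magnitude information from the quantised LPC model. It is an illustrative fragment using C99 complex arithmetic, with assumed names, not the reference implementation.

\begin{verbatim}
/* Sketch: sample the LPC magnitude response at harmonics of w0,
 * |H(e^{j w0 m})| = G / |A(e^{j w0 m})|, with a[1..p] the LPC
 * coefficients. Illustrative only.
 */
#include <complex.h>
#include <math.h>

void lpc_mag_at_harmonics(const float a[], int p, float G,
                          float w0, int L, float Am[])
{
    for (int m = 1; m <= L; m++) {
        float complex Az = 1.0f;               /* A(z) = 1 - sum a_k z^-k */
        for (int k = 1; k <= p; k++)
            Az -= a[k] * cexpf(-I * (float)k * (float)m * w0);
        Am[m] = G / cabsf(Az);
    }
}
\end{verbatim}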
@@ -695,8 +696,149 @@ arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \subsection{Codec 2 700C} \label{sect:mode_newamp1} -Microphone equaliser -ratek study +To efficiently transmit spectral amplitude information Codec 2 700C uses a set of algorithms collectively denoted \emph{newamp1}. One of these algorithms is the Rate K resampler which transforms the variable length vectors of spectral magnitude samples to fixed length $K$ vectors suitable for vector quantisation. + +Consider a vector $\mathbf{a}$ of $L$ harmonic spectral magnitudes in dB: +\begin{equation} +\mathbf{a} = \begin{bmatrix} 20log_{10}A_1, 20log_{10}A_2, \ldots 20log_{10}A_L \end{bmatrix} +\end{equation} +\begin{equation} +L=\left \lfloor \frac{F_s}{2F_0} \right \rfloor = \left \lfloor \frac{\pi}{\omega_0} \right \rfloor +\end{equation} +$F_0$ and $L$ are time varying as the pitch track evolves over time. For speech sampled at $F_s=8$ kHz $F_0$ is typically in the range of 50 to 400 Hz, giving $L$ in the range of 10 $\ldots$ 80. \\ + +To quantise and transmit $\mathbf{a}$, it is convenient to resample $\mathbf{a}$ to a fixed length $K$ element vector $\mathbf{b}$ using a resampling function: +\begin{equation} +\begin{split} +\mathbf{y} &= \begin{bmatrix} Y_1, Y_2, \ldots Y_L \end{bmatrix} = H(\mathbf{a}) \\ +\mathbf{b} &= \begin{bmatrix} B_1, B_2, \ldots B_K \end{bmatrix} = R(\mathbf{y}) +\end{split} +\end{equation} +Where $H$ is a filter function chosen to smooth the spectral amplitude samples $A_m$ while not significantly altering the perceptual quality of the speech; and $R$ is a resampling function. To model the response of the human ear $B_k$ are sampled on $K$ non-linearly spaced points on the frequency axis: +\begin{equation} +\begin{split} +f_k &= warp(k,K) \ \textrm{Hz} \quad k=1 \ldots K \\ +warp(1,K) &= 200 \ \textrm{Hz} \\ +warp(K,K) &= 3700 \ \textrm{Hz} +\end{split} +\end{equation} +where $warp()$ is a frequency warping function. Codec 2 700C uses $K=20$, $H=1$, and $warp()$ is defined using the Mel function \cite[p 150]{o1997human} (Figure \ref{fig:mel_fhz}) which samples the spectrum more densely at low frequencies, and less densely at high frequencies: +\begin{equation} \label{eq:mel_f} +mel(f) = 2595log_{10}(1+f/700) +\end{equation} +The inverse mapping of $f$ in Hz from $mel(f)$ is given by: +\begin{equation} \label{eq:f_mel} +f = mel^{-1}(x) = 700(10^{x/2595} - 1); +\end{equation} + +\begin{figure}[h] +\caption{Mel function} +\label{fig:mel_fhz} +\begin{center} +\includegraphics[width=8cm]{ratek_mel_fhz} +\end{center} +\end{figure} + +We wish to use $mel(f)$ to construct $warp(k,K)$, such that there are $K$ evenly spaced points on the $mel(f)$ axis (Figure \ref{fig:mel_k}). 
Solving for the equation of a straight line we can obtain $mel(f)$ as a function of $k$, and hence $warp(k,K)$ (Figure \ref{fig:warp_fhz_k}): +\begin{equation} \label{eq:mel_k} +\begin{split} +g &= \frac{mel(3700)-mel(200)}{K-1} \\ +mel(f) &= g(k-1) + mel(200) +\end{split} +\end{equation} +Substituting (\ref{eq:f_mel}) into the LHS: +\begin{equation} \label{eq:warp} +\begin{split} +2595log_{10}(1+f/700) &= g(k-1) + mel(200) \\ +f = warp(k,K) &= mel^{-1} ( g(k-1) + mel(200) ) \\ +\end{split} +\end{equation} +and the inverse warp function: +\begin{equation} \label{warp_inv} +k = warp^{-1}(f,K) = \frac{mel(f)-mel(200)}{g} + 1 +\end{equation} + +\begin{figure}[h] +\caption{Linear mapping of $mel(f)$ to Rate $K$ sample index $k$} +\vspace{5mm} +\label{fig:mel_k} +\centering +\begin{tikzpicture} +\tkzDefPoint(1,1){A} +\tkzDefPoint(5,5){B} +\draw[thick] (1,1) node [right]{(1,mel(200))} -- (5,5) node [right]{(K,mel(3700))}; +\draw[thick,->] (0,0) -- (6,0) node [below]{k}; +\draw[thick,->] (0,0) -- (0,6) node [left]{mel(f)}; +\foreach \n in {A,B} + \node at (\n)[circle,fill,inner sep=1.5pt]{}; +\end{tikzpicture} +\end{figure} + +\begin{figure}[h] +\caption{$warp(k,K)$ function for $K=20$} +\label{fig:warp_fhz_k} +\begin{center} +\includegraphics[width=8cm]{warp_fhz_k} +\end{center} +\end{figure} + +The rate $K$ vector $\mathbf{b}$ is vector quantised for transmission over the channel: +\begin{equation} +\hat{\mathbf{b}} = Q(\mathbf{b}) +\end{equation} +Codec 2 700C uses a two stage VQ with 9 bits (512 entries) per stage. The rate filtered rate $L$ vector can then be recovered by resampling $\mathbf{\hat{b}}$ using another resampling function: +\begin{equation} +\hat{\mathbf{y}} = S(\hat{\mathbf{b}}) +\end{equation} + +Figure \ref{fig:newamp1_encoder} is the Codec 2 700C encoder. Some notes on this algorithm: +\begin{enumerate} +\item The amplitudes and Vector Quantiser (VQ) entries are in dB, which is very nice to work in and matches the ears logarithmic amplitude response. +\item The mode is capable of communications quality speech and is in common use with FreeDV, but is close to the lower limits of intelligibility, and doesn't do well in some languages (problems have been reported with German and Japanese). +\item The VQ was trained on just 120 seconds of data - way too short. +\item The parameter set (pitch, voicing, log spectral magnitudes) is very similar to that used for the latest neural vocoders. +\item The input speech may be subject to arbitrary filtering, for example due to the microphone frequency response, room acoustics, and anti-aliasing filter. This filtering is fixed or slowly time varying. The filtering biases the target vectors away from the VQ training material, resulting in significant additional mean square error. The filtering does not greatly affect the input speech quality, however the VQ performance distortion increases and the output speech quality is reduced. This is exacerbated by operating in the log domain, the VQ will try to match very low level, perceptually insignificant energy near 0 and 4000 Hz. A microphone equaliser algorithm has been developed to help adjust to arbitrary microphone filtering. 
+\end{enumerate} + +\begin{figure}[h] +\caption{Codec 2 700C (newamp1) encoder} + +\label{fig:newamp1_encoder} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] + +\node [input] (rinput) {}; +\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; +\node [block, right of=z,node distance=1.5cm] (window) {Window}; +\node [block, right of=window,node distance=2.5cm] (dft) {DFT}; +\node [block, right of=dft,node distance=3cm,text width=1.5cm] (est) {Est Amp}; +\node [block, below of=window] (nlp) {NLP}; +\node [block, below of=nlp] (log) {log $\omega_0$}; +\node [block, below of=est,node distance=2cm,text width=2cm] (resample) {Resample Rate $K$}; +\node [block, right of=est,node distance=2.5cm,text width=1.5cm] (voicing) {Est Voicing}; +\node [tmp, below of=resample,node distance=1cm] (z1) {}; +\node [block, below of=dft,node distance=2cm,text width=2cm] (vq) {Decimate \& VQ}; +\node [block, below of=vq,node distance=2cm,text width=2cm] (pack) {Bit Packing}; +\node [output, right of=pack,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); +\draw [->] (z) |- (nlp); +\draw [->] (window) -- node[below] {$s_w(n)$} (dft); +\draw [->] (dft) -- node[below] {$S_\omega(k)$} (est); +\draw [->] (est) -- node[right] {$\mathbf{a}$} (resample); +\draw [->] (est) -- (voicing); +\draw [->] (resample) -- node[below] {$\mathbf{b}$} (vq); +\draw [->] (vq) -- (pack); +\draw [->] (nlp) -- (log); +\draw [->] (log) -- (pack); +\draw [->] (voicing) |- (z1) -| (pack); +\draw [->] (pack) -- (routput) node[right] {Bit Stream}; + +\end{tikzpicture} +\end{center} +\end{figure} + +TODO: Microphone equaliser. ratek study \section{Further Work} diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index f100db3..ea9ee6c 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -54,3 +54,11 @@ year={1975}, publisher={AIP Publishing} } + +@book{o1997human, + title={Speech Communication - Human and machine}, + author={O‘Shaughnessy, Douglas}, + publisher={Addison-Wesley Publishing Company}, + year={1997} +} + diff --git a/doc/ratek_mel_fhz.png b/doc/ratek_mel_fhz.png new file mode 100644 index 0000000..c51d409 Binary files /dev/null and b/doc/ratek_mel_fhz.png differ diff --git a/doc/warp_fhz_k.png b/doc/warp_fhz_k.png new file mode 100644 index 0000000..a6cbf3c Binary files /dev/null and b/doc/warp_fhz_k.png differ -- cgit v1.2.3 From 71b86a8a1167b03d650d8ec3930770d8d17a9259 Mon Sep 17 00:00:00 2001 From: David Rowe Date: Thu, 7 Dec 2023 08:00:21 +1030 Subject: mic EQ and VQ mean removal maths --- doc/codec2.pdf | Bin 286543 -> 301693 bytes doc/codec2.tex | 133 +++++++++++++++++++++++++++++----------------------- doc/codec2_refs.bib | 6 +++ 3 files changed, 80 insertions(+), 59 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index d9b5294..d91f1f8 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index d034dc6..2c5d13c 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -696,25 +696,63 @@ arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \subsection{Codec 2 700C} \label{sect:mode_newamp1} -To efficiently transmit spectral amplitude information Codec 2 700C uses a set of algorithms collectively denoted \emph{newamp1}. One of these algorithms is the Rate K resampler which transforms the variable length vectors of spectral magnitude samples to fixed length $K$ vectors suitable for vector quantisation. 
+To efficiently transmit spectral amplitude information Codec 2 700C uses a set of algorithms collectively denoted \emph{newamp1}. One of these algorithms is the Rate K resampler which transforms the variable length vectors of spectral magnitude samples to fixed length $K$ vectors suitable for vector quantisation. Figure \ref{fig:newamp1_encoder} presents the Codec 2 700C encoder. -Consider a vector $\mathbf{a}$ of $L$ harmonic spectral magnitudes in dB: +\begin{figure}[h] +\caption{Codec 2 700C (newamp1) encoder} + +\label{fig:newamp1_encoder} +\begin{center} +\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] + +\node [input] (rinput) {}; +\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; +\node [block, right of=z,node distance=1.5cm] (window) {Window}; +\node [block, right of=window,node distance=2.5cm] (dft) {DFT}; +\node [block, right of=dft,node distance=3cm,text width=1.5cm] (est) {Est Amp}; +\node [block, below of=est,node distance=2cm,text width=2cm] (resample) {Resample Rate $K$}; +\node [block, below of=dft,node distance=2cm,text width=2cm] (eq) {Microphone EQ}; +\node [block, below of=eq,node distance=2cm,text width=2cm] (vq) {Decimate \& VQ}; +\node [block, below of=window] (nlp) {NLP}; +\node [block, below of=nlp] (log) {log $\omega_0$}; +\node [block, below of=resample,node distance=2cm,text width=1.5cm] (voicing) {Est Voicing}; +\node [block, below of=vq,node distance=2cm,text width=2cm] (pack) {Bit Packing}; +\node [tmp, right of=resample,node distance=2cm] (z1) {}; +\node [tmp, below of=vq,node distance=1cm] (z2) {}; +\node [output, right of=pack,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); +\draw [->] (z) |- (nlp); +\draw [->] (window) -- node[below] {$s_w(n)$} (dft); +\draw [->] (dft) -- node[below] {$S_\omega(k)$} (est); +\draw [->] (est) -- node[right] {$\mathbf{a}$} (resample); +\draw [->] (resample) -- node[below] {$\mathbf{b}$} (eq); +\draw [->] (eq) -- (vq); +\draw [->] (vq) -- (pack); +\draw [->] (est) -| (z1) |- (voicing); +\draw [->] (nlp) -- (log); +\draw [->] (log) |- (pack); +\draw [->] (voicing) |- (z2) -| (pack); +\draw [->] (pack) -- (routput) node[right] {Bit Stream}; + +\end{tikzpicture} +\end{center} +\end{figure} + +Consider a vector $\mathbf{a}$ of $L$ harmonic spectral magnitudes expressed in dB: \begin{equation} \mathbf{a} = \begin{bmatrix} 20log_{10}A_1, 20log_{10}A_2, \ldots 20log_{10}A_L \end{bmatrix} \end{equation} \begin{equation} L=\left \lfloor \frac{F_s}{2F_0} \right \rfloor = \left \lfloor \frac{\pi}{\omega_0} \right \rfloor \end{equation} -$F_0$ and $L$ are time varying as the pitch track evolves over time. For speech sampled at $F_s=8$ kHz $F_0$ is typically in the range of 50 to 400 Hz, giving $L$ in the range of 10 $\ldots$ 80. \\ +$F_0$ and $L$ are time varying as the pitch track evolves over time. For speech sampled at $F_s=8$ kHz $F_0$ is typically in the range of 50 to 400 Hz, giving $L$ in the range of 10 $\ldots$ 80. 
To quantise and transmit $\mathbf{a}$, it is convenient to resample $\mathbf{a}$ to a fixed length $K$ element vector $\mathbf{b}$ using a resampling function: \begin{equation} -\begin{split} -\mathbf{y} &= \begin{bmatrix} Y_1, Y_2, \ldots Y_L \end{bmatrix} = H(\mathbf{a}) \\ -\mathbf{b} &= \begin{bmatrix} B_1, B_2, \ldots B_K \end{bmatrix} = R(\mathbf{y}) -\end{split} +\mathbf{b} = \begin{bmatrix} B_1, B_2, \ldots B_K \end{bmatrix} = R(\mathbf{a}) \end{equation} -Where $H$ is a filter function chosen to smooth the spectral amplitude samples $A_m$ while not significantly altering the perceptual quality of the speech; and $R$ is a resampling function. To model the response of the human ear $B_k$ are sampled on $K$ non-linearly spaced points on the frequency axis: +Where $R$ is a resampling function. To model the response of the human ear $B_k$ are sampled on $K$ non-linearly spaced points on the frequency axis: \begin{equation} \begin{split} f_k &= warp(k,K) \ \textrm{Hz} \quad k=1 \ldots K \\ @@ -722,7 +760,7 @@ warp(1,K) &= 200 \ \textrm{Hz} \\ warp(K,K) &= 3700 \ \textrm{Hz} \end{split} \end{equation} -where $warp()$ is a frequency warping function. Codec 2 700C uses $K=20$, $H=1$, and $warp()$ is defined using the Mel function \cite[p 150]{o1997human} (Figure \ref{fig:mel_fhz}) which samples the spectrum more densely at low frequencies, and less densely at high frequencies: +where $warp()$ is a frequency warping function. Codec 2 700C uses $K=20$, and $warp()$ is defined using the Mel function \cite[p 150]{o1997human} (Figure \ref{fig:mel_fhz}) which samples the spectrum more densely at low frequencies, and less densely at high frequencies: \begin{equation} \label{eq:mel_f} mel(f) = 2595log_{10}(1+f/700) \end{equation} @@ -782,63 +820,37 @@ k = warp^{-1}(f,K) = \frac{mel(f)-mel(200)}{g} + 1 \end{center} \end{figure} -The rate $K$ vector $\mathbf{b}$ is vector quantised for transmission over the channel: +The input speech may be subject to arbitrary filtering, for example due to the microphone frequency response, room acoustics, and anti-aliasing filter. This filtering is fixed or slowly time varying. The filtering biases the target vectors away from the VQ training material, resulting in significant additional mean square error. The filtering does not greatly affect the input speech quality, however the VQ performance distortion increases and the output speech quality is reduced. This is exacerbated by operating in the log domain, the VQ will try to match very low level, perceptually insignificant energy near 0 and 4000 Hz. A microphone equaliser algorithm has been developed to help adjust to arbitrary microphone filtering. + +For every input frame $l$, the equaliser (EQ) updates the dimension $K$ equaliser vector $\mathbf{e}$: \begin{equation} -\hat{\mathbf{b}} = Q(\mathbf{b}) +\mathbf{e}^{l+1} = \mathbf{e}^l + \beta(\mathbf{b} - \mathbf{t}) \end{equation} -Codec 2 700C uses a two stage VQ with 9 bits (512 entries) per stage. The rate filtered rate $L$ vector can then be recovered by resampling $\mathbf{\hat{b}}$ using another resampling function: +where $\mathbf{t}$ is a fixed target vector set to the mean of the VQ quantiser, and $\beta$ is a small adaption constant. 
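In code, this equaliser update is a simple leaky adaption of one value per band, applied every frame. The sketch below is illustrative (the value of $\beta$ and the buffer layout are assumptions), not the reference implementation.

\begin{verbatim}
/* Sketch of the microphone equaliser update e <- e + beta*(b - t),
 * where b is the current rate K vector (dB) and t is a fixed target.
 * Illustrative only -- beta and layout are assumptions.
 */
#define K 20

void eq_update(float e[K], const float b[K], const float t[K], float beta)
{
    for (int k = 0; k < K; k++)
        e[k] += beta * (b[k] - t[k]);  /* slowly track fixed mic/channel tilt */
}
\end{verbatim}

The equalised vector $\mathbf{b}-\mathbf{e}$ is then mean-removed and vector quantised as described next.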
+ +The equalised, mean removed rate $K$ vector $\mathbf{d}$ is vector quantised for transmission over the channel: \begin{equation} -\hat{\mathbf{y}} = S(\hat{\mathbf{b}}) +\begin{split} +\mathbf{c} &= \mathbf{b} - \mathbf{e} \\ +\mathbf{d} &= \mathbf{c} - \bar{\mathbf{c}} \\ +\hat{\mathbf{c}} &= VQ(\mathbf{d}) + Q(\bar{\mathbf{c}}) +\end{split} \end{equation} - -Figure \ref{fig:newamp1_encoder} is the Codec 2 700C encoder. Some notes on this algorithm: +Codec 2 700C uses a two stage VQ with 9 bits (512 entries) per stage. Note that VQ is performed in the $log$ amplitude (dB) domain. The mean of $\mathbf{c}$ is removed prior to VQ and scalar quantised and transmitted separately as the frame energy. The rate $L$ vector $\hat{\mathbf{y}}$ can then be recovered by resampling $\mathbf{\hat{c}}$: +\begin{equation} +\hat{\mathbf{y}} = S(\hat{\mathbf{c}}) +\end{equation} + +Some notes on the Codec 2 700C \emph{newamp1} algorithms: \begin{enumerate} -\item The amplitudes and Vector Quantiser (VQ) entries are in dB, which is very nice to work in and matches the ears logarithmic amplitude response. +\item The amplitudes and Vector Quantiser (VQ) entries are in dB, which matches the ears logarithmic amplitude response. \item The mode is capable of communications quality speech and is in common use with FreeDV, but is close to the lower limits of intelligibility, and doesn't do well in some languages (problems have been reported with German and Japanese). \item The VQ was trained on just 120 seconds of data - way too short. \item The parameter set (pitch, voicing, log spectral magnitudes) is very similar to that used for the latest neural vocoders. -\item The input speech may be subject to arbitrary filtering, for example due to the microphone frequency response, room acoustics, and anti-aliasing filter. This filtering is fixed or slowly time varying. The filtering biases the target vectors away from the VQ training material, resulting in significant additional mean square error. The filtering does not greatly affect the input speech quality, however the VQ performance distortion increases and the output speech quality is reduced. This is exacerbated by operating in the log domain, the VQ will try to match very low level, perceptually insignificant energy near 0 and 4000 Hz. A microphone equaliser algorithm has been developed to help adjust to arbitrary microphone filtering. 
-\end{enumerate} - -\begin{figure}[h] -\caption{Codec 2 700C (newamp1) encoder} - -\label{fig:newamp1_encoder} -\begin{center} -\begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] - -\node [input] (rinput) {}; -\node [tmp, right of=rinput,node distance=0.5cm] (z) {}; -\node [block, right of=z,node distance=1.5cm] (window) {Window}; -\node [block, right of=window,node distance=2.5cm] (dft) {DFT}; -\node [block, right of=dft,node distance=3cm,text width=1.5cm] (est) {Est Amp}; -\node [block, below of=window] (nlp) {NLP}; -\node [block, below of=nlp] (log) {log $\omega_0$}; -\node [block, below of=est,node distance=2cm,text width=2cm] (resample) {Resample Rate $K$}; -\node [block, right of=est,node distance=2.5cm,text width=1.5cm] (voicing) {Est Voicing}; -\node [tmp, below of=resample,node distance=1cm] (z1) {}; -\node [block, below of=dft,node distance=2cm,text width=2cm] (vq) {Decimate \& VQ}; -\node [block, below of=vq,node distance=2cm,text width=2cm] (pack) {Bit Packing}; -\node [output, right of=pack,node distance=2cm] (routput) {}; - -\draw [->] node[align=left,text width=2cm] {$s(n)$} (rinput) -- (window); -\draw [->] (z) |- (nlp); -\draw [->] (window) -- node[below] {$s_w(n)$} (dft); -\draw [->] (dft) -- node[below] {$S_\omega(k)$} (est); -\draw [->] (est) -- node[right] {$\mathbf{a}$} (resample); -\draw [->] (est) -- (voicing); -\draw [->] (resample) -- node[below] {$\mathbf{b}$} (vq); -\draw [->] (vq) -- (pack); -\draw [->] (nlp) -- (log); -\draw [->] (log) -- (pack); -\draw [->] (voicing) |- (z1) -| (pack); -\draw [->] (pack) -- (routput) node[right] {Bit Stream}; - -\end{tikzpicture} -\end{center} -\end{figure} +\item The Rate K algorithms were recently revisited, several improvements proposed and prototyped \cite{rowe2023ratek}. +\end{enumerate}. -TODO: Microphone equaliser. ratek study +TODO: Post filters for LPC/LSP and 700C. \section{Further Work} @@ -853,7 +865,7 @@ Summary of mysteries/interesting points drawn out above. 
\end{enumerate} -\section{Codec 2 Modes} +\section{Summary of Codec 2 Modes} \label{sect:glossary} \begin{table}[H] @@ -868,7 +880,7 @@ Mode & Frm (ms) & Bits & $A_m$ & $E$ & $\omega_0$ & $v$ & Comment \\ 1600 & 40 & 64 & 36 & 10 & 14 & 4 \\ 1400 & 40 & 56 & 36 & 16 & - & 4 \\ 1300 & 40 & 52 & 36 & 5 & 7 & 4 & Joint $\omega_0$/E VQ \\ -1200 & 48 & 40 & 27 & 16 & - & 4 & LSP VQ, Joint $\omega_0$/E VQ, 1 spare \\ +1200 & 40 & 48 & 27 & 16 & - & 4 & LSP VQ, Joint $\omega_0$/E VQ, 1 spare \\ 700C & 40 & 28 & 18 & 4 & 6 & - & VQ of log magnitudes \\ \hline \end{tabular} @@ -887,11 +899,14 @@ Acronym & Description \\ \hline DFT & Discrete Fourier Transform \\ DTCF & Discrete Time Continuous Frequency Fourier Transform \\ +EQ & (microphone) Equaliser \\ IDFT & Inverse Discrete Fourier Transform \\ LPC & Linear Predictive Coding \\ LSP & Line Spectrum Pair \\ MBE & Multi-Band Excitation \\ +MSE & Mean Square Error \\ NLP & Non Linear Pitch (algorithm) \\ +VQ & Vector Quantiser \\ \hline \end{tabular} \caption{Glossary of Acronyms} diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index ea9ee6c..9999286 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -55,6 +55,7 @@ publisher={AIP Publishing} } + @book{o1997human, title={Speech Communication - Human and machine}, author={O‘Shaughnessy, Douglas}, @@ -62,3 +63,8 @@ year={1997} } +@misc{rowe2023ratek, + title = {{FreeDV-015 Codec 2 Rate K Resampler}}, + year = {2023}, + note = {\url{https://github.com/drowe67/misc/blob/master/ratek_resampler/ratek_resampler.pdf}} +} -- cgit v1.2.3 From 670b278f60b796ce3717960a28985d121f8ea68b Mon Sep 17 00:00:00 2001 From: David Rowe Date: Sat, 9 Dec 2023 08:16:53 +1030 Subject: aligning 700C figures with maths --- doc/codec2.pdf | Bin 301693 -> 310563 bytes doc/codec2.tex | 126 +++++++++++++++++++++++++++++++++++++--------------- doc/codec2_refs.bib | 14 ++++++ 3 files changed, 104 insertions(+), 36 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index d91f1f8..f5f2804 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 2c5d13c..f1ea924 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -307,7 +307,7 @@ Figure \ref{fig:analysis} illustrates the processing steps in the sinusoidal ana \end{center} \end{figure} -For the purposes of speech analysis the time domain speech signal $s(n)$ is divided into overlapping analysis windows (frames) of $N_w=279$ samples. The centre of each analysis window is separated by $N=80$ samples, or an internal frame rate or 10ms. To analyse the $l$-th frame it is convenient to convert the fixed time reference to a sliding time reference centred on the current analysis window: +The time domain speech signal $s(n)$ is divided into overlapping analysis windows (frames) of $N_w=279$ samples. The centre of each analysis window is separated by $N=80$ or 10ms. Codec 2 operates at an internal frame rate of 100 Hz. To analyse the $l$-th frame it is convenient to convert the fixed time reference to a sliding time reference centred on the current analysis window: \begin{equation} s_w(n) = s(lN + n) w(n), \quad n = - N_{w2} ... N_{w2} \end{equation} @@ -352,7 +352,7 @@ The phase is sampled at the centre of the band. For all practical Codec 2 modes Synthesis is achieved by constructing an estimate of the original speech spectrum using the sinusoidal model parameters for the current frame. This information is then transformed to the time domain using an Inverse DFT (IDFT). 
To produce a continuous time domain waveform the IDFTs from adjacent frames are smoothly interpolated using a weighted overlap add procedure \cite{mcaulay1986speech}. \begin{figure}[h] -\caption{Sinusoidal Synthesis. At frame $l$ the windowing function generates $2N$ samples. The first $N$ samples complete the current frame and are the synthesiser output. The second $N$ samples are stored for summing with the next frame.} +\caption{Sinusoidal Synthesis. At frame $l$ the windowing function generates $2N$ samples. The first $N$ samples complete the current frame. The second $N$ samples are stored for summing with the next frame.} \label{fig:synthesis} \begin{center} \begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] @@ -565,7 +565,7 @@ n_0 &= -\phi_1 / \omega_0 \\ For unvoiced speech $E(z)$ is a white noise signal. At each frame we sample a random number generator on the interval $-\pi ... \pi$ to obtain the excitation phase of each harmonic. We set $F_0 = 50$ Hz to use a large number of harmonics $L=4000/50=80$ for synthesis to best approximate a noise signal. -An additional phase component is provided by sampling $H(z)$ at the harmonic centres. The phase spectra of $H(z)$ is derived from the filter magnitude response using minimum phase techniques. The method for deriving the phase spectra of $H(z)$ differs between Codec 2 modes and is described below in Sections \ref{sect:mode_lpc_lsp} and \ref{sect:mode_newamp1}. This component of the phase tends to disperse the pitch pulse energy in time, especially around spectral peaks (formants). +The second phase component is provided by sampling the phase of $H(z)$ at the harmonic centres. The phase spectra of $H(z)$ is derived from the magnitude response using minimum phase techniques. The method for deriving the phase spectra of $H(z)$ differs between Codec 2 modes and is described below in Sections \ref{sect:mode_lpc_lsp} and \ref{sect:mode_newamp1}. This component of the phase tends to disperse the pitch pulse energy in time, especially around spectral peaks (formants). The zero phase model tends to make speech with background noise sound "clicky". With high levels of background noise the low level inter-formant parts of the spectrum will contain noise rather than speech harmonics, so modelling them as voiced (i.e. a continuous, non-random phase track) is inaccurate. Some codecs (like MBE) have a mixed voicing model that breaks the spectrum into voiced and unvoiced regions. However (5-12) bits/frame (5-12) are required to transmit the frequency selective voicing information. Mixed excitation also requires accurate voicing estimation (parameter estimators always break occasionally under exceptional conditions). @@ -645,23 +645,15 @@ Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping \end{center} \end{figure} -One of the problems with quantising spectral magnitudes in sinusoidal codecs is the time varying number of harmonic magnitudes, as $L=\pi/\omega_0$, and $\omega_0$ varies from frame to frame. As we require a fixed bit rate for our uses cases, it is desirable to have a fixed number of parameters. Using a fixed order LPC model is a neat solution to this problem. Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly. 
The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope. +One of the problems with quantising spectral magnitudes in sinusoidal codecs is the time varying number of harmonic magnitudes, as $L=\pi/\omega_0$, and $\omega_0$ varies from frame to frame. As we require a fixed bit rate for our uses cases, it is desirable to have a fixed number of parameters. Using a fixed order LPC model is a neat solution to this problem. Another feature of LPC modelling combined with scalar LSP quantisation is a tolerance to variations in the input frequency response (see section \ref{sect:mode_newamp1} for more information on this issue). + +Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly. The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope. In CELP codecs these problems can be accommodated by the (high bit rate) excitation, and some low rate codecs such as MELP supply supplementary low frequency information to ``correct" the LPC model. -Before bit packing, the Codec 2 parameters are decimated in time. An update rate of 20ms is used for the highest rate modes, which drops to 40ms for Codec 2 1300, with a corresponding drop in speech quality. The number of bits used to quantise the LPC model via LSPs is also reduced in the lower bit rate modes. This has the effect of making the speech less intelligible, and can introduce annoying buzzy or clicky artefacts into the synthesised speech. Lower fidelity spectral magnitude quantisation also results in more noticeable artefacts from phase synthesis. Neverthless at 1300 bits/s the speech quality is quite usable for HF digital voice, and at 3200 bits/s comparable to closed source codecs at the same bit rate. +Before bit packing, the Codec 2 parameters are decimated in time. An update rate of 20ms is used for the highest rate modes, which drops to 40ms for Codec 2 1300, with a corresponding drop in speech quality. The number of bits used to quantise the LPC model via LSPs is also reduced in the lower bit rate modes. This has the effect of making the speech less intelligible, and can introduce annoying buzzy or clicky artefacts into the synthesised speech. Lower fidelity spectral magnitude quantisation also results in more noticeable artefacts from phase synthesis. Nevertheless at 1300 bits/s the speech quality is quite usable for HF digital voice, and at 3200 bits/s comparable to closed source codecs at the same bit rate. -Figure \ref{fig:decoder_lpc_lsp} shows the LPC/LSP mode decoder. Frames of bits received at the frame rate are unpacked and resampled to the 10ms internal frame rate using linear interpolation. The spectral magnitude information is resampled by linear interpolation of the LSP frequencies, and converted back to a quantised LPC model $\hat{H}(z)$. 
The harmonic magnitudes are recovered by averaging the energy of the LPC -spectrum over the region of each harmonic: -\begin{equation} -\hat{A}_m = \sqrt{ \sum_{k=a_m}^{b_m-1} | \hat{H}(k) |^2 } -\end{equation} -where $H(k)$ is the $N_{dft}$ point DFT of the received LPC model for this frame. For phase synthesis, the phase of $H(z)$ is determined by sampling $\hat{H}(k)$ in the centre of each harmonic: -\begin{equation} -arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \right] -\end{equation} - -\begin{figure}[h] +\begin{figure}[H] \caption{LPC/LSP Modes Decoder} \label{fig:decoder_lpc_lsp} \begin{center} @@ -675,7 +667,7 @@ arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \node [block, right of=lpc,text width=2cm] (sample) {Sample $A_m$}; \node [block, below of=lpc,text width=2cm,node distance=2cm] (phase) {Phase Synthesis}; \node [block, below of=phase,text width=2.5cm,node distance=2cm] (synth) {Sinusoidal\\Synthesis}; -\node [block, right of=synth,text width=2cm] (post) {Post Filter}; +\node [block, right of=phase,text width=2cm] (post) {Post Filter}; \node [output, left of=synth,node distance=2cm] (routput) {}; \draw [->] node[align=left,text width=2cm] {Bit\\Stream} (rinput) -- (unpack); @@ -683,25 +675,45 @@ arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \draw [->] (interp) -- (lpc); \draw [->] (lpc) -- (sample); \draw [->] (sample) -- (post); -\draw [->] (post) -- (synth); +\draw [->] (post) |- (synth); \draw [->] (z1) |- (phase); \draw [->] (phase) -- (synth); -\draw [->] (sample) |- (phase); +\draw [->] (post) -- (phase); \draw [->] (synth) -- (routput) node[align=left,text width=1.5cm] {$\hat{s}(n)$}; \end{tikzpicture} \end{center} \end{figure} +Figure \ref{fig:decoder_lpc_lsp} shows the LPC/LSP mode decoder. Frames of bits received at the frame rate are unpacked and resampled to the 10ms internal frame rate using linear interpolation. The spectral magnitude information is resampled by linear interpolation of the LSP frequencies, and converted back to a quantised LPC model $\hat{H}(z)$. The harmonic magnitudes are recovered by averaging the energy of the LPC spectrum over the region of each harmonic: +\begin{equation} +\hat{A}_m = \sqrt{ \sum_{k=a_m}^{b_m-1} | \hat{H}(k) |^2 } +\end{equation} +where $H(k)$ is the $N_{dft}$ point DFT of the received LPC model for this frame. For phase synthesis, the phase of $H(z)$ is determined by sampling $\hat{H}(k)$ in the centre of each harmonic: +\begin{equation} +arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \right] +\end{equation} +Prior to sampling the amplitude and phase, a frequency domain post filter is applied to the LPC power spectrum. The algorithm is based on the MBE frequency domain post filter \cite[Section 8.6, p 267]{kondoz1994digital}, which is turn based on the frequency domain post filter from McAulay and Quatieri \cite[Section 4.3, p 148]{kleijn1995speech}. The authors report a significant improvement in speech quality from the post filter, which has also been our experience when applied to Codec 2. The post filter is given by: +\begin{equation} +\label{eq:lpc_lsp_pf} +\begin{split} +P_f(e^{j\omega}) &= g \left( R_w(e^{j \omega} \right))^\beta \\ +R_w(^{j\omega}) &= A(e^{j \omega/ \gamma})/A(e^{j \omega}) +\end{split} +\end{equation} +where $g$ is a gain chosen to such that the energy of at the output of the post filter is the same as the input, $\beta=0.2$, and $\gamma=0.5$. 
The post filter raises the spectral peaks (formants), and pushes down the energy between formants. The $\beta$ term compensates for spectral tilt, such that $R_w$ is similar to the LPC synthesis filter $1/A(z)$ however with equal emphasis at low and high frequencies. The authors suggest the post filter reduces the noise level between formants, an explanation commonly given to post filters used for CELP codecs where significant inter-formant noise exists from the noisy excitation source. However in harmonic sinusoidal codecs there is no excitation noise between formants in $E(z)$. Our theory is the post filter also acts to reduce the bandwidth of spectral peaks, modifying the energy distribution across the time domain pitch cycle in a way that improves intelligibility, especially for low pitched speakers. + +A disadvantage of the post filter is the need for experimentally derived constants. It performs a non-linear operation on the speech spectrum, and if mis-applied can worsen speech quality. As it's operation is not completely understood, it represents a source of future quality improvement. + \subsection{Codec 2 700C} \label{sect:mode_newamp1} -To efficiently transmit spectral amplitude information Codec 2 700C uses a set of algorithms collectively denoted \emph{newamp1}. One of these algorithms is the Rate K resampler which transforms the variable length vectors of spectral magnitude samples to fixed length $K$ vectors suitable for vector quantisation. Figure \ref{fig:newamp1_encoder} presents the Codec 2 700C encoder. +To efficiently transmit spectral amplitude information Codec 2 700C uses a set of algorithms collectively denoted \emph{newamp1}. One of these algorithms is the Rate K resampler which transforms the variable length vectors of spectral magnitude samples to fixed length $K$ vectors suitable for vector quantisation. Figure \ref{fig:encoder_newamp1} presents the Codec 2 700C encoder. -\begin{figure}[h] -\caption{Codec 2 700C (newamp1) encoder} +\begin{figure}[H] +\caption{Codec 2 700C (newamp1) Encoder} -\label{fig:newamp1_encoder} +\label{fig:encoder_newamp1} \begin{center} \begin{tikzpicture}[auto, node distance=2cm,>=triangle 45,x=1.0cm,y=1.0cm, align=center] @@ -727,7 +739,7 @@ To efficiently transmit spectral amplitude information Codec 2 700C uses a set o \draw [->] (dft) -- node[below] {$S_\omega(k)$} (est); \draw [->] (est) -- node[right] {$\mathbf{a}$} (resample); \draw [->] (resample) -- node[below] {$\mathbf{b}$} (eq); -\draw [->] (eq) -- (vq); +\draw [->] (eq) -- node[left] {$\mathbf{c}$} (vq); \draw [->] (vq) -- (pack); \draw [->] (est) -| (z1) |- (voicing); \draw [->] (nlp) -- (log); @@ -778,17 +790,19 @@ f = mel^{-1}(x) = 700(10^{x/2595} - 1); \end{figure} We wish to use $mel(f)$ to construct $warp(k,K)$, such that there are $K$ evenly spaced points on the $mel(f)$ axis (Figure \ref{fig:mel_k}). 
Solving for the equation of a straight line we can obtain $mel(f)$ as a function of $k$, and hence $warp(k,K)$ (Figure \ref{fig:warp_fhz_k}): -\begin{equation} \label{eq:mel_k} +\begin{equation} +\label{eq:mel_k} \begin{split} g &= \frac{mel(3700)-mel(200)}{K-1} \\ mel(f) &= g(k-1) + mel(200) \end{split} \end{equation} Substituting (\ref{eq:f_mel}) into the LHS: -\begin{equation} \label{eq:warp} +\begin{equation} +\label{eq:warp} \begin{split} 2595log_{10}(1+f/700) &= g(k-1) + mel(200) \\ -f = warp(k,K) &= mel^{-1} ( g(k-1) + mel(200) ) \\ +f_k = warp(k,K) &= mel^{-1} ( g(k-1) + mel(200) ) \\ \end{split} \end{equation} and the inverse warp function: @@ -833,13 +847,54 @@ The equalised, mean removed rate $K$ vector $\mathbf{d}$ is vector quantised for \begin{split} \mathbf{c} &= \mathbf{b} - \mathbf{e} \\ \mathbf{d} &= \mathbf{c} - \bar{\mathbf{c}} \\ -\hat{\mathbf{c}} &= VQ(\mathbf{d}) + Q(\bar{\mathbf{c}}) +\hat{\mathbf{c}} &= VQ(\mathbf{d}) + Q(\bar{\mathbf{c}}) \\ + &= \hat{\mathbf{d}} + \hat{\bar{\mathbf{c}}} \end{split} \end{equation} -Codec 2 700C uses a two stage VQ with 9 bits (512 entries) per stage. Note that VQ is performed in the $log$ amplitude (dB) domain. The mean of $\mathbf{c}$ is removed prior to VQ and scalar quantised and transmitted separately as the frame energy. The rate $L$ vector $\hat{\mathbf{y}}$ can then be recovered by resampling $\mathbf{\hat{c}}$: +Codec 2 700C uses a two stage VQ with 9 bits (512 entries) per stage. The \emph{mbest} multi-stage search algorithm is used to jointly search the two stages (using 5 survivors from the first stage). Note that VQ is performed in the $log$ amplitude (dB) domain. The mean of $\mathbf{c}$ is removed prior to VQ and scalar quantised and transmitted separately as the frame energy. At the decoder, the rate $L$ vector $\hat{\mathbf{a}}$ can then be recovered by resampling $\mathbf{\hat{a}}$: \begin{equation} -\hat{\mathbf{y}} = S(\hat{\mathbf{c}}) +\hat{\mathbf{a}} = S(\hat{\mathbf{c}} + \mathbf{p}) \end{equation} +where $\mathbf{p}$ is a post filter vector. The post filter vector is generated from the mean-removed rate $K$ vector $\hat{\mathbf{d}}$ in the $log$ frequency domain: +\begin{equation} +\begin{split} +\mathbf{p} &= G + P_{gain} \left( \hat{\mathbf{d}} + \mathbf{r} \right) - \mathbf{r} \\ +\mathbf{r} &= \begin{bmatrix} R_1, R_2, \ldots R_K \end{bmatrix} \\ + R_k &= 20log_{10}(f_k/300) \quad k=1,...,K +\end{split} +\end{equation} +where $G$ is an energy normalisation term, and $1.2 < P_{gain} < 1.5$ describes the amount if post filtering applied. $G$ and $P_{gain}$ are similar to $g$ and $\beta$ in the LPC/LSP post filter (\ref{eq:lpc_lsp_pf}). The $\mathbf{r}$ term is a high pass (pre-emphasis) filter with +20 dB/decade gain after 300 Hz ($f_k$ is given in (\ref{eq:warp})). The post filtering is applied on the pre-emphasised vector, then the pre-emphasis is removed from the final result. Multiplying by $P_{gain}$ in the $log$ domain is similar to the $\alpha$ power function in (\ref{eq:lpc_lsp_pf}); spectral peaks are moved up, and troughs pushed down. This filter enhances the speech quality but also introduces some artefacts. + +Figure \ref{fig:decoder_newamp1} is the block diagram of the decoder signal processing. Cepstral techniques are used to synthesise a phase spectra $arg[H(e^{j \omega}])$ from $\hat{\mathbf{a}}$ using a minimum phase model. 
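
As a sketch of two of the building blocks defined above, the $warp(k,K)$ frequencies of (\ref{eq:warp}) and the post filter vector $\mathbf{p}$, here in C (illustrative only, not the \emph{newamp1.c} code; $P_{gain}$ and the energy normalisation $G$ are simply passed in):

\begin{verbatim}
/* Sketch (not the newamp1.c code) of the warp(k,K) frequencies and the
 * rate K post filter vector p.  d[] is the mean removed rate K vector
 * in dB; Pgain and the energy normalisation G are supplied by caller. */
#include <math.h>

#define K 20

static double mel(double f)     { return 2595.0 * log10(1.0 + f / 700.0); }
static double mel_inv(double x) { return 700.0 * (pow(10.0, x / 2595.0) - 1.0); }

/* f_k = warp(k,K), k = 1 ... K, spanning 200 to 3700 Hz on a Mel scale */
double warp(int k)
{
  double g = (mel(3700.0) - mel(200.0)) / (K - 1);
  return mel_inv(g * (k - 1) + mel(200.0));
}

/* p_k = G + Pgain*(d_k + R_k) - R_k, with R_k = 20log10(f_k/300) */
void post_filter_vector(const double d[K], double Pgain, double G, double p[K])
{
  for (int k = 1; k <= K; k++) {
    double R = 20.0 * log10(warp(k) / 300.0);
    p[k - 1] = G + Pgain * (d[k - 1] + R) - R;
  }
}
\end{verbatim}

By construction $warp(1)=200$ Hz and $warp(K)=3700$ Hz, and for $P_{gain}>1$ the pre-emphasised vector is scaled in the $log$ domain, moving peaks up and troughs down as described above.
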
+ +\begin{figure}[h] +\caption{Codec 2 700C (newamp1) Decoder} +\label{fig:decoder_newamp1} +\begin{center} +\begin{tikzpicture}[auto, node distance=3cm,>=triangle 45,x=1.0cm,y=1.0cm,align=center] + +\node [input] (rinput) {}; +\node [block, right of=rinput,node distance=1.5cm] (unpack) {Unpack}; +\node [block, right of=unpack,node distance=2.5cm] (interp) {Interpolate}; +\node [block, right of=interp,node distance=3cm,text width=2cm] (post) {Post Filter}; +\node [block, below of=post,text width=2cm,node distance=2cm] (resample) {Resample to Rate $L$}; +\node [block, below of=resample,text width=2cm,node distance=2cm] (synth) {Sinusoidal\\Synthesis}; +\node [tmp, below of=resample,node distance=1cm] (z1) {}; +\node [block, right of=synth,text width=2cm] (phase) {Phase Synthesis}; +\node [output,left of=synth,node distance=2cm] (routput) {}; + +\draw [->] node[align=left,text width=2cm] {Bit\\Stream} (rinput) -- (unpack); +\draw [->] (unpack) -- (interp); +\draw [->] (interp) -- (post); +\draw [->] (post) -- node[left] {$\hat{\mathbf{c}}$} (resample); +\draw [->] (resample) -- node[left] {$\hat{\mathbf{a}}$} (synth); +\draw [->] (resample) -- (z1) -| (phase); +\draw [->] (phase) -- (synth); +\draw [->] (synth) -- (routput) node[align=left,text width=1.5cm] {$\hat{s}(n)$}; + +\end{tikzpicture} +\end{center} +\end{figure} Some notes on the Codec 2 700C \emph{newamp1} algorithms: \begin{enumerate} @@ -847,10 +902,8 @@ Some notes on the Codec 2 700C \emph{newamp1} algorithms: \item The mode is capable of communications quality speech and is in common use with FreeDV, but is close to the lower limits of intelligibility, and doesn't do well in some languages (problems have been reported with German and Japanese). \item The VQ was trained on just 120 seconds of data - way too short. \item The parameter set (pitch, voicing, log spectral magnitudes) is very similar to that used for the latest neural vocoders. -\item The Rate K algorithms were recently revisited, several improvements proposed and prototyped \cite{rowe2023ratek}. -\end{enumerate}. - -TODO: Post filters for LPC/LSP and 700C. +\item The Rate K algorithms were recently revisited, several improvements were proposed and prototyped \cite{rowe2023ratek}. +\end{enumerate} \section{Further Work} @@ -861,7 +914,8 @@ Summary of mysteries/interesting points drawn out above. \item How to use Octave tools to single step through codec operation \item Table summarising source files with one line description \item Add doc license (Creative Commons?) -\item Energy distribution theory. Need for V model, neural vocoders, non-linear function. Figures and simulation plots would be useful. Figure of phase synthesis. +\item Energy distribution theory. Need for V model, neural vocoders, non-linear function. +\item Figures and simulation plots would be useful to better explain algorithms. 
\end{enumerate} @@ -880,7 +934,7 @@ Mode & Frm (ms) & Bits & $A_m$ & $E$ & $\omega_0$ & $v$ & Comment \\ 1600 & 40 & 64 & 36 & 10 & 14 & 4 \\ 1400 & 40 & 56 & 36 & 16 & - & 4 \\ 1300 & 40 & 52 & 36 & 5 & 7 & 4 & Joint $\omega_0$/E VQ \\ -1200 & 40 & 48 & 27 & 16 & - & 4 & LSP VQ, Joint $\omega_0$/E VQ, 1 spare \\ +1200 & 40 & 48 & 27 & 16 & - & 4 & LSP VQ, joint $\omega_0$/E VQ, 1 spare \\ 700C & 40 & 28 & 18 & 4 & 6 & - & VQ of log magnitudes \\ \hline \end{tabular} diff --git a/doc/codec2_refs.bib b/doc/codec2_refs.bib index 9999286..756a92b 100644 --- a/doc/codec2_refs.bib +++ b/doc/codec2_refs.bib @@ -68,3 +68,17 @@ year = {2023}, note = {\url{https://github.com/drowe67/misc/blob/master/ratek_resampler/ratek_resampler.pdf}} } + +@book{kondoz1994digital, + title={Digital speech: coding for low bit rate communication systems}, + author={Kondoz, Ahmet M}, + year={1994}, + publisher={John Wiley \& Sons} +} + +@book{kleijn1995speech, + title={Speech coding and synthesis}, + author={Kleijn, W Bastiaan and Paliwal, Kuldip K}, + year={1995}, + publisher={Elsevier Science Inc.} +} \ No newline at end of file -- cgit v1.2.3 From 348f68f6c8df2882324123e2901aa1cac7c44619 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sat, 9 Dec 2023 19:49:47 +1030 Subject: added LPC/LSP and LPC post figure figures, plus code to generate them --- doc/Makefile | 7 +++++-- doc/codec2.pdf | Bin 310563 -> 318830 bytes doc/codec2.tex | 53 +++++++++++++++++++++++++++++++++++------------------ octave/plamp.m | 42 ++++++++++++++++++++++++++++++++++++++---- src/c2sim.c | 2 +- 5 files changed, 79 insertions(+), 25 deletions(-) diff --git a/doc/Makefile b/doc/Makefile index 3729f6a..aba973c 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -2,14 +2,17 @@ # set these externally with an env variable (e.g. for GitHub action) to override # defaults below. Need to run cmake with -DDUMP + CODEC2_SRC ?= $(HOME)/codec2 CODEC2_BINARY ?= $(HOME)/codec2/build_linux/src PATH := $(PATH):$(CODEC2_BINARY) -PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex +PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex hts2a_37_lsp.tex + +all: $(PLOT_FILES) $(PLOT_FILES): echo $(PATH) - c2sim $(CODEC2_SRC)/raw/hts2a.raw --dump hts2a + c2sim $(CODEC2_SRC)/raw/hts2a.raw --dump hts2a --lpc 10 --lsp --lpcpf DISPLAY=""; printf "plamp('hts2a',f=37,epslatex=1)\nq\n" | octave-cli -qf -p $(CODEC2_SRC)/octave diff --git a/doc/codec2.pdf b/doc/codec2.pdf index f5f2804..c3d1a5f 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index f1ea924..a277026 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -91,12 +91,8 @@ Recently, machine learning has been applied to speech coding. This technology p To explain how Codec 2 works, lets look at some speech. Figure \ref{fig:hts2a_time} shows a short 40ms segment of speech in the time and frequency domain. On the time plot we can see the waveform is changing slowly over time as the word is articulated. On the right hand side it also appears to repeat itself - one cycle looks very similar to the last. This cycle time is the ``pitch period", which for this example is around $P=35$ samples. Given we are sampling at $F_s=8000$ Hz, the pitch period is $P/F_s=35/8000=0.0044$ seconds, or 4.4ms. -Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency $F_0$ is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. 
Turns out of the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency $F_0$. - -Note that each harmonic has it's own amplitude, that varies across frequency. The red line plots the amplitude of each harmonic. In this example there is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. - -\begin{figure} -\caption{ A 40ms segment from the word ``these" from a female speaker, sampled at 8kHz. Top is a plot again time, bottom (blue) is a plot against frequency. The waveform repeats itself every 4.3ms ($F_0=230$ Hz), this is the ``pitch period" of this segment.} +\begin{figure} [H] +\caption{ A 40ms segment from the word ``these" from a female speaker, sampled at 8kHz. Top is a plot against time, bottom (blue) is a plot of the same speech against frequency. The waveform repeats itself every 4.3ms ($F_0=230$ Hz), this is the ``pitch period" of this segment. The red crosses are the sine wave amplitudes, explained in the text.} \label{fig:hts2a_time} \begin{center} \input hts2a_37_sn.tex @@ -105,6 +101,10 @@ Note that each harmonic has it's own amplitude, that varies across frequency. T \end{center} \end{figure} +Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency $F_0$ is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. Turns out of the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency $F_0$. + +Note that each harmonic has it's own amplitude, that varies across frequency. The red line plots the amplitude of each harmonic. In this example there is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. + \subsection{Sinusoidal Speech Coding} A sinewave will cause a spike or spectral line on a spectrum plot, so we can see each spike as a small sine wave generator. Each sine wave generator has it's own frequency that are all multiples of the fundamental pitch frequency (e.g. $230, 460, 690,...$ Hz). They will also have their own amplitude and phase. If we add all the sine waves together (Figure \ref{fig:sinusoidal_model}) we can produce reasonable quality synthesised speech. This is called sinusoidal speech coding and is the speech production ``model" at the heart of Codec 2. @@ -343,7 +343,7 @@ b_m &= \lfloor (m + 0.5)r \rceil \\ r &= \frac{\omega_0 N_{dft}}{2 \pi} \end{split} \end{equation} -The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic; $r$ maps the harmonic number $m$ to the nearest DFT index, and $\lfloor x \rceil$ is the rounding operator. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. +The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic; $r$ maps the harmonic number $m$ to the nearest DFT index, and $\lfloor x \rceil$ is the rounding operator. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. 
Figure $\ref{fig:hts2a_time}$ plots $S_w$ (blue) and $\{A_m\}$ (red) for a sample frame of female speech. The phase is sampled at the centre of the band. For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. @@ -586,11 +586,19 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi In this and the next section we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pairs (LSPs) to quantise and transmit the spectral magnitude information. There is a great deal of information available on these topics so they are only briefly described here. -The source-filter model of speech production was introduced above in Equation (\ref{eq:source_filter}). A relatively flat excitation source $E(z)$ excites a filter $H(z)$ which models the magnitude spectrum of the speech. Linear Predictive Coding (LPC) defines $H(z)$ as an all pole filter: +\begin{figure} [h] +\caption{LPC spectrum $|H(e^{j \omega})|$ (green line) and LSP frequencies $\{\omega_i\}$ (green crosses) for the speech frame in Figure \ref{fig:hts2a_time}. The original speech spectrum (blue) and $A_m$ estimates (red) are provided as references.} +\label{fig:hts2a_lpc_lsp} +\begin{center} +\input hts2a_37_lpc_lsp.tex +\end{center} +\end{figure} + +The source-filter model of speech production was introduced above in Equation (\ref{eq:source_filter}). A spectrally flat excitation source $E(z)$ excites a filter $H(z)$ which models the magnitude spectrum of the speech. In Linear Predictive Coding (LPC), we define $H(z)$ as an all pole filter: \begin{equation} H(z) = \frac{G}{1-\sum_{k=1}^p a_k z^{-k}} = \frac{G}{A(z)} \end{equation} -where $\{a_k\}, k=1..10$ is a set of p linear prediction coefficients that characterise the filter's frequency response and G is a scalar gain factor. An excellent reference for LPC is \cite{makhoul1975linear}. +where $\{a_k\}, k=1..10$ is a set of p linear prediction coefficients that characterise the filters frequency response and G is a scalar gain factor. The coefficients are time varying and are extracted from the input speech signal, typically using a least squares approach. An excellent reference for LPC is \cite{makhoul1975linear}. To be useful in low bit rate speech coding it is necessary to quantise and transmit the LPC coefficients using a small number of bits. Direct quantisation of these LPC coefficients is inappropriate due to their large dynamic range (8-10 bits/coefficient). Thus for transmission purposes, especially at low bit rates, other forms such as the Line Spectral Pair (LSP) \cite{itakura1975line} frequencies are used to represent the LPC parameters. The LSP frequencies can be derived by decomposing the $p$-th order polynomial $A(z)$, into symmetric and anti-symmetric polynomials $P(z)$ and $Q(z)$, shown here in factored form: \begin{equation} @@ -603,9 +611,9 @@ where $\omega_{2i-1}$ and $\omega_{2i}$ are the LSP frequencies, found by evalua \begin{equation} A(z) = \frac{P(z)+Q(z)}{2} \end{equation} -Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $A(z)$ to $P(z)$ and $Q(z)$ polynomial form. 
We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. +Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $A(z)$ to $P(z)$ and $Q(z)$ polynomial form. We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$ to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. -Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). LPC analysis determines a set of $p=10$ LPC coefficients $\{a_k\}$ that describe a filter the spectral envelope of the current frame and the LPC energy $E=G^2$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{\omega_i\}$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. Other parameters include the pitch $\omega_0$, LPC energy $E$, and voicing $v$. The quantisation and bit packing source code for each Codec 2 mode can be found in \emph{codec2.c}. Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still required for voicing estimation (\ref{eq:voicing_snr}). +Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). LPC analysis determines a set of $p=10$ LPC coefficients $\{a_k\}$ that describe a filter the spectral envelope of the current frame and the LPC energy $E=G^2$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{\omega_i\}$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. Other parameters include the pitch $\omega_0$, LPC energy $E$, and voicing $v$. The quantisation and bit packing source code for each Codec 2 mode can be found in \emph{codec2.c}. Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still computed for use in voicing estimation (\ref{eq:voicing_snr}). \begin{figure}[h] \caption{LPC/LSP Modes Encoder} @@ -647,9 +655,9 @@ Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping One of the problems with quantising spectral magnitudes in sinusoidal codecs is the time varying number of harmonic magnitudes, as $L=\pi/\omega_0$, and $\omega_0$ varies from frame to frame. As we require a fixed bit rate for our uses cases, it is desirable to have a fixed number of parameters. Using a fixed order LPC model is a neat solution to this problem. Another feature of LPC modelling combined with scalar LSP quantisation is a tolerance to variations in the input frequency response (see section \ref{sect:mode_newamp1} for more information on this issue). -Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly. 
The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope. +Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly. The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope described by $\{Am\}$. All of these problems can be observed in Figure \ref{fig:hts2a_lpc_lsp}. Thus exciting the LPC model by a simple, spectrally flat $E(z)$ will result in some errors in the reconstructed magnitude speech spectrum. -In CELP codecs these problems can be accommodated by the (high bit rate) excitation, and some low rate codecs such as MELP supply supplementary low frequency information to ``correct" the LPC model. +In CELP codecs these problems can be accommodated by the (high bit rate) excitation used to construct a non-flat $E(z)$, and some low rate codecs such as MELP supply supplementary low frequency information to ``correct" the LPC model. Before bit packing, the Codec 2 parameters are decimated in time. An update rate of 20ms is used for the highest rate modes, which drops to 40ms for Codec 2 1300, with a corresponding drop in speech quality. The number of bits used to quantise the LPC model via LSPs is also reduced in the lower bit rate modes. This has the effect of making the speech less intelligible, and can introduce annoying buzzy or clicky artefacts into the synthesised speech. Lower fidelity spectral magnitude quantisation also results in more noticeable artefacts from phase synthesis. Nevertheless at 1300 bits/s the speech quality is quite usable for HF digital voice, and at 3200 bits/s comparable to closed source codecs at the same bit rate. @@ -693,6 +701,15 @@ where $H(k)$ is the $N_{dft}$ point DFT of the received LPC model for this frame \begin{equation} arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \right] \end{equation} + +\begin{figure} [h] +\caption{LPC post filter. LPC spectrum before $|H(e^{j \omega})|$ (green line) and after (red) post filtering. The distance between the spectral peaks and troughs has been increased. The step change at 1000 Hz is +3dB low frequency boost (see source code).} +\label{fig:hts2a_lpc_pf} +\begin{center} +\input hts2a_37_lpc_pf.tex +\end{center} +\end{figure} + Prior to sampling the amplitude and phase, a frequency domain post filter is applied to the LPC power spectrum. The algorithm is based on the MBE frequency domain post filter \cite[Section 8.6, p 267]{kondoz1994digital}, which is turn based on the frequency domain post filter from McAulay and Quatieri \cite[Section 4.3, p 148]{kleijn1995speech}. The authors report a significant improvement in speech quality from the post filter, which has also been our experience when applied to Codec 2. 
The post filter is given by: \begin{equation} \label{eq:lpc_lsp_pf} @@ -701,7 +718,7 @@ P_f(e^{j\omega}) &= g \left( R_w(e^{j \omega} \right))^\beta \\ R_w(^{j\omega}) &= A(e^{j \omega/ \gamma})/A(e^{j \omega}) \end{split} \end{equation} -where $g$ is a gain chosen to such that the energy of at the output of the post filter is the same as the input, $\beta=0.2$, and $\gamma=0.5$. The post filter raises the spectral peaks (formants), and pushes down the energy between formants. The $\beta$ term compensates for spectral tilt, such that $R_w$ is similar to the LPC synthesis filter $1/A(z)$ however with equal emphasis at low and high frequencies. The authors suggest the post filter reduces the noise level between formants, an explanation commonly given to post filters used for CELP codecs where significant inter-formant noise exists from the noisy excitation source. However in harmonic sinusoidal codecs there is no excitation noise between formants in $E(z)$. Our theory is the post filter also acts to reduce the bandwidth of spectral peaks, modifying the energy distribution across the time domain pitch cycle in a way that improves intelligibility, especially for low pitched speakers. +where $g$ is chosen to normalise the gain of the post filter, and $\beta=0.2$, $\gamma=0.5$ are experimentally derived constants. The post filter raises the spectral peaks (formants), and lowers the inter-formant energy. The $\gamma$ term compensates for spectral tilt, providing equal emphasis at low and high frequencies. The authors suggest the post filter reduces the noise level between formants, an explanation commonly given to post filters used for CELP codecs where significant inter-formant noise exists from the noisy excitation source. However in harmonic sinusoidal codecs there is no excitation noise between formants in $E(z)$. Our theory is the post filter also acts to reduce the bandwidth of spectral peaks, modifying the energy distribution across the time domain pitch cycle which improves speech quality, especially for low pitched speakers. A disadvantage of the post filter is the need for experimentally derived constants. It performs a non-linear operation on the speech spectrum, and if mis-applied can worsen speech quality. As it's operation is not completely understood, it represents a source of future quality improvement. 
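
For reference, a rough sketch of one way to evaluate (\ref{eq:lpc_lsp_pf}) in C, reading $A(e^{j\omega/\gamma})$ as the bandwidth expanded polynomial with coefficients $a_k \gamma^k$; this reading, and all names below, are assumptions of the example rather than a description of the \emph{quantise.c} implementation:

\begin{verbatim}
/* Sketch only: one reading of the LPC post filter, with the numerator
 * taken as the bandwidth expanded polynomial (coefficients a_k*gamma^k).
 * Not the quantise.c code; the energy normalisation g and the low
 * frequency boost shown in the LPC post filter figure are not included. */
#include <math.h>
#include <complex.h>

/* |A(e^{jw})| for A(z) = 1 - sum_{k=1}^{p} a[k] z^{-k} */
static double lpc_mag(const double a[], int p, double w)
{
  double complex A = 1.0;
  for (int k = 1; k <= p; k++)
    A -= a[k] * cexp(-I * w * (double)k);
  return cabs(A);
}

/* post filter weight (R_w)^beta at frequency w, e.g. beta=0.2, gamma=0.5 */
double post_filter_weight(const double a[], int p, double w,
                          double beta, double gamma)
{
  double ag[p + 1];
  ag[0] = 1.0;
  for (int k = 1; k <= p; k++)
    ag[k] = a[k] * pow(gamma, (double)k);
  return pow(lpc_mag(ag, p, w) / lpc_mag(a, p, w), beta);
}
\end{verbatim}

Under this reading, $\gamma=0.5$ makes the numerator a heavily bandwidth expanded and therefore much flatter copy of $A(z)$, so $R_w$ behaves like $1/A(z)$ with the overall spectral tilt removed, while the $\beta$ power controls how strongly peaks are raised relative to troughs. The gain $g$ and the low frequency boost noted in the caption of Figure \ref{fig:hts2a_lpc_pf} are left out of the sketch.
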
@@ -817,10 +834,10 @@ k = warp^{-1}(f,K) = \frac{mel(f)-mel(200)}{g} + 1 \centering \begin{tikzpicture} \tkzDefPoint(1,1){A} -\tkzDefPoint(5,5){B} -\draw[thick] (1,1) node [right]{(1,mel(200))} -- (5,5) node [right]{(K,mel(3700))}; -\draw[thick,->] (0,0) -- (6,0) node [below]{k}; -\draw[thick,->] (0,0) -- (0,6) node [left]{mel(f)}; +\tkzDefPoint(3,3){B} +\draw[thick] (1,1) node [right]{(1,mel(200))} -- (3,3) node [right]{(K,mel(3700))}; +\draw[thick,->] (0,0) -- (4,0) node [below]{k}; +\draw[thick,->] (0,0) -- (0,4) node [left]{mel(f)}; \foreach \n in {A,B} \node at (\n)[circle,fill,inner sep=1.5pt]{}; \end{tikzpicture} diff --git a/octave/plamp.m b/octave/plamp.m index 6f0478f..c7a1291 100644 --- a/octave/plamp.m +++ b/octave/plamp.m @@ -21,6 +21,11 @@ function plamp(samname, f, epslatex=0) Ew = load(ew_name); endif + E_name = strcat(samname,"_E.txt"); + if (file_in_path(".",E_name)) + E = load(E_name); + endif + rk_name = strcat(samname,"_rk.txt"); if (file_in_path(".",rk_name)) Rk = load(rk_name); @@ -38,6 +43,10 @@ function plamp(samname, f, epslatex=0) if (file_in_path(".",pw_name)) Pw = load(pw_name); endif + pwb_name = strcat(samname,"_pwb.txt"); + if (file_in_path(".",pwb_name)) + Pwb = load(pwb_name); + endif lsp_name = strcat(samname,"_lsp.txt"); if (file_in_path(".",lsp_name)) @@ -63,14 +72,14 @@ function plamp(samname, f, epslatex=0) k = ' '; do - figure(1); + figure(1); clf; clf; s = [ Sn(2*f-1,:) Sn(2*f,:) ]; plot(s,'b'); axis([1 length(s) -30000 30000]); xlabel('Time (samples)'); ylabel('Amplitude'); - figure(2); + figure(2); clf; Wo = model(f,1); L = model(f,2); Am = model(f,3:(L+2)); @@ -80,9 +89,26 @@ function plamp(samname, f, epslatex=0) if plot_sw plot((0:255)*4000/256, Sw(f,:),"b"); end - legend('boxoff'); ylabel ('Amplitude (dB)'); xlabel('Frequency (Hz)'); + hold off; grid minor; + ylabel ('Amplitude (dB)'); xlabel('Frequency (Hz)'); + + figure(3); clf; + hold on; + plot((0:255)*4000/256, Sw(f,:),"b"); + plot((1:L)*Wo*4000/pi, 20*log10(Am),"+-r"); + plot((0:255)*4000/256, E(f)+10*log10(Pwb(f,:)),"g"); + plot(lsp(f,:)*4000/pi, 75,"g+"); + hold off; grid minor; + axis([1 4000 -10 80]); + ylabel ('Amplitude (dB)'); xlabel('Frequency (Hz)'); + figure(4); clf; + hold on; + plot((0:255)*4000/256, E(f)+10*log10(Pwb(f,:)),"g"); + plot((0:255)*4000/256, 10*log10(Pw(f,:)),"r"); hold off; grid minor; + axis([1 4000 -10 80]); + ylabel ('Amplitude (dB)'); xlabel('Frequency (Hz)'); % print EPS file @@ -103,7 +129,15 @@ function plamp(samname, f, epslatex=0) fn = sprintf("%s_%d_sw.tex",samname,f); print(fn,"-depslatex",sz); printf("printing... %s\n", fn); - restore_fonts(textfontsize,linewidth); + figure(3); + fn = sprintf("%s_%d_lpc_lsp.tex",samname,f); + print(fn,"-depslatex",sz); printf("printing... %s\n", fn); + + figure(4); + fn = sprintf("%s_%d_lpc_pf.tex",samname,f); + print(fn,"-depslatex",sz); printf("printing... 
%s\n", fn); + + restore_fonts(textfontsize,linewidth); endif % interactive menu diff --git a/src/c2sim.c b/src/c2sim.c index cf23d3a..3117415 100644 --- a/src/c2sim.c +++ b/src/c2sim.c @@ -1023,7 +1023,7 @@ int main(int argc, char *argv[]) { if (lpc_model) { lsp_to_lpc(&lsps_dec[i][0], &ak_dec[i][0], order); aks_to_M2(fftr_fwd_cfg, &ak_dec[i][0], order, &model_dec[i], e_dec[i], - &snr, 0, simlpcpf, lpcpf, 1, LPCPF_BETA, LPCPF_GAMMA, Aw); + &snr, 1, simlpcpf, lpcpf, 1, LPCPF_BETA, LPCPF_GAMMA, Aw); apply_lpc_correction(&model_dec[i]); sum_snr += snr; #ifdef DUMP -- cgit v1.2.3 From c27e56d09683527147e803defa1baea82a4c6a2b Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 10 Dec 2023 11:29:59 +1030 Subject: oops we forgot to rm this in recent clean up --- src/newamp2.h | 80 ----------------------------------------------------------- 1 file changed, 80 deletions(-) delete mode 100644 src/newamp2.h diff --git a/src/newamp2.h b/src/newamp2.h deleted file mode 100644 index 6a4d6a4..0000000 --- a/src/newamp2.h +++ /dev/null @@ -1,80 +0,0 @@ -/*---------------------------------------------------------------------------*\ - - FILE........: newamp2.h - AUTHOR......: Thomas Kurin and Stefan Erhardt - INSTITUTE...: Institute for Electronics Engineering, University of -Erlangen-Nuremberg DATE CREATED: July 2018 BASED ON....: "newamp1.h" by -David Rowe - - Quantisation functions for the sinusoidal coder, using "newamp1" - algorithm that resamples variable rate L [Am} to a fixed rate K then - VQs. - -\*---------------------------------------------------------------------------*/ - -/* - Copyright Thomas Kurin and Stefan Erhardt 2018 - - All rights reserved. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License version 2.1, as - published by the Free Software Foundation. This program is - distributed in the hope that it will be useful, but WITHOUT ANY - WARRANTY; without even the implied warranty of MERCHANTABILITY or - FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public - License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with this program; if not, see . 
-*/ - -#ifndef __NEWAMP2__ -#define __NEWAMP2__ - -#define NEWAMP2_N_INDEXES \ - 4 /* Number of indexes to pack: vq1, vq2, energy, Wo */ -#define NEWAMP2_PHASE_NFFT \ - 128 /* size of FFT used for phase synthesis */ -#define NEWAMP2_K 29 /* rate K vector length */ -#define NEWAMP2_16K_K \ - 40 /* rate K vector length for 16k Mode */ - -#include "codec2_fft.h" -#include "comp.h" - -void n2_mel_sample_freqs_kHz(float rate_K_sample_freqs_kHz[], int K); -void n2_resample_const_rate_f(C2CONST *c2const, MODEL *model, - float rate_K_vec[], - float rate_K_sample_freqs_kHz[], int K); -void n2_rate_K_mbest_encode(int *indexes, float *x, float *xq, int ndim); -void n2_resample_rate_L(C2CONST *c2const, MODEL *model, float rate_K_vec[], - float rate_K_sample_freqs_kHz[], int K, - int plosive_flag); -void n2_post_filter_newamp2(float vec[], float sample_freq_kHz[], int K, - float pf_gain); -void newamp2_interpolate(float interpolated_surface_[], float left_vec[], - float right_vec[], int K, int plosive_flag); -void newamp2_model_to_indexes(C2CONST *c2const, int indexes[], MODEL *model, - float rate_K_vec[], - float rate_K_sample_freqs_kHz[], int K, - float *mean, float rate_K_vec_no_mean[], - float rate_K_vec_no_mean_[], int plosiv); -void newamp2_indexes_to_rate_K_vec(float rate_K_vec_[], - float rate_K_vec_no_mean_[], - float rate_K_sample_freqs_kHz[], int K, - float *mean_, int indexes[], float pf_gain); -void newamp2_16k_indexes_to_rate_K_vec(float rate_K_vec_[], - float rate_K_vec_no_mean_[], - float rate_K_sample_freqs_kHz[], int K, - float *mean_, int indexes[], - float pf_gain); -void newamp2_indexes_to_model(C2CONST *c2const, MODEL model_[], COMP H[], - float interpolated_surface_[], - float prev_rate_K_vec_[], float *Wo_left, - int *voicing_left, - float rate_K_sample_freqs_kHz[], int K, - codec2_fft_cfg fwd_cfg, codec2_fft_cfg inv_cfg, - int indexes[], float pf_gain, int flag16k); - -#endif -- cgit v1.2.3 From 8a9b13e6dbed25fc599b01aede18e5d4173ce43f Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 10 Dec 2023 12:02:24 +1030 Subject: removed newamp2 code --- src/codec2.c | 1 - src/codec2_internal.h | 8 -------- 2 files changed, 9 deletions(-) diff --git a/src/codec2.c b/src/codec2.c index 52602e3..b27626a 100644 --- a/src/codec2.c +++ b/src/codec2.c @@ -46,7 +46,6 @@ #include "lpc.h" #include "lsp.h" #include "machdep.h" -#include "newamp2.h" #include "nlp.h" #include "phase.h" #include "postfilter.h" diff --git a/src/codec2_internal.h b/src/codec2_internal.h index 0b54585..32cd7eb 100644 --- a/src/codec2_internal.h +++ b/src/codec2_internal.h @@ -32,7 +32,6 @@ #include "codec2_fft.h" #include "newamp1.h" -#include "newamp2.h" struct CODEC2 { int mode; @@ -87,13 +86,6 @@ struct CODEC2 { float eq[NEWAMP1_K]; /* optional equaliser */ bool eq_en; - /*newamp2 states (also uses newamp1 states )*/ - float energy_prev; - float n2_rate_K_sample_freqs_kHz[NEWAMP2_K]; - float n2_prev_rate_K_vec_[NEWAMP2_K]; - float n2_pwb_rate_K_sample_freqs_kHz[NEWAMP2_16K_K]; - float n2_pwb_prev_rate_K_vec_[NEWAMP2_16K_K]; - /* used to dump features for deep learning experiments */ FILE *fmlfeat, *fmlmodel; -- cgit v1.2.3 From d1c085a180560b47d9c43f4b359707ff00d9e749 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 10 Dec 2023 12:05:55 +1030 Subject: Added a list or source files; edited Further Work section --- doc/codec2.pdf | Bin 318830 -> 320755 bytes doc/codec2.tex | 74 ++++++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/doc/codec2.pdf 
b/doc/codec2.pdf index c3d1a5f..ac00385 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index a277026..0d188a7 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -922,42 +922,63 @@ Some notes on the Codec 2 700C \emph{newamp1} algorithms: \item The Rate K algorithms were recently revisited, several improvements were proposed and prototyped \cite{rowe2023ratek}. \end{enumerate} -\section{Further Work} - -Summary of mysteries/interesting points drawn out above. - -\begin{enumerate} -\item Some worked examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters. Listen to various phases of quantisation. -\item How to use Octave tools to single step through codec operation -\item Table summarising source files with one line description -\item Add doc license (Creative Commons?) -\item Energy distribution theory. Need for V model, neural vocoders, non-linear function. -\item Figures and simulation plots would be useful to better explain algorithms. -\end{enumerate} - - \section{Summary of Codec 2 Modes} \label{sect:glossary} \begin{table}[H] \label{tab:codec2_modes} \centering -\begin{tabular}{p{0.75cm}|p{0.75cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{5cm}} +\begin{tabular}{p{0.75cm}|p{0.75cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{0.5cm}|p{3cm}} \hline -Mode & Frm (ms) & Bits & $A_m$ & $E$ & $\omega_0$ & $v$ & Comment \\ +Mode & Frm (ms) & Bits & $A_m$ & $E$ & $\omega_0$ & $v$ & Use Cases \\ \hline -3200 & 20 & 64 & 50 & 5 & 7 & 2 & LSP differences \\ -2400 & 20 & 50 & 36 & 8 & - & 2 & Joint $\omega_0$/E VQ, 2 spare bits \\ -1600 & 40 & 64 & 36 & 10 & 14 & 4 \\ +3200 & 20 & 64 & 50 & 5 & 7 & 2 & M17 \\ +2400 & 20 & 50 & 36 & 8 & - & 2 \\ +1600 & 40 & 64 & 36 & 10 & 14 & 4 & M17 \\ 1400 & 40 & 56 & 36 & 16 & - & 4 \\ -1300 & 40 & 52 & 36 & 5 & 7 & 4 & Joint $\omega_0$/E VQ \\ -1200 & 40 & 48 & 27 & 16 & - & 4 & LSP VQ, joint $\omega_0$/E VQ, 1 spare \\ -700C & 40 & 28 & 18 & 4 & 6 & - & VQ of log magnitudes \\ +1300 & 40 & 52 & 36 & 5 & 7 & 4 & FreeDV 1600 \\ +1200 & 40 & 48 & 27 & 16 & - & 4 & \\ +700C & 40 & 28 & 18 & 4 & 6 & - & FreeDV 700C/D/E \\ \hline \end{tabular} \caption{Codec 2 Modes} \end{table} +The 3200 mode quantises the LSP differences $\omega_{i+1}-\omega_i$, which provides low distortion at the expense of robustness to bit errors, as an error in a low order LSP difference will propagate through the frame. The 2400 and 1200 bit/s modes use a joint delta $\omega_0$ and energy VQ, which is efficient but also also suffers from error propagation so is not suitable for high BER use cases. + +There is an unfortunate overlap in the naming conventions of Codec 2 and FreeDV. The Codec 2 700C mode is used in the FreeDV 700C, 700D, and 700E modes. + +\section{Summary of Codec 2 Source Files} +\label{sect:source_files} + +Codec 2 is part of the \emph{codec2} repository, which also includes various modems and FreeDV API code. This sections lists the files specific to the speech codec. The \emph{cmake} system builds the \emph{libcodec2} library, which is called by user applications via the Codec 2 API in \emph{codec2.h}. See the repository \emph{README} for information on building, demo applications, and an introduction to other features of the \emph{codec2} repository. 
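
As a rough sketch of how a user application drives this API (assuming the usual \emph{codec2.h} interface; consult the header in your version for the exact prototypes and mode constants):

\begin{verbatim}
/* Minimal sketch of a user application using the Codec 2 API.
 * Assumes the usual codec2.h interface - check codec2.h for the
 * exact prototypes and mode constants in your version.           */
#include <stdlib.h>
#include "codec2.h"

int main(void)
{
  struct CODEC2 *c2 = codec2_create(CODEC2_MODE_700C);
  int nsam  = codec2_samples_per_frame(c2);  /* 320 samples (40ms) for 700C */
  int nbits = codec2_bits_per_frame(c2);     /* 28 bits for 700C            */

  short         *speech = malloc(sizeof(short) * nsam);
  unsigned char *bits   = malloc((nbits + 7) / 8);

  /* ... read nsam samples of 16 bit, 8 kHz speech into speech[] ... */
  codec2_encode(c2, bits, speech);           /* analysis + quantisation */
  codec2_decode(c2, speech, bits);           /* synthesis               */
  /* ... write nsam output samples from speech[] ... */

  free(speech); free(bits);
  codec2_destroy(c2);
  return 0;
}
\end{verbatim}

The \emph{c2enc} and \emph{c2dec} applications listed below are essentially this loop wrapped around file I/O.
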
+ +\begin{table}[H] +\label{tab:codec2_file} +\centering +\begin{tabular}{l l} +\hline +File & Description \\ +\hline +c2dec & Sample decoder application \\ +c2enc & Sample encoder application \\ +c2sim & Simulation and development application \\ +codebook & Directory containing quantiser tables \\ +codec2.c & Quantised encoder and decoder functions that implement each mode \\ +codec2\_fft.c & Wrapper for FFT (usually kiss FFT) \\ +defines.h & Constants \\ +lpc.c & LPC functions \\ +mbest.c & Multistage VQ search \\ +newamp1.c & Codec 2 700C \emph{newamp1} mode \\ +nlp.c & Non-linear Pitch (NLP) \\ +sine.c & Sinusoidal analysis, synthesis, voicing estimation \\ +phase.c & Phase synthesis \\ +quantise.c & Quantisation, in particular for LPC/LSP modes \\ +\hline +\end{tabular} +\caption{Codec 2 Source Files} +\end{table} + \section{Glossary} \label{sect:glossary} @@ -1013,6 +1034,15 @@ $v$ & Voicing decision for the current frame \\ \caption{Glossary of Symbols} \end{table} +\section{Further Documentation Work} + +This section contains ideas for expanding the documentation of Codec 2. Please contact the authors if you are interested in this material or would like to help develop and test it. + +\begin{enumerate} +\item The \emph{c2sim} utility is presently undocumented. We could add some worked examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters. Demonstrate how to listen to various phases of quantisation. +\item Several Octave scripts exist that were used to develop Codec 2. We could add information describing how to use the Octave tools to single step through the codec operation. +\end{enumerate} + \bibliographystyle{plain} \bibliography{codec2_refs} \end{document} -- cgit v1.2.3 From 05110e5fa8f10ac8fe7bd7aba2169a44c11ef7d9 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 10 Dec 2023 12:37:41 +1030 Subject: first pass at Makefile to build doc --- doc/Makefile | 20 +++++++++++++++++--- doc/codec2.pdf | Bin 320755 -> 320770 bytes doc/codec2.tex | 8 ++++---- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/doc/Makefile b/doc/Makefile index aba973c..1eaab1b 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,6 +1,11 @@ # Makefile for codec2.pdf +# +# usage: +# Build codec2 with -DDUMP (see README) +# cd ~/codec2/doc +# make -# set these externally with an env variable (e.g. for GitHub action) to override +# Set these externally with an env variable (e.g. for GitHub action) to override # defaults below. 
Need to run cmake with -DDUMP CODEC2_SRC ?= $(HOME)/codec2 @@ -8,11 +13,20 @@ CODEC2_BINARY ?= $(HOME)/codec2/build_linux/src PATH := $(PATH):$(CODEC2_BINARY) -PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex hts2a_37_lsp.tex +DOCNAME := codec2 +PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex hts2a_37_lpc_lsp.tex hts2a_37_lpc_pf.tex -all: $(PLOT_FILES) +$(DOCNAME).pdf: $(PLOT_FILES) $(DOCNAME).tex $(DOCNAME)_refs.bib + pdflatex $(DOCNAME).tex + bibtex $(DOCNAME).aux + pdflatex $(DOCNAME).tex + pdflatex $(DOCNAME).tex $(PLOT_FILES): echo $(PATH) c2sim $(CODEC2_SRC)/raw/hts2a.raw --dump hts2a --lpc 10 --lsp --lpcpf DISPLAY=""; printf "plamp('hts2a',f=37,epslatex=1)\nq\n" | octave-cli -qf -p $(CODEC2_SRC)/octave + +.PHONY: clean +clean: + rm *.blg *.bbl *.aux *.log $(DOCNAME).pdf \ No newline at end of file diff --git a/doc/codec2.pdf b/doc/codec2.pdf index ac00385..0acba11 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 0d188a7..f967286 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -101,7 +101,7 @@ To explain how Codec 2 works, lets look at some speech. Figure \ref{fig:hts2a_ti \end{center} \end{figure} -Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency $F_0$ is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. Turns out of the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency $F_0$. +Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency $F_0$ is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. If the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency $F_0$. Note that each harmonic has it's own amplitude, that varies across frequency. The red line plots the amplitude of each harmonic. In this example there is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. @@ -222,13 +222,13 @@ Figure \ref{fig:codec2_decoder} shows the operation of the Codec 2 decoder. We The phases of each harmonic are generated using the other model parameters and some DSP. It turns out that if you know the amplitude spectrum, you can determine a ``reasonable" phase spectrum using some DSP operations, which in practice is implemented with a couple of FFTs. We also use the voicing information - for unvoiced speech we use random phases (a good way to synthesise noise-like signals) - and for voiced speech we make sure the phases are chosen so the synthesised speech transitions smoothly from one frame to the next. -Frames of speech are synthesised using an inverse FFT. We take a blank array of FFT samples, and at intervals of $F_0$ insert samples with the amplitude and phase for each harmonic. We then inverse FFT to create a frame of time domain samples. These frames of synthesised speech samples are carefully aligned with the previous frame to ensure smooth frame-frame transitions, and output to the listener. +Frames of speech are synthesised using an inverse FFT. 
We take a blank array of FFT samples, and at intervals of $F_0$ insert samples with the amplitude and phase of each harmonic. We then inverse FFT to create a frame of time domain samples. These frames of synthesised speech samples are carefully aligned with the previous frame to ensure smooth frame-frame transitions, and output to the listener. \subsection{Bit Allocation} Table \ref{tab:bit_allocation} presents the bit allocation for two popular Codec 2 modes. One additional parameter is the frame energy, which is the average level of the spectral amplitudes, or ``AF gain" of the speech frame. -At very low bit rates such as 700 bits/s, we use Vector Quantisation (VQ) to represent the spectral amplitudes. We construct a table such that each row of the table has a set of spectral amplitude samples. In Codec 2 700C the table has 512 rows. During the quantisation process, we choose the table row that best matches the spectral amplitudes for this frame, then send the \emph{index} of the table row. The decoder has a similar table, so can use the index to look up the output values. If the table is 512 rows, we can use a 9 bit number to quantise the spectral amplitudes. In Codec 2 700C, we use two tables of 512 entries each (18 bits total), the second one helps fine tune the quantisation from the first table. +At very low bit rates such as 700 bits/s, we use Vector Quantisation (VQ) to represent the spectral amplitudes. We construct a table such that each row of the table has a set of spectral amplitude samples. In Codec 2 700C the table has 512 rows. During the quantisation process, we choose the table row that best matches the spectral amplitudes for this frame, then send the \emph{index} of the table row. The decoder has a similar table, so can use the index to look up the spectral amplitude values. If the table is 512 rows, we can use a 9 bit number to quantise the spectral amplitudes. In Codec 2 700C, we use two tables of 512 entries each (18 bits total), the second one helps fine tune the quantisation from the first table. Vector Quantisation can only represent what is present in the tables, so if it sees anything unusual (for example a different microphone frequency response or background noise), the quantisation can become very rough and speech quality poor. We train the tables at design time using a database of speech samples and a training algorithm - an early form of machine learning. @@ -280,7 +280,7 @@ Both voiced and unvoiced speech is represented using a harmonic sinusoidal model \end{equation} where the parameters $A_m, \theta_m, m=1...L$ represent the magnitude and phases of each sinusoid, $\omega_0$ is the fundamental frequency in radians/sample, and $L=\lfloor \pi/\omega_0 \rfloor$ is the number of harmonics. -Figure \ref{fig:analysis} illustrates the processing steps in the sinusoidal analysis system at the core of the Codec 2 encoder. This algorithms described in this section is based on the work in \cite{rowe1997techniques}, with some changes in notation. +Figure \ref{fig:analysis} illustrates the processing steps in the sinusoidal analysis system at the core of the Codec 2 encoder. The algorithms described in this section are based on the work in \cite{rowe1997techniques}, with some changes in notation.
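As a concrete illustration of the harmonic sinusoidal model above, the synthesis equation can be evaluated directly in a few lines of C. This is a sketch only, with illustrative array names; Codec 2 itself synthesises each frame more efficiently with an inverse FFT, as described earlier.

    /* Direct evaluation of the harmonic sinusoidal model for one frame:
       s[n] = sum_{m=1..L} A[m]*cos(Wo*m*n + theta[m]).
       A[] and theta[] are assumed indexed 1..L; Wo is in radians/sample.
       Illustrative sketch only - not the Codec 2 synthesis code. */
    #include <math.h>

    void synth_frame_direct(float s[], int N, float Wo, int L,
                            const float A[], const float theta[]) {
      for (int n = 0; n < N; n++) {
        s[n] = 0.0f;
        for (int m = 1; m <= L; m++)
          s[n] += A[m] * cosf(Wo * m * n + theta[m]);
      }
    }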
\begin{figure}[h] \caption{Sinusoidal Analysis} -- cgit v1.2.3 From 7e88771a42998b2dc4e65f631e07fbb0679548c2 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 11 Dec 2023 08:45:13 +1030 Subject: proof read, minor edits, update symbol glossary --- doc/codec2.pdf | Bin 320770 -> 322270 bytes doc/codec2.tex | 72 ++++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/doc/codec2.pdf b/doc/codec2.pdf index 0acba11..ae71c92 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index f967286..27181a2 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -70,7 +70,7 @@ Key feature includes: The Codec 2 project was started in 2009 in response to the problem of closed source, patented, proprietary voice codecs in the sub-5 kbit/s range, in particular for use in the Amateur Radio service. -This document describes Codec 2 at two levels. Section \ref{sect:overview} is a high level description aimed at the Radio Amateur, while Section \ref{sect:details} contains a more detailed description with math and signal processing theory. Combined with the C source code, it is intended to give the reader enough information to understand the operation of Codec 2 in detail and embark on source code level projects, such as improvements, ports to other languages, student or academic research projects. Issues with the current algorithms and topics for further work are also included. +This document describes Codec 2 at two levels. Section \ref{sect:overview} is a high level description aimed at the Radio Amateur, while Section \ref{sect:details} contains a more detailed description using math and signal processing theory. Combined with the C source code, it is intended to give the reader enough information to understand the operation of Codec 2 in detail and embark on source code level projects, such as improvements, ports to other languages, student or academic research projects. Issues with the current algorithms and topics for further work are also included. Section \ref{sect:codec2_modes} provides a summary of the Codec 2 modes, and Section \ref{sect:source_files} a guide to the C source files. A glossary of terms and symbols is provided in Section \ref{sect:glossary}, and Section \ref{sect:further_work} has suggestions for further documentation work. The production of this document was kindly supported by an ARDC grant \cite{ardc2023}. As an open source project, many people have contributed to Codec 2 over the years - we deeply appreciate all of your support. @@ -424,7 +424,7 @@ From the $N_{dft}$ samples produced by the IDFT (\ref{eq:synth_idft}), after win \subsection{Non-Linear Pitch Estimation} \label{sect:nlp} -The Non-Linear Pitch (NLP) pitch estimator was developed by the author, and is described in detail in chapter 4 of \cite{rowe1997techniques}, and portions of this description are reproduced here. The post processing algorithm used for pitch estimation in Codec 2 is different from \cite{rowe1997techniques} and is described here. The C code \emph{nlp.c} is a useful reference for the fine details of the implementation, and the Octave script \emph{plnlp.m} can by used to plot the internal states and single step through speech, illustrating the operation of the algorithm. +The Non-Linear Pitch (NLP) pitch estimator was developed by the author, described in detail in chapter 4 of \cite{rowe1997techniques}, and portions of this description are reproduced here.
The post processing algorithm used for pitch estimation in Codec 2 is different from \cite{rowe1997techniques} and is described here. The C code \emph{nlp.c} is a useful reference for the fine details of the implementation, and the Octave script \emph{plnlp.m} can be used to plot the internal states and single step through speech, illustrating the operation of the algorithm. The core pitch detector is based on a square law non-linearity, that is applied directly to the input speech signal. Given speech is composed of harmonics separated by $F_0$ the non-linearity generates intermodulation products at $F_0$, even if the fundamental is absent from the input signal due to high pass filtering. Figure \ref{fig:nlp} illustrates the algorithm. The fundamental frequency $F_0$ The speech signal is first squared then notch filtered to remove the DC component from the squared time domain signal. This prevents the large amplitude DC term from interfering with the somewhat smaller amplitude term at the fundamental. This is particularly important for male speakers, who may have low frequency fundamentals close to DC. The notch filter is applied in the time domain and has the experimentally derived transfer function: \begin{equation} -H(z) = \frac{1-z^{-1}}{1-0.95z^{-1}} +H_{notch}(z) = \frac{1-z^{-1}}{1-0.95z^{-1}} \end{equation} \begin{figure}[h] @@ -484,7 +484,7 @@ There is nothing particularly unique about this pitch estimator or it's performance \subsection{Voicing Estimation} -Voicing is determined using a variation of the MBE voicing algorithm \cite{griffin1988multiband}. Voiced speech consists of a harmonic series of frequency domain impulses, separated by $\omega_0$. When we multiply a segment of the input speech samples by the window function $w(n)$, we convolve the frequency domain impulses with $W(k)$, the DFT of the $(w)$. Thus for the $m$-th voiced harmonic, we expect to see a cop yof the window function $W(k)$ in the band $Sw(k), k=a_m,...,b_m$. The MBE voicing algorithm starts with the assumption that the band is voiced, and measures the error between $S_w(k)$ and the ideal voiced harmonic $\hat{S}_w(k)$. +Voicing is determined using a variation of the MBE voicing algorithm \cite{griffin1988multiband}. Voiced speech consists of a harmonic series of frequency domain impulses, separated by $\omega_0$. When we multiply a segment of the input speech samples by the window function $w(n)$, we convolve the frequency domain impulses with $W(k)$, the DFT of $w(n)$. Thus for the $m$-th voiced harmonic, we expect to see a copy of the window function $W(k)$ in each band $S_w(k), k=a_m,...,b_m$. The MBE voicing algorithm starts with the assumption that the band is voiced, and measures the error between $S_w(k)$ and the ideal voiced harmonic $\hat{S}_w(k)$. For each band we first estimate the complex harmonic amplitude (magnitude and phase) using \cite{griffin1988multiband}: \begin{equation} @@ -576,7 +576,7 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the following observations can be made: \item Through headphones speech synthesised with this model drops in quality. Through a small loudspeaker it is very close to original phases. \item If there are voicing errors, the speech can sound clicky or staticy. If voiced speech is mistakenly declared unvoiced, this model tends to synthesise annoying impulses or clicks, as for voiced speech $H(z)$ is relatively flat (broad, high frequency formants), so there is very little dispersion of the excitation impulses through $H(z)$.
\item When combined with amplitude modelling or quantisation, such that $H(z)$ is derived from $\{\hat{A}_m\}$ there is an additional drop in quality. -\item This synthesis model (e.g. a pulse train exciting a LPC filter) is effectively the same as a simple LPC-10 vocoders, and yet (especially when $H(z)$ is derived from unquantised $\{A_m\}$) sounds much better. Conventional wisdom (AMBE, MELP) says mixed voicing is required for high quality speech. +\item This synthesis model (e.g. a pulse train exciting an LPC filter) is effectively the same as a simple LPC-10 vocoder, and yet (especially when $arg[H(z)]$ is derived from unquantised $\{A_m\}$) sounds much better. Conventional wisdom (AMBE, MELP) says mixed voicing is required for high quality speech. \item If $H(z)$ is changing rapidly between frames, its phase contribution may also change rapidly. This approach could cause some discontinuities in the phase at the edge of synthesis frames, as no attempt is made to make sure that the phase tracks are continuous (the excitation phases are continuous, but not the final phases after filtering by $H(z)$). \item The recent crop of neural vocoders produce high quality speech using a similar parameter set, and notably without transmitting phase information. Although many of these vocoders operate in the time domain, this approach can be interpreted as implementing a function $\{ \hat{\theta}_m\} = F(\omega_0, \{A_m\},v)$. This validates the general approach used here, and as future work Codec 2 may benefit from being augmented by machine learning. \end{enumerate} @@ -607,13 +607,13 @@ P(z) &= (1+z^{-1}) \prod_{i=1}^{p/2} (1 - 2cos(\omega_{2i-1}) z^{-1} + z^{-2} ) \\ Q(z) &= (1-z^{-1}) \prod_{i=1}^{p/2} (1 - 2cos(\omega_{2i}) z^{-1} + z^{-2} ) \end{split} \end{equation} -where $\omega_{2i-1}$ and $\omega_{2i}$ are the LSP frequencies, found by evaluating the polynomials on the unit circle. The LSP frequencies are interlaced with each other, where $0<\omega_1 < \omega_2 <,..., < \omega_p < \pi$. The separation of adjacent LSP frequencies is related to the bandwidth of spectral peaks in $H(z)=G/A(z)$. A small separation indicates a narrow bandwidth. $A(z)$ may be reconstructed from $P(z)$ and $Q(z)$ using: +where $\omega_{2i-1}$ and $\omega_{2i}$ are the LSP frequencies, found by evaluating the polynomials on the unit circle. The LSP frequencies are interlaced with each other, where $0<\omega_1 < \omega_2 <,..., < \omega_p < \pi$. The separation of adjacent LSP frequencies is related to the bandwidth of spectral peaks in $H(z)=G/A(z)$. A small separation indicates a narrow bandwidth, as shown in Figure \ref{fig:hts2a_lpc_lsp}. $A(z)$ may be reconstructed from $P(z)$ and $Q(z)$ using: \begin{equation} A(z) = \frac{P(z)+Q(z)}{2} \end{equation} Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $A(z)$ to $P(z)$ and $Q(z)$ polynomial form. We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$ to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. -Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples).
LPC analysis determines a set of $p=10$ LPC coefficients $\{a_k\}$ that describe a filter the spectral envelope of the current frame and the LPC energy $E=G^2$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{\omega_i\}$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. Other parameters include the pitch $\omega_0$, LPC energy $E$, and voicing $v$. The quantisation and bit packing source code for each Codec 2 mode can be found in \emph{codec2.c}. Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still computed for use in voicing estimation (\ref{eq:voicing_snr}). +Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). LPC analysis determines a set of $p=10$ LPC coefficients $\{a_k\}$ that describe the spectral envelope of the current frame and the LPC energy $E=G^2$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{\omega_i\}$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. Other parameters include the pitch $\omega_0$, LPC energy $E$, and voicing $v$. The quantisation and bit packing source code for each Codec 2 mode can be found in \emph{codec2.c}. Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still computed for use in voicing estimation (\ref{eq:voicing_snr}). \begin{figure}[h] \caption{LPC/LSP Modes Encoder} @@ -653,9 +653,9 @@ Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping \end{center} \end{figure} -One of the problems with quantising spectral magnitudes in sinusoidal codecs is the time varying number of harmonic magnitudes, as $L=\pi/\omega_0$, and $\omega_0$ varies from frame to frame. As we require a fixed bit rate for our uses cases, it is desirable to have a fixed number of parameters. Using a fixed order LPC model is a neat solution to this problem. Another feature of LPC modelling combined with scalar LSP quantisation is a tolerance to variations in the input frequency response (see section \ref{sect:mode_newamp1} for more information on this issue). +One of the problems with quantising spectral magnitudes in sinusoidal codecs is the time varying number of harmonic magnitudes, as $L=\pi/\omega_0$, and $\omega_0$ varies from frame to frame. As we require a fixed bit rate for our use cases, it is desirable to have a fixed number of parameters. Using a fixed order LPC model is a neat solution to this problem. Another feature of LPC modelling combined with scalar LSP quantisation is some tolerance to variations in the input frequency response, e.g. due to microphone or anti-alias filter shape factors (see section \ref{sect:mode_newamp1} for more information on this issue). -Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly. The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope described by $\{Am\}$. All of these problems can be observed in Figure \ref{fig:hts2a_lpc_lsp}.
Thus exciting the LPC model by a simple, spectrally flat $E(z)$ will result in some errors in the reconstructed magnitude speech spectrum. +Some disadvantages \cite{makhoul1975linear} are that the LPC spectrum $|H(e^{j \omega})|$ doesn't follow the spectral magnitudes $A_m$ exactly; in other words, it requires a non-flat excitation spectrum to accurately model the amplitude spectrum. The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single harmonics, rather than tracking the spectral envelope described by $\{A_m\}$. All of these problems can be observed in Figure \ref{fig:hts2a_lpc_lsp}. Thus exciting the LPC model by a simple, spectrally flat $E(z)$ will result in some errors in the reconstructed magnitude speech spectrum. In CELP codecs these problems can be accommodated by the (high bit rate) excitation used to construct a non-flat $E(z)$, and some low rate codecs such as MELP supply supplementary low frequency information to ``correct" the LPC model. @@ -697,7 +697,7 @@ Figure \ref{fig:decoder_lpc_lsp} shows the LPC/LSP mode decoder. Frames of bits \begin{equation} \hat{A}_m = \sqrt{ \sum_{k=a_m}^{b_m-1} | \hat{H}(k) |^2 } \end{equation} -where $H(k)$ is the $N_{dft}$ point DFT of the received LPC model for this frame. For phase synthesis, the phase of $H(z)$ is determined by sampling $\hat{H}(k)$ in the centre of each harmonic: +where $\hat{H}(k)$ is the $N_{dft}$ point DFT of the received LPC model for this frame. For phase synthesis, the $arg[H(z)]$ component is determined by sampling $\hat{H}(k)$ in the centre of each harmonic: \begin{equation} arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \right] \end{equation} @@ -814,7 +814,7 @@ g &= \frac{mel(3700)-mel(200)}{K-1} \\ mel(f) &= g(k-1) + mel(200) \end{split} \end{equation} -Substituting (\ref{eq:f_mel}) into the LHS: +where $g$ is the gradient of the line. Substituting (\ref{eq:f_mel}) into the LHS: \begin{equation} \label{eq:warp} \begin{split} @@ -855,7 +855,7 @@ The input speech may be subject to arbitrary filtering, for example due to the m For every input frame $l$, the equaliser (EQ) updates the dimension $K$ equaliser vector $\mathbf{e}$: \begin{equation} -\mathbf{e}^{l+1} = \mathbf{e}^l + \beta(\mathbf{b} - \mathbf{t}) +\mathbf{e}^{l} = \mathbf{e}^{l-1} + \beta(\mathbf{b} - \mathbf{t}) \end{equation} where $\mathbf{t}$ is a fixed target vector set to the mean of the VQ quantiser, and $\beta$ is a small adaption constant.
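To make the equaliser update concrete, a per-frame implementation of this smoothing rule is sketched below in C. The function and variable names are illustrative assumptions rather than the actual newamp1 source; b[] is the current frame's rate K vector and t[] the fixed target described above.

    /* One step of the equaliser update e^l = e^(l-1) + beta*(b - t),
       applied once per input frame. Illustrative sketch only. */
    void eq_update(float e[], const float b[], const float t[], int K, float beta) {
      for (int k = 0; k < K; k++)
        e[k] += beta * (b[k] - t[k]);
    }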
@@ -897,17 +897,18 @@ Figure \ref{fig:decoder_newamp1} is the block diagram of the decoder signal proc \node [block, below of=post,text width=2cm,node distance=2cm] (resample) {Resample to Rate $L$}; \node [block, below of=resample,text width=2cm,node distance=2cm] (synth) {Sinusoidal\\Synthesis}; \node [tmp, below of=resample,node distance=1cm] (z1) {}; -\node [block, right of=synth,text width=2cm] (phase) {Phase Synthesis}; -\node [output,left of=synth,node distance=2cm] (routput) {}; +\node [block, left of=synth,text width=2cm] (phase) {Phase Synthesis}; +\node [output,right of=synth,node distance=2cm] (routput) {}; \draw [->] node[align=left,text width=2cm] {Bit\\Stream} (rinput) -- (unpack); \draw [->] (unpack) -- (interp); -\draw [->] (interp) -- (post); -\draw [->] (post) -- node[left] {$\hat{\mathbf{c}}$} (resample); -\draw [->] (resample) -- node[left] {$\hat{\mathbf{a}}$} (synth); +\draw [->] (interp) -- node[above] {$\hat{\mathbf{c}}$} (post); +\draw [->] (post) -- node[left] {$\hat{\mathbf{c}} + \mathbf{p}$} (resample); +\draw [->] (interp) |- node[left] {$\hat{\omega_0}, v$} (resample); +\draw [->] (resample) -- node[right] {$\hat{\mathbf{a}}$} (synth); \draw [->] (resample) -- (z1) -| (phase); \draw [->] (phase) -- (synth); -\draw [->] (synth) -- (routput) node[align=left,text width=1.5cm] {$\hat{s}(n)$}; +\draw [->] (synth) -- (routput) node[align=right,text width=1.5cm] {$\hat{s}(n)$}; \end{tikzpicture} \end{center} @@ -923,7 +924,7 @@ Some notes on the Codec 2 700C \emph{newamp1} algorithms: \end{enumerate} \section{Summary of Codec 2 Modes} -\label{sect:glossary} +\label{sect:codec2_modes} \begin{table}[H] \label{tab:codec2_modes} @@ -1012,22 +1013,39 @@ VQ & Vector Quantiser \\ Symbol & Description & Units \\ \hline $A(z)$ & LPC (analysis) filter \\ -$a_m$ & lower DFT index of current band \\ -$b_m$ & upper DFT index of current band \\ +$a_m$ & Lower DFT index of current band \\ +$b_m$ & Upper DFT index of current band \\ $\{A_m\}$ & Set of harmonic magnitudes $m=1,...L$ & dB \\ +$\mathbf{a}$ & $\{A_m\}$ in vector form \\ +$B_m$ & Complex spectral amplitudes used for voicing estimation \\ +$E$ & Frame energy \\ +$E(z)$ & Excitation in source-filter model \\ $F_0$ & Fundamental frequency (pitch) & Hz \\ $F_s$ & Sample rate (usually 8 kHz) & Hz \\ $F_w(k)$ & DFT of squared speech signal in NLP pitch estimator \\ +$G$ & LPC gain \\ +$H(z)$ & Synthesis filter in source-filter model \\ +$\hat{H}(z)$ & Synthesis filter approximation after quantisation \\ +$l$ & Frame index \\ $L$ & Number of harmonics \\ +$N$ & Processing frame size in samples \\ +$n_0$ & Excitation pulse position \\ $P$ & Pitch period & ms or samples \\ +$P(z), Q(z)$ & LSP polynomials \\ +$P_f(e^{j \omega})$ & LPC post filter \\ $\{\theta_m\}$ & Set of harmonic phases $m=1,...L$ & dB \\ $r$ & Maps a harmonic number $m$ to a DFT index \\ -$s(n)$ & Input speech \\ +$s(n)$ & Input time domain speech \\ +$\hat{s}(n)$ & Output (synthesised) time domain speech \\ $s_w(n)$ & Time domain windowed input speech \\ $S_w(k)$ & Frequency domain windowed input speech \\ +$\hat{S}_w(k)$ & Frequency domain output (synthesised)speech \\ +$t(n)$ & Triangular synthesis window \\ $\phi_m$ & Phase of excitation harmonic \\ $\omega_0$ & Fundamental frequency (pitch) & radians/sample \\ -$\{\omega_i\}$ & set of LSP frequencies \\ +$\{\omega_i\}$ & Set of LSP frequencies \\ +$w(n)$ & Window function \\ +$W(k)$ & DFT of window function \\ $v$ & Voicing decision for the current frame \\ \hline \end{tabular} @@ -1035,14 +1053,16 @@ $v$ 
& Voicing decision for the current frame \\ \end{table} \section{Further Documentation Work} +\label{sect:further_work} -This section contains ideas for expanding the documentation of Codec 2. Please contact the authors if you are interested in this material or would like to help develop and test it. +This section contains ideas for expanding the documentation of Codec 2. Please contact the authors if you are interested in this material or would like to help develop it. \begin{enumerate} -\item The \emph{c2sim} utility is presently undocumented. We could add some worked examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters. Demonstrate how to listen to various phases of quantisation. -\item Several Octave scripts exist that were used to develop Codec 2. We could add information describing how to use the Octave tools to single step through the codec operation. +\item The \emph{c2sim} utility is presently undocumented. We could add some worked examples aimed at the experimenter - e.g. using c2sim to extract and plot model parameters. Demonstrate how to listen to various stages of quantisation. +\item Several GNU Octave scripts exist that were used to develop Codec 2. We could add information describing how to use the Octave tools to single step through the codec operation. \end{enumerate} +\addcontentsline{toc}{chapter}{References} \bibliographystyle{plain} \bibliography{codec2_refs} \end{document} -- cgit v1.2.3 From ea0379f375fee5f9574a51a797025aa5e9390db8 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 11 Dec 2023 09:15:47 +1030 Subject: ctest, README.md, first pass at github action --- .github/workflows/cmake.yml | 1 + CMakeLists.txt | 4 ++++ README.md | 7 ++++++- doc/Makefile | 12 +++++++----- doc/codec2.pdf | Bin 322270 -> 322353 bytes 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 61c9c5c..b5425d0 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -22,6 +22,7 @@ jobs: run: | sudo apt-get update sudo apt-get install octave octave-common octave-signal liboctave-dev gnuplot sox p7zip-full python3-numpy valgrind clang-format + sudo apt-get install texmaker texlive-bibtex-extra texlive-science - name: Create Build Directory shell: bash diff --git a/CMakeLists.txt b/CMakeLists.txt index d96667f..3f3a1dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -312,6 +312,10 @@ if(UNITTEST) COMMAND sh -c "cd ${CMAKE_CURRENT_SOURCE_DIR}; clang-format --dry-run --Werror src/*.c src/*.h unittest/*.c demo/*.c") + add_test(NAME test_codec2_doc + COMMAND sh -c "cd ${CMAKE_CURRENT_SOURCE_DIR}/doc; + CODEC2_SRC=${CMAKE_CURRENT_SOURCE_DIR} CODEC2_BINARY=${CMAKE_CURRENT_BINARY_DIR} make") + add_test(NAME test_freedv_get_hash COMMAND sh -c "${CMAKE_CURRENT_BINARY_DIR}/unittest/thash") diff --git a/README.md b/README.md index 46ef6b2..726152c 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,10 @@ We have standardized on C99 and develop and test using gcc on a Linux platform. make ``` +## Documentation + +An algorithm description can be found in `doc/codec2.pdf`. + ## Programs + See `demo` directory for simple examples of using Codec and the FreeDV API. @@ -138,7 +142,7 @@ CTest is used as a test framework, with support from [GNU Octave](https://www.gn 1. 
Install GNU Octave and libraries on Ubuntu with: ``` - sudo apt install octave octave-common octave-signal liboctave-dev gnuplot python3-numpy sox valgrind clang-format + sudo apt install octave octave-common octave-signal liboctave-dev gnuplot python3-numpy sox valgrind clang-format texmaker texlive-bibtex-extra texlive-science ``` 1. To build and run the tests: ``` @@ -180,6 +184,7 @@ CTest is used as a test framework, with support from [GNU Octave](https://www.gn ``` cmake - cmake support files demo - Simple Codec 2 and FreeDV API demo applications +doc - documentation octave - Octave scripts used to support ctests src - C source code for Codec 2, FDMDV modem, COHPSK modem, FreeDV API raw - speech files in raw format (16 bits signed linear 8 kHz) diff --git a/doc/Makefile b/doc/Makefile index 1eaab1b..606d05f 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,7 +1,7 @@ # Makefile for codec2.pdf # # usage: -# Build codec2 with -DDUMP (see README) +# Build codec2 with -DUNITTEST=1 (see README) # cd ~/codec2/doc # make -# Set these externally with an env variable (e.g. for GitHub action) to override -# defaults below. Need to run cmake with -DDUMP +# Set these externally to override defaults. JOBNAME sets the output file basename, +# and avoids overwriting codec2.pdf (e.g.
when we are running a doc build test, but don't actually +# want to change codec2.pdf in the repo) CODEC2_SRC ?= $(HOME)/codec2 CODEC2_BINARY ?= $(HOME)/codec2/build_linux +JOBNAME ?= $(DOCNAME) PATH := $(PATH):$(CODEC2_BINARY)/src -DOCNAME := codec2 PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex hts2a_37_lpc_lsp.tex hts2a_37_lpc_pf.tex -# For automated tests we always want to build the PDF, despite codec2.pdf existing in the repo -.PHONY: pdf -pdf: $(PLOT_FILES) $(DOCNAME).tex $(DOCNAME)_refs.bib - pdflatex $(DOCNAME).tex - bibtex $(DOCNAME).aux - pdflatex $(DOCNAME).tex - pdflatex $(DOCNAME).tex +$(DOCNAME).pdf: $(PLOT_FILES) $(DOCNAME).tex $(DOCNAME)_refs.bib + pdflatex -jobname=$(JOBNAME) $(DOCNAME).tex + bibtex $(JOBNAME).aux + pdflatex -jobname=$(JOBNAME) $(DOCNAME).tex + pdflatex -jobname=$(JOBNAME) $(DOCNAME).tex $(PLOT_FILES): echo $(PATH) @@ -31,4 +32,4 @@ $(PLOT_FILES): .PHONY: clean clean: - rm -f *.blg *.bbl *.aux *.log *.out $(DOCNAME).pdf hts2a* \ No newline at end of file + rm -f *.blg *.bbl *.aux *.log *.out hts2a* \ No newline at end of file -- cgit v1.2.3 From 18c5e48d5c86a5d02c996d129880bb8d212e9b75 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 11 Dec 2023 12:16:48 +1030 Subject: exclude test_codec2_doc when running tests on github actions --- .github/workflows/cmake.yml | 2 +- doc/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index b5425d0..69ab0fb 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -51,7 +51,7 @@ jobs: - name: Run ctests working-directory: ${{github.workspace}}/build_linux shell: bash - run: ctest --output-on-failure + run: ctest --output-on-failure -E test_codec2_doc - name: Test library installation working-directory: ${{github.workspace}}/build_linux diff --git a/doc/Makefile b/doc/Makefile index 0658fe1..659d4f6 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -32,4 +32,4 @@ $(PLOT_FILES): .PHONY: clean clean: - rm -f *.blg *.bbl *.aux *.log *.out hts2a* \ No newline at end of file + rm -f *.blg *.bbl *.aux *.log *.out hts2a* -- cgit v1.2.3 From b8e452709330c75abfab55919575966da3bb030c Mon Sep 17 00:00:00 2001 From: drowe67 Date: Tue, 12 Dec 2023 09:33:57 +1030 Subject: don't need tex packages as we've excluded that test for now --- .github/workflows/cmake.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 69ab0fb..7e76b65 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -22,8 +22,7 @@ jobs: run: | sudo apt-get update sudo apt-get install octave octave-common octave-signal liboctave-dev gnuplot sox p7zip-full python3-numpy valgrind clang-format - sudo apt-get install texmaker texlive-bibtex-extra texlive-science - + - name: Create Build Directory shell: bash run: mkdir $GITHUB_WORKSPACE/build_linux -- cgit v1.2.3