From 9bc86bc2c7fe516703bf3d1adb939657a405e966 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 19 Nov 2023 07:14:04 +1030 Subject: building up plot support --- doc/Makefile | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 doc/Makefile (limited to 'doc/Makefile') diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..3729f6a --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,15 @@ +# Makefile for codec2.pdf + +# set these externally with an env variable (e.g. for GitHub action) to override +# defaults below. Need to run cmake with -DDUMP +CODEC2_SRC ?= $(HOME)/codec2 +CODEC2_BINARY ?= $(HOME)/codec2/build_linux/src + +PATH := $(PATH):$(CODEC2_BINARY) + +PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex + +$(PLOT_FILES): + echo $(PATH) + c2sim $(CODEC2_SRC)/raw/hts2a.raw --dump hts2a + DISPLAY=""; printf "plamp('hts2a',f=37,epslatex=1)\nq\n" | octave-cli -qf -p $(CODEC2_SRC)/octave -- cgit v1.2.3 From 348f68f6c8df2882324123e2901aa1cac7c44619 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sat, 9 Dec 2023 19:49:47 +1030 Subject: added LPC/LSP and LPC post figure figures, plus code to generate them --- doc/Makefile | 7 +++++-- doc/codec2.pdf | Bin 310563 -> 318830 bytes doc/codec2.tex | 53 +++++++++++++++++++++++++++++++++++------------------ octave/plamp.m | 42 ++++++++++++++++++++++++++++++++++++++---- src/c2sim.c | 2 +- 5 files changed, 79 insertions(+), 25 deletions(-) (limited to 'doc/Makefile') diff --git a/doc/Makefile b/doc/Makefile index 3729f6a..aba973c 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -2,14 +2,17 @@ # set these externally with an env variable (e.g. for GitHub action) to override # defaults below. 
Need to run cmake with -DDUMP + CODEC2_SRC ?= $(HOME)/codec2 CODEC2_BINARY ?= $(HOME)/codec2/build_linux/src PATH := $(PATH):$(CODEC2_BINARY) -PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex +PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex hts2a_37_lsp.tex + +all: $(PLOT_FILES) $(PLOT_FILES): echo $(PATH) - c2sim $(CODEC2_SRC)/raw/hts2a.raw --dump hts2a + c2sim $(CODEC2_SRC)/raw/hts2a.raw --dump hts2a --lpc 10 --lsp --lpcpf DISPLAY=""; printf "plamp('hts2a',f=37,epslatex=1)\nq\n" | octave-cli -qf -p $(CODEC2_SRC)/octave diff --git a/doc/codec2.pdf b/doc/codec2.pdf index f5f2804..c3d1a5f 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index f1ea924..a277026 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -91,12 +91,8 @@ Recently, machine learning has been applied to speech coding. This technology p To explain how Codec 2 works, lets look at some speech. Figure \ref{fig:hts2a_time} shows a short 40ms segment of speech in the time and frequency domain. On the time plot we can see the waveform is changing slowly over time as the word is articulated. On the right hand side it also appears to repeat itself - one cycle looks very similar to the last. This cycle time is the ``pitch period", which for this example is around $P=35$ samples. Given we are sampling at $F_s=8000$ Hz, the pitch period is $P/F_s=35/8000=0.0044$ seconds, or 4.4ms. -Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency $F_0$ is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. Turns out of the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency $F_0$. - -Note that each harmonic has it's own amplitude, that varies across frequency. 
The red line plots the amplitude of each harmonic. In this example there is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. - -\begin{figure} -\caption{ A 40ms segment from the word ``these" from a female speaker, sampled at 8kHz. Top is a plot again time, bottom (blue) is a plot against frequency. The waveform repeats itself every 4.3ms ($F_0=230$ Hz), this is the ``pitch period" of this segment.} +\begin{figure} [H] +\caption{ A 40ms segment from the word ``these" from a female speaker, sampled at 8kHz. Top is a plot against time, bottom (blue) is a plot of the same speech against frequency. The waveform repeats itself every 4.3ms ($F_0=230$ Hz), this is the ``pitch period" of this segment. The red crosses are the sine wave amplitudes, explained in the text.} \label{fig:hts2a_time} \begin{center} \input hts2a_37_sn.tex @@ -105,6 +101,10 @@ Note that each harmonic has it's own amplitude, that varies across frequency. T \end{center} \end{figure} +Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency $F_0$ is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. Turns out of the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency $F_0$. + +Note that each harmonic has it's own amplitude, that varies across frequency. The red line plots the amplitude of each harmonic. In this example there is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. + \subsection{Sinusoidal Speech Coding} A sinewave will cause a spike or spectral line on a spectrum plot, so we can see each spike as a small sine wave generator. 
Each sine wave generator has it's own frequency that are all multiples of the fundamental pitch frequency (e.g. $230, 460, 690,...$ Hz). They will also have their own amplitude and phase. If we add all the sine waves together (Figure \ref{fig:sinusoidal_model}) we can produce reasonable quality synthesised speech. This is called sinusoidal speech coding and is the speech production ``model" at the heart of Codec 2. @@ -343,7 +343,7 @@ b_m &= \lfloor (m + 0.5)r \rceil \\ r &= \frac{\omega_0 N_{dft}}{2 \pi} \end{split} \end{equation} -The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic; $r$ maps the harmonic number $m$ to the nearest DFT index, and $\lfloor x \rceil$ is the rounding operator. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. +The DFT indexes $a_m, b_m$ select the band of $S_w(k)$ containing the $m$-th harmonic; $r$ maps the harmonic number $m$ to the nearest DFT index, and $\lfloor x \rceil$ is the rounding operator. This method of estimating $A_m$ is relatively insensitive to small errors in $F0$ estimation and works equally well for voiced and unvoiced speech. Figure $\ref{fig:hts2a_time}$ plots $S_w$ (blue) and $\{A_m\}$ (red) for a sample frame of female speech. The phase is sampled at the centre of the band. For all practical Codec 2 modes the phase is not transmitted to the decoder so does not need to be computed. However speech synthesised using the phase is useful as a control during development, and is available using the \emph{c2sim} utility. @@ -586,11 +586,19 @@ Comparing to speech synthesised using original phases $\{\theta_m\}$ the followi In this and the next section we explain how the codec building blocks above are assembled to create a fully quantised Codec 2 mode. 
This section discusses the higher bit rate (3200 - 1200) modes that use a Linear Predictive Coding (LPC) and Line Spectrum Pairs (LSPs) to quantise and transmit the spectral magnitude information. There is a great deal of information available on these topics so they are only briefly described here. -The source-filter model of speech production was introduced above in Equation (\ref{eq:source_filter}). A relatively flat excitation source $E(z)$ excites a filter $H(z)$ which models the magnitude spectrum of the speech. Linear Predictive Coding (LPC) defines $H(z)$ as an all pole filter: +\begin{figure} [h] +\caption{LPC spectrum $|H(e^{j \omega})|$ (green line) and LSP frequencies $\{\omega_i\}$ (green crosses) for the speech frame in Figure \ref{fig:hts2a_time}. The original speech spectrum (blue) and $A_m$ estimates (red) are provided as references.} +\label{fig:hts2a_lpc_lsp} +\begin{center} +\input hts2a_37_lpc_lsp.tex +\end{center} +\end{figure} + +The source-filter model of speech production was introduced above in Equation (\ref{eq:source_filter}). A spectrally flat excitation source $E(z)$ excites a filter $H(z)$ which models the magnitude spectrum of the speech. In Linear Predictive Coding (LPC), we define $H(z)$ as an all pole filter: \begin{equation} H(z) = \frac{G}{1-\sum_{k=1}^p a_k z^{-k}} = \frac{G}{A(z)} \end{equation} -where $\{a_k\}, k=1..10$ is a set of p linear prediction coefficients that characterise the filter's frequency response and G is a scalar gain factor. An excellent reference for LPC is \cite{makhoul1975linear}. +where $\{a_k\}, k=1..10$ is a set of p linear prediction coefficients that characterise the filters frequency response and G is a scalar gain factor. The coefficients are time varying and are extracted from the input speech signal, typically using a least squares approach. An excellent reference for LPC is \cite{makhoul1975linear}. 
To be useful in low bit rate speech coding it is necessary to quantise and transmit the LPC coefficients using a small number of bits. Direct quantisation of these LPC coefficients is inappropriate due to their large dynamic range (8-10 bits/coefficient). Thus for transmission purposes, especially at low bit rates, other forms such as the Line Spectral Pair (LSP) \cite{itakura1975line} frequencies are used to represent the LPC parameters. The LSP frequencies can be derived by decomposing the $p$-th order polynomial $A(z)$, into symmetric and anti-symmetric polynomials $P(z)$ and $Q(z)$, shown here in factored form: \begin{equation} @@ -603,9 +611,9 @@ where $\omega_{2i-1}$ and $\omega_{2i}$ are the LSP frequencies, found by evalua \begin{equation} A(z) = \frac{P(z)+Q(z)}{2} \end{equation} -Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $A(z)$ to $P(z)$ and $Q(z)$ polynomial form. We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. +Thus to transmit the LPC coefficients using LSPs, we first transform the LPC model $A(z)$ to $P(z)$ and $Q(z)$ polynomial form. We then solve $P(z)$ and $Q(z)$ for $z=e^{j \omega}$ to obtain $p$ LSP frequencies $\{\omega_i\}$. The LSP frequencies are then quantised and transmitted over the channel. At the receiver the quantised LSPs are then used to reconstruct an approximation of $A(z)$. More details on LSP analysis can be found in \cite{rowe1997techniques} and many other sources. -Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). 
LPC analysis determines a set of $p=10$ LPC coefficients $\{a_k\}$ that describe a filter the spectral envelope of the current frame and the LPC energy $E=G^2$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{\omega_i\}$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. Other parameters include the pitch $\omega_0$, LPC energy $E$, and voicing $v$. The quantisation and bit packing source code for each Codec 2 mode can be found in \emph{codec2.c}. Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still required for voicing estimation (\ref{eq:voicing_snr}). +Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping input speech frames are processed every 10ms ($N=80$ samples). LPC analysis determines a set of $p=10$ LPC coefficients $\{a_k\}$ that describe a filter the spectral envelope of the current frame and the LPC energy $E=G^2$. The LPC coefficients are transformed to $p=10$ LSP frequencies $\{\omega_i\}$. The source code for these algorithms is in \emph{lpc.c} and \emph{lsp.c}. The LSP frequencies are then quantised to a fixed number of bits/frame. Other parameters include the pitch $\omega_0$, LPC energy $E$, and voicing $v$. The quantisation and bit packing source code for each Codec 2 mode can be found in \emph{codec2.c}. Note the spectral magnitudes $\{A_m\}$ are not transmitted, but are still computed for use in voicing estimation (\ref{eq:voicing_snr}). \begin{figure}[h] \caption{LPC/LSP Modes Encoder} @@ -647,9 +655,9 @@ Figure \ref{fig:encoder_lpc_lsp} presents the LPC/LSP mode encoder. Overlapping One of the problems with quantising spectral magnitudes in sinusoidal codecs is the time varying number of harmonic magnitudes, as $L=\pi/\omega_0$, and $\omega_0$ varies from frame to frame. As we require a fixed bit rate for our uses cases, it is desirable to have a fixed number of parameters. 
Using a fixed order LPC model is a neat solution to this problem. Another feature of LPC modelling combined with scalar LSP quantisation is a tolerance to variations in the input frequency response (see section \ref{sect:mode_newamp1} for more information on this issue). -Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly. The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope. +Some disadvantages \cite{makhoul1975linear} are that the energy minimisation property means the LPC residual spectrum is rarely flat, i.e. it doesn't follow the spectral magnitudes $A_m$ exactly. The slope of the LPC spectrum near 0 and $\pi$ must be 0, which means it does not track perceptually important low frequency information well. For high pitched speakers, LPC tends to place poles around single pitch harmonics, rather than tracking the spectral envelope described by $\{Am\}$. All of these problems can be observed in Figure \ref{fig:hts2a_lpc_lsp}. Thus exciting the LPC model by a simple, spectrally flat $E(z)$ will result in some errors in the reconstructed magnitude speech spectrum. -In CELP codecs these problems can be accommodated by the (high bit rate) excitation, and some low rate codecs such as MELP supply supplementary low frequency information to ``correct" the LPC model. +In CELP codecs these problems can be accommodated by the (high bit rate) excitation used to construct a non-flat $E(z)$, and some low rate codecs such as MELP supply supplementary low frequency information to ``correct" the LPC model. Before bit packing, the Codec 2 parameters are decimated in time. 
An update rate of 20ms is used for the highest rate modes, which drops to 40ms for Codec 2 1300, with a corresponding drop in speech quality. The number of bits used to quantise the LPC model via LSPs is also reduced in the lower bit rate modes. This has the effect of making the speech less intelligible, and can introduce annoying buzzy or clicky artefacts into the synthesised speech. Lower fidelity spectral magnitude quantisation also results in more noticeable artefacts from phase synthesis. Nevertheless at 1300 bits/s the speech quality is quite usable for HF digital voice, and at 3200 bits/s comparable to closed source codecs at the same bit rate. @@ -693,6 +701,15 @@ where $H(k)$ is the $N_{dft}$ point DFT of the received LPC model for this frame \begin{equation} arg \left[ H(e^{j \omega_0 m}) \right] = arg \left[ \hat{H}(\lfloor m r \rceil) \right] \end{equation} + +\begin{figure} [h] +\caption{LPC post filter. LPC spectrum before $|H(e^{j \omega})|$ (green line) and after (red) post filtering. The distance between the spectral peaks and troughs has been increased. The step change at 1000 Hz is +3dB low frequency boost (see source code).} +\label{fig:hts2a_lpc_pf} +\begin{center} +\input hts2a_37_lpc_pf.tex +\end{center} +\end{figure} + Prior to sampling the amplitude and phase, a frequency domain post filter is applied to the LPC power spectrum. The algorithm is based on the MBE frequency domain post filter \cite[Section 8.6, p 267]{kondoz1994digital}, which is turn based on the frequency domain post filter from McAulay and Quatieri \cite[Section 4.3, p 148]{kleijn1995speech}. The authors report a significant improvement in speech quality from the post filter, which has also been our experience when applied to Codec 2. 
The post filter is given by: \begin{equation} \label{eq:lpc_lsp_pf} @@ -701,7 +718,7 @@ P_f(e^{j\omega}) &= g \left( R_w(e^{j \omega} \right))^\beta \\ R_w(^{j\omega}) &= A(e^{j \omega/ \gamma})/A(e^{j \omega}) \end{split} \end{equation} -where $g$ is a gain chosen to such that the energy of at the output of the post filter is the same as the input, $\beta=0.2$, and $\gamma=0.5$. The post filter raises the spectral peaks (formants), and pushes down the energy between formants. The $\beta$ term compensates for spectral tilt, such that $R_w$ is similar to the LPC synthesis filter $1/A(z)$ however with equal emphasis at low and high frequencies. The authors suggest the post filter reduces the noise level between formants, an explanation commonly given to post filters used for CELP codecs where significant inter-formant noise exists from the noisy excitation source. However in harmonic sinusoidal codecs there is no excitation noise between formants in $E(z)$. Our theory is the post filter also acts to reduce the bandwidth of spectral peaks, modifying the energy distribution across the time domain pitch cycle in a way that improves intelligibility, especially for low pitched speakers. +where $g$ is chosen to normalise the gain of the post filter, and $\beta=0.2$, $\gamma=0.5$ are experimentally derived constants. The post filter raises the spectral peaks (formants), and lowers the inter-formant energy. The $\gamma$ term compensates for spectral tilt, providing equal emphasis at low and high frequencies. The authors suggest the post filter reduces the noise level between formants, an explanation commonly given to post filters used for CELP codecs where significant inter-formant noise exists from the noisy excitation source. However in harmonic sinusoidal codecs there is no excitation noise between formants in $E(z)$. 
Our theory is the post filter also acts to reduce the bandwidth of spectral peaks, modifying the energy distribution across the time domain pitch cycle which improves speech quality, especially for low pitched speakers. A disadvantage of the post filter is the need for experimentally derived constants. It performs a non-linear operation on the speech spectrum, and if mis-applied can worsen speech quality. As it's operation is not completely understood, it represents a source of future quality improvement. @@ -817,10 +834,10 @@ k = warp^{-1}(f,K) = \frac{mel(f)-mel(200)}{g} + 1 \centering \begin{tikzpicture} \tkzDefPoint(1,1){A} -\tkzDefPoint(5,5){B} -\draw[thick] (1,1) node [right]{(1,mel(200))} -- (5,5) node [right]{(K,mel(3700))}; -\draw[thick,->] (0,0) -- (6,0) node [below]{k}; -\draw[thick,->] (0,0) -- (0,6) node [left]{mel(f)}; +\tkzDefPoint(3,3){B} +\draw[thick] (1,1) node [right]{(1,mel(200))} -- (3,3) node [right]{(K,mel(3700))}; +\draw[thick,->] (0,0) -- (4,0) node [below]{k}; +\draw[thick,->] (0,0) -- (0,4) node [left]{mel(f)}; \foreach \n in {A,B} \node at (\n)[circle,fill,inner sep=1.5pt]{}; \end{tikzpicture} diff --git a/octave/plamp.m b/octave/plamp.m index 6f0478f..c7a1291 100644 --- a/octave/plamp.m +++ b/octave/plamp.m @@ -21,6 +21,11 @@ function plamp(samname, f, epslatex=0) Ew = load(ew_name); endif + E_name = strcat(samname,"_E.txt"); + if (file_in_path(".",E_name)) + E = load(E_name); + endif + rk_name = strcat(samname,"_rk.txt"); if (file_in_path(".",rk_name)) Rk = load(rk_name); @@ -38,6 +43,10 @@ function plamp(samname, f, epslatex=0) if (file_in_path(".",pw_name)) Pw = load(pw_name); endif + pwb_name = strcat(samname,"_pwb.txt"); + if (file_in_path(".",pwb_name)) + Pwb = load(pwb_name); + endif lsp_name = strcat(samname,"_lsp.txt"); if (file_in_path(".",lsp_name)) @@ -63,14 +72,14 @@ function plamp(samname, f, epslatex=0) k = ' '; do - figure(1); + figure(1); clf; clf; s = [ Sn(2*f-1,:) Sn(2*f,:) ]; plot(s,'b'); axis([1 length(s) -30000 
30000]); xlabel('Time (samples)'); ylabel('Amplitude'); - figure(2); + figure(2); clf; Wo = model(f,1); L = model(f,2); Am = model(f,3:(L+2)); @@ -80,9 +89,26 @@ function plamp(samname, f, epslatex=0) if plot_sw plot((0:255)*4000/256, Sw(f,:),"b"); end - legend('boxoff'); ylabel ('Amplitude (dB)'); xlabel('Frequency (Hz)'); + hold off; grid minor; + ylabel ('Amplitude (dB)'); xlabel('Frequency (Hz)'); + + figure(3); clf; + hold on; + plot((0:255)*4000/256, Sw(f,:),"b"); + plot((1:L)*Wo*4000/pi, 20*log10(Am),"+-r"); + plot((0:255)*4000/256, E(f)+10*log10(Pwb(f,:)),"g"); + plot(lsp(f,:)*4000/pi, 75,"g+"); + hold off; grid minor; + axis([1 4000 -10 80]); + ylabel ('Amplitude (dB)'); xlabel('Frequency (Hz)'); + figure(4); clf; + hold on; + plot((0:255)*4000/256, E(f)+10*log10(Pwb(f,:)),"g"); + plot((0:255)*4000/256, 10*log10(Pw(f,:)),"r"); hold off; grid minor; + axis([1 4000 -10 80]); + ylabel ('Amplitude (dB)'); xlabel('Frequency (Hz)'); % print EPS file @@ -103,7 +129,15 @@ function plamp(samname, f, epslatex=0) fn = sprintf("%s_%d_sw.tex",samname,f); print(fn,"-depslatex",sz); printf("printing... %s\n", fn); - restore_fonts(textfontsize,linewidth); + figure(3); + fn = sprintf("%s_%d_lpc_lsp.tex",samname,f); + print(fn,"-depslatex",sz); printf("printing... %s\n", fn); + + figure(4); + fn = sprintf("%s_%d_lpc_pf.tex",samname,f); + print(fn,"-depslatex",sz); printf("printing... 
%s\n", fn); + + restore_fonts(textfontsize,linewidth); endif % interactive menu diff --git a/src/c2sim.c b/src/c2sim.c index cf23d3a..3117415 100644 --- a/src/c2sim.c +++ b/src/c2sim.c @@ -1023,7 +1023,7 @@ int main(int argc, char *argv[]) { if (lpc_model) { lsp_to_lpc(&lsps_dec[i][0], &ak_dec[i][0], order); aks_to_M2(fftr_fwd_cfg, &ak_dec[i][0], order, &model_dec[i], e_dec[i], - &snr, 0, simlpcpf, lpcpf, 1, LPCPF_BETA, LPCPF_GAMMA, Aw); + &snr, 1, simlpcpf, lpcpf, 1, LPCPF_BETA, LPCPF_GAMMA, Aw); apply_lpc_correction(&model_dec[i]); sum_snr += snr; #ifdef DUMP -- cgit v1.2.3 From 05110e5fa8f10ac8fe7bd7aba2169a44c11ef7d9 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 10 Dec 2023 12:37:41 +1030 Subject: first pass at Makefile to build doc --- doc/Makefile | 20 +++++++++++++++++--- doc/codec2.pdf | Bin 320755 -> 320770 bytes doc/codec2.tex | 8 ++++---- 3 files changed, 21 insertions(+), 7 deletions(-) (limited to 'doc/Makefile') diff --git a/doc/Makefile b/doc/Makefile index aba973c..1eaab1b 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,6 +1,11 @@ # Makefile for codec2.pdf +# +# usage: +# Build codec2 with -DDUMP (see README) +# cd ~/codec2/doc +# make -# set these externally with an env variable (e.g. for GitHub action) to override +# Set these externally with an env variable (e.g. for GitHub action) to override # defaults below. 
Need to run cmake with -DDUMP CODEC2_SRC ?= $(HOME)/codec2 @@ -8,11 +13,20 @@ CODEC2_BINARY ?= $(HOME)/codec2/build_linux/src PATH := $(PATH):$(CODEC2_BINARY) -PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex hts2a_37_lsp.tex +DOCNAME := codec2 +PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex hts2a_37_lpc_lsp.tex hts2a_37_lpc_pf.tex -all: $(PLOT_FILES) +$(DOCNAME).pdf: $(PLOT_FILES) $(DOCNAME).tex $(DOCNAME)_refs.bib + pdflatex $(DOCNAME).tex + bibtex $(DOCNAME).aux + pdflatex $(DOCNAME).tex + pdflatex $(DOCNAME).tex $(PLOT_FILES): echo $(PATH) c2sim $(CODEC2_SRC)/raw/hts2a.raw --dump hts2a --lpc 10 --lsp --lpcpf DISPLAY=""; printf "plamp('hts2a',f=37,epslatex=1)\nq\n" | octave-cli -qf -p $(CODEC2_SRC)/octave + +.PHONY: clean +clean: + rm *.blg *.bbl *.aux *.log $(DOCNAME).pdf \ No newline at end of file diff --git a/doc/codec2.pdf b/doc/codec2.pdf index ac00385..0acba11 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ diff --git a/doc/codec2.tex b/doc/codec2.tex index 0d188a7..f967286 100644 --- a/doc/codec2.tex +++ b/doc/codec2.tex @@ -101,7 +101,7 @@ To explain how Codec 2 works, lets look at some speech. Figure \ref{fig:hts2a_ti \end{center} \end{figure} -Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency $F_0$ is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. Turns out of the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency $F_0$. +Now if the pitch period is 4.4ms, the pitch frequency or \emph{fundamental} frequency $F_0$ is about $1/0.0044 \approx 230$ Hz. If we look at the blue frequency domain plot at the bottom of Figure \ref{fig:hts2a_time}, we can see spikes that repeat every 230 Hz. 
If the signal is repeating itself in the time domain, it also repeats itself in the frequency domain. Those spikes separated by about 230 Hz are harmonics of the fundamental frequency $F_0$. Note that each harmonic has it's own amplitude, that varies across frequency. The red line plots the amplitude of each harmonic. In this example there is a peak around 500 Hz, and another, broader peak around 2300 Hz. The ear perceives speech by the location of these peaks and troughs. @@ -222,13 +222,13 @@ Figure \ref{fig:codec2_decoder} shows the operation of the Codec 2 decoder. We The phases of each harmonic are generated using the other model parameters and some DSP. It turns out that if you know the amplitude spectrum, you can determine a ``reasonable" phase spectrum using some DSP operations, which in practice is implemented with a couple of FFTs. We also use the voicing information - for unvoiced speech we use random phases (a good way to synthesise noise-like signals) - and for voiced speech we make sure the phases are chosen so the synthesised speech transitions smoothly from one frame to the next. -Frames of speech are synthesised using an inverse FFT. We take a blank array of FFT samples, and at intervals of $F_0$ insert samples with the amplitude and phase for each harmonic. We then inverse FFT to create a frame of time domain samples. These frames of synthesised speech samples are carefully aligned with the previous frame to ensure smooth frame-frame transitions, and output to the listener. +Frames of speech are synthesised using an inverse FFT. We take a blank array of FFT samples, and at intervals of $F_0$ insert samples with the amplitude and phase of each harmonic. We then inverse FFT to create a frame of time domain samples. These frames of synthesised speech samples are carefully aligned with the previous frame to ensure smooth frame-frame transitions, and output to the listener. 
\subsection{Bit Allocation} Table \ref{tab:bit_allocation} presents the bit allocation for two popular Codec 2 modes. One additional parameter is the frame energy, this is the average level of the spectral amplitudes, or ``AF gain" of the speech frame. -At very low bit rates such as 700 bits/s, we use Vector Quantisation (VQ) to represent the spectral amplitudes. We construct a table such that each row of the table has a set of spectral amplitude samples. In Codec 2 700C the table has 512 rows. During the quantisation process, we choose the table row that best matches the spectral amplitudes for this frame, then send the \emph{index} of the table row. The decoder has a similar table, so can use the index to look up the output values. If the table is 512 rows, we can use a 9 bit number to quantise the spectral amplitudes. In Codec 2 700C, we use two tables of 512 entries each (18 bits total), the second one helps fine tune the quantisation from the first table. +At very low bit rates such as 700 bits/s, we use Vector Quantisation (VQ) to represent the spectral amplitudes. We construct a table such that each row of the table has a set of spectral amplitude samples. In Codec 2 700C the table has 512 rows. During the quantisation process, we choose the table row that best matches the spectral amplitudes for this frame, then send the \emph{index} of the table row. The decoder has a similar table, so can use the index to look up the spectral amplitude values. If the table is 512 rows, we can use a 9 bit number to quantise the spectral amplitudes. In Codec 2 700C, we use two tables of 512 entries each (18 bits total), the second one helps fine tune the quantisation from the first table. Vector Quantisation can only represent what is present in the tables, so if it sees anything unusual (for example a different microphone frequency response or background noise), the quantisation can become very rough and speech quality poor. 
We train the tables at design time using a database of speech samples and a training algorithm - an early form of machine learning. @@ -280,7 +280,7 @@ Both voiced and unvoiced speech is represented using a harmonic sinusoidal model \end{equation} where the parameters $A_m, \theta_m, m=1...L$ represent the magnitude and phases of each sinusoid, $\omega_0$ is the fundamental frequency in radians/sample, and $L=\lfloor \pi/\omega_0 \rfloor$ is the number of harmonics. -Figure \ref{fig:analysis} illustrates the processing steps in the sinusoidal analysis system at the core of the Codec 2 encoder. This algorithms described in this section is based on the work in \cite{rowe1997techniques}, with some changes in notation. +Figure \ref{fig:analysis} illustrates the processing steps in the sinusoidal analysis system at the core of the Codec 2 encoder. This algorithms described in this section are based on the work in \cite{rowe1997techniques}, with some changes in notation. \begin{figure}[h] \caption{Sinusoidal Analysis} -- cgit v1.2.3 From ea0379f375fee5f9574a51a797025aa5e9390db8 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 11 Dec 2023 09:15:47 +1030 Subject: ctest, README.md, first pass at github action --- .github/workflows/cmake.yml | 1 + CMakeLists.txt | 4 ++++ README.md | 7 ++++++- doc/Makefile | 12 +++++++----- doc/codec2.pdf | Bin 322270 -> 322353 bytes 5 files changed, 18 insertions(+), 6 deletions(-) (limited to 'doc/Makefile') diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 61c9c5c..b5425d0 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -22,6 +22,7 @@ jobs: run: | sudo apt-get update sudo apt-get install octave octave-common octave-signal liboctave-dev gnuplot sox p7zip-full python3-numpy valgrind clang-format + sudo apt-get install texmaker texlive-bibtex-extra texlive-science - name: Create Build Directory shell: bash diff --git a/CMakeLists.txt b/CMakeLists.txt index d96667f..3f3a1dc 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -312,6 +312,10 @@ if(UNITTEST) COMMAND sh -c "cd ${CMAKE_CURRENT_SOURCE_DIR}; clang-format --dry-run --Werror src/*.c src/*.h unittest/*.c demo/*.c") + add_test(NAME test_codec2_doc + COMMAND sh -c "cd ${CMAKE_CURRENT_SOURCE_DIR}/doc; + CODEC2_SRC=${CMAKE_CURRENT_SOURCE_DIR} CODEC2_BINARY=${CMAKE_CURRENT_BINARY_DIR} make") + add_test(NAME test_freedv_get_hash COMMAND sh -c "${CMAKE_CURRENT_BINARY_DIR}/unittest/thash") diff --git a/README.md b/README.md index 46ef6b2..726152c 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,10 @@ We have standardized on C99 and develop and test using gcc on a Linux platform. make ``` +## Documentation + +An algorithm description can be found in `doc/codec2.pdf`. + ## Programs + See `demo` directory for simple examples of using Codec and the FreeDV API. @@ -138,7 +142,7 @@ CTest is used as a test framework, with support from [GNU Octave](https://www.gn 1. Install GNU Octave and libraries on Ubuntu with: ``` - sudo apt install octave octave-common octave-signal liboctave-dev gnuplot python3-numpy sox valgrind clang-format + sudo apt install octave octave-common octave-signal liboctave-dev gnuplot python3-numpy sox valgrind clang-format texmaker texlive-bibtex-extra texlive-science ``` 1. 
To build and run the tests: ``` @@ -180,6 +184,7 @@ CTest is used as a test framework, with support from [GNU Octave](https://www.gn ``` cmake - cmake support files demo - Simple Codec 2 and FreeDv API demo applications +doc - documentation octave - Octave scripts used to support ctests src - C source code for Codec 2, FDMDV modem, COHPSK modem, FreeDV API raw - speech files in raw format (16 bits signed linear 8 kHz) diff --git a/doc/Makefile b/doc/Makefile index 1eaab1b..606d05f 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,7 +1,7 @@ # Makefile for codec2.pdf # # usage: -# Build codec2 with -DDUMP (see README) +# Build codec2 with -DUNITTEST=1 (see README) # cd ~/codec2/doc # make @@ -9,14 +9,16 @@ # defaults below. Need to run cmake with -DDUMP CODEC2_SRC ?= $(HOME)/codec2 -CODEC2_BINARY ?= $(HOME)/codec2/build_linux/src +CODEC2_BINARY ?= $(HOME)/codec2/build_linux -PATH := $(PATH):$(CODEC2_BINARY) +PATH := $(PATH):$(CODEC2_BINARY)/src DOCNAME := codec2 PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex hts2a_37_lpc_lsp.tex hts2a_37_lpc_pf.tex -$(DOCNAME).pdf: $(PLOT_FILES) $(DOCNAME).tex $(DOCNAME)_refs.bib +# For automated tests we always want to build the PDF, despite codec2.pdf existing in the repo +.PHONY: pdf +pdf: $(PLOT_FILES) $(DOCNAME).tex $(DOCNAME)_refs.bib pdflatex $(DOCNAME).tex bibtex $(DOCNAME).aux pdflatex $(DOCNAME).tex @@ -29,4 +31,4 @@ $(PLOT_FILES): .PHONY: clean clean: - rm *.blg *.bbl *.aux *.log $(DOCNAME).pdf \ No newline at end of file + rm -f *.blg *.bbl *.aux *.log *.out $(DOCNAME).pdf hts2a* \ No newline at end of file diff --git a/doc/codec2.pdf b/doc/codec2.pdf index ae71c92..ac2e63c 100644 Binary files a/doc/codec2.pdf and b/doc/codec2.pdf differ -- cgit v1.2.3 From 21dd265f96391d11c0e09196e62bcd62a3d2828c Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 11 Dec 2023 11:54:02 +1030 Subject: way to run doc ctest without over writing codec2.doc --- CMakeLists.txt | 3 ++- doc/Makefile | 23 ++++++++++++----------- 2 files changed, 14 
insertions(+), 12 deletions(-) (limited to 'doc/Makefile') diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f3a1dc..ff1b295 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -314,7 +314,8 @@ if(UNITTEST) add_test(NAME test_codec2_doc COMMAND sh -c "cd ${CMAKE_CURRENT_SOURCE_DIR}/doc; - CODEC2_SRC=${CMAKE_CURRENT_SOURCE_DIR} CODEC2_BINARY=${CMAKE_CURRENT_BINARY_DIR} make") + make clean; + CODEC2_SRC=${CMAKE_CURRENT_SOURCE_DIR} CODEC2_BINARY=${CMAKE_CURRENT_BINARY_DIR} JOBNAME=test make") add_test(NAME test_freedv_get_hash COMMAND sh -c "${CMAKE_CURRENT_BINARY_DIR}/unittest/thash") diff --git a/doc/Makefile b/doc/Makefile index 606d05f..0658fe1 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -5,24 +5,25 @@ # cd ~/codec2/doc # make -# Set these externally with an env variable (e.g. for GitHub action) to override -# defaults below. Need to run cmake with -DDUMP +DOCNAME ?= codec2 + +# Set these externally to override defaults. JOBNAME sets the output file basename, +# and avoids overwriting codec2.pdf (e.g. 
when we are running a doc build test, but don't actually +# want to change codec2.pdf in the repo) CODEC2_SRC ?= $(HOME)/codec2 CODEC2_BINARY ?= $(HOME)/codec2/build_linux +JOBNAME ?= $(DOCNAME) PATH := $(PATH):$(CODEC2_BINARY)/src -DOCNAME := codec2 PLOT_FILES := hts2a_37_sn.tex hts2a_37_sw.tex hts2a_37_lpc_lsp.tex hts2a_37_lpc_pf.tex -# For automated tests we always want to build the PDF, despite codec2.pdf existing in the repo -.PHONY: pdf -pdf: $(PLOT_FILES) $(DOCNAME).tex $(DOCNAME)_refs.bib - pdflatex $(DOCNAME).tex - bibtex $(DOCNAME).aux - pdflatex $(DOCNAME).tex - pdflatex $(DOCNAME).tex +$(DOCNAME).pdf: $(PLOT_FILES) $(DOCNAME).tex $(DOCNAME)_refs.bib + pdflatex -jobname=$(JOBNAME) $(DOCNAME).tex + bibtex $(JOBNAME).aux + pdflatex -jobname=$(JOBNAME) $(DOCNAME).tex + pdflatex -jobname=$(JOBNAME) $(DOCNAME).tex $(PLOT_FILES): echo $(PATH) @@ -31,4 +32,4 @@ $(PLOT_FILES): .PHONY: clean clean: - rm -f *.blg *.bbl *.aux *.log *.out $(DOCNAME).pdf hts2a* \ No newline at end of file + rm -f *.blg *.bbl *.aux *.log *.out hts2a* \ No newline at end of file -- cgit v1.2.3 From 18c5e48d5c86a5d02c996d129880bb8d212e9b75 Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 11 Dec 2023 12:16:48 +1030 Subject: exclude test_codec2_doc when running tests on github actions --- .github/workflows/cmake.yml | 2 +- doc/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'doc/Makefile') diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index b5425d0..69ab0fb 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -51,7 +51,7 @@ jobs: - name: Run ctests working-directory: ${{github.workspace}}/build_linux shell: bash - run: ctest --output-on-failure + run: ctest --output-on-failure -E test_codec2_doc - name: Test library installation working-directory: ${{github.workspace}}/build_linux diff --git a/doc/Makefile b/doc/Makefile index 0658fe1..659d4f6 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -32,4 +32,4 @@ 
$(PLOT_FILES): .PHONY: clean clean: - rm -f *.blg *.bbl *.aux *.log *.out hts2a* \ No newline at end of file + rm -f *.blg *.bbl *.aux *.log *.out hts2a* -- cgit v1.2.3