Photo by Jezael Melgoza on Unsplash

11.1. Noise attenuation

When using speech technology in realistic environments, such as at home, in an office or in a car, there will invariably be other sounds present in addition to the speech of the desired speaker. There will be the background hum of computers and air conditioning, cars honking, other speakers, and so on. Such sounds reduce the quality of the desired signal, making it more strenuous to listen to, more difficult to understand or, in the worst case, rendering the speech signal unintelligible. A common feature of these sounds, however, is that they are independent of and uncorrelated with the desired signal. [Benesty et al., 2008]

We can therefore usually assume that such noises are additive, such that the observed signal \(y\) is the sum of the desired signal \(x\) and interfering noises \(v\), that is, \(y=x+v\). To improve the quality of the observed signal, we would like to make an estimate \( \hat x \) of the desired signal \(x\). The estimate should approximate the desired signal, \( x\approx \hat x \), or equivalently, we would like to minimize the distance \( d\left(x,\hat x\right) \) with some distance measure \(d(\cdot,\cdot)\).

Hide code cell source
# Initialization for all
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.fft
from scipy.io import wavfile

#from helper_functions import stft, istft, halfsinewindow
Hide code cell source
def stft(data,fs,window_length_ms=30,window_step_ms=20,windowing_function=None):
    """Compute the short-time Fourier transform of a signal.

    The signal is cut into overlapping windows, each window is multiplied
    by the windowing function, and the one-sided FFT of each windowed
    segment becomes one row of the result.

    Parameters
    ----------
    data : array-like
        Input samples; windows are taken along the first axis.
    fs : number
        Sampling rate in Hz, used to convert the millisecond arguments
        into sample counts.
    window_length_ms : number, optional
        Window length in milliseconds (rounded down to an even number of
        samples).
    window_step_ms : number, optional
        Hop between consecutive windows in milliseconds.
    windowing_function : array-like or None, optional
        Analysis window of ``window_length`` samples. When None, a
        squared half-sine window is used.

    Returns
    -------
    numpy.ndarray
        Complex array of shape ``(window_count, window_length//2 + 1)``.
        ``window_count`` is 0 when the signal is shorter than one window.
    """
    # Halve, truncate and double so that the window length is always even.
    window_length = int(window_length_ms*fs/2000)*2
    window_step = int(window_step_ms*fs/1000)
    if windowing_function is None:
        windowing_function = np.sin(np.pi*np.arange(0.5,window_length,1)/window_length)**2
    total_length = len(data)

    # Only complete windows are analysed.  Floor division (rather than
    # int(), which truncates toward zero) keeps a signal that is slightly
    # shorter than one window from being counted as one partial window,
    # which would fail when multiplied by the full-length window.
    window_count = max(0, (total_length - window_length)//window_step + 1)
    spectrum_length = window_length//2 + 1
    spectrogram = np.zeros((window_count,spectrum_length),dtype=complex)

    for k in range(window_count):
        starting_position = k*window_step
        data_vector = data[starting_position:starting_position + window_length]
        spectrogram[k,:] = scipy.fft.rfft(data_vector*windowing_function,n=window_length)
    return spectrogram

def istft(spectrogram,fs,window_length_ms=30,window_step_ms=20,windowing_function=None):
    """Reconstruct a time signal from a spectrogram by overlap-add.

    Each row of the spectrogram is transformed back with the inverse
    one-sided FFT, multiplied by the windowing function and added into
    the output at its window position.

    Parameters
    ----------
    spectrogram : numpy.ndarray
        Array with one one-sided spectrum per row.
    fs : number
        Sampling rate in Hz, used to convert the millisecond arguments
        into sample counts.
    window_length_ms : number, optional
        Window length in milliseconds (rounded down to an even number of
        samples).
    window_step_ms : number, optional
        Hop between consecutive windows in milliseconds.
    windowing_function : array-like or None, optional
        Synthesis window of ``window_length`` samples. When None, a
        rectangular (all-ones) window is used, i.e. the segments are
        overlap-added without further weighting.

    Returns
    -------
    numpy.ndarray
        Real signal of ``(window_count-1)*window_step + window_length``
        samples.
    """
    # Halve, truncate and double so that the window length is always even.
    window_length = int(window_length_ms*fs/2000)*2
    window_step = int(window_step_ms*fs/1000)
    # Without this default, the None would be multiplied with the
    # time-domain segment below and raise a TypeError.
    if windowing_function is None:
        windowing_function = np.ones(window_length)
    window_count = spectrogram.shape[0]
    total_length = (window_count-1)*window_step + window_length
    data = np.zeros(total_length)
    for k in range(window_count):
        starting_position = k*window_step
        thiswin = scipy.fft.irfft(spectrogram[k,:],n=window_length)
        # Overlap-add: accumulate the windowed segment into the output.
        data[starting_position:starting_position + window_length] += thiswin*windowing_function
    return data

def halfsinewindow(window_length):
    """Return a half-sine window with ``window_length`` samples.

    The sine is evaluated at the half-sample offsets 0.5, 1.5, ...,
    scaled so that the whole window spans one half period.
    """
    sample_centres = np.arange(0.5,window_length,1)
    phase = np.pi*sample_centres/window_length
    return np.sin(phase)
Hide code cell source
# Global analysis parameters shared by the cells that follow.
fs = 44100  # Sample rate
seconds = 5  # Duration of recording

# The window length and step in milliseconds were used below without ever
# being defined.  The values here mirror the defaults of stft()/istft().
# NOTE(review): confirm these against the original notebook.
window_length_ms = 30
window_step_ms = 20

# Same even-length rounding as inside stft()/istft().
window_length = int(window_length_ms*fs/2000)*2
window_step_samples = int(window_step_ms*fs/1000)

windowing_function = halfsinewindow(window_length)

filename = 'sounds/enhancement_test.wav'
Hide code cell source
# Load the noisy recording and plot its waveform and spectrogram.
#
# NOTE(review): several statements in this cell were truncated (the file
# read, the waveform plot, the stft() arguments and the image call whose
# trailing `extent=` line survived).  They are reconstructed here from the
# names available in the file -- confirm against the original notebook.

# read from storage
fs, data = wavfile.read(filename)
data = data[:]

plt.figure()

# Upper panel: time-domain waveform.
plt.subplot(211)
t = np.arange(0,len(data),1)/fs
plt.plot(t, data)
plt.xlabel('Time (s)')
plt.title('Waveform of noisy audio')
plt.axis([0, len(data)/fs, 1.05*np.min(data), 1.05*np.max(data)])

# Lower panel: magnitude spectrogram in dB.
spectrogram_matrix = stft(data, fs,
                          window_length_ms=window_length_ms,
                          window_step_ms=window_step_ms,
                          windowing_function=windowing_function)
fft_length = spectrogram_matrix.shape[1]
window_count = spectrogram_matrix.shape[0]
length_in_s = window_count*window_step_ms/1000
plt.subplot(212)
# Transposed so that time runs along x and frequency along y; the tiny
# offset avoids taking the logarithm of an exactly zero bin.
plt.imshow(20*np.log10(np.abs(spectrogram_matrix.T) + np.finfo(float).eps),
           origin='lower', aspect='auto',
           extent=[0, length_in_s, 0, fs/2000])
# Show only the band up to 8 kHz.
plt.axis([0, length_in_s, 0, 8])
plt.xlabel('Time (s)')
plt.ylabel('Frequency (kHz)')
plt.title('Spectrogram of noisy audio')