语音识别初实现

语音识别初实现——傅立叶变换、MFCC、HMM

读取和绘制音频数据

读取音频文件并可视化展现

音频文件是实际音频信号的数字化形式，实际的音频信号是复杂的连续波形。为了将其保存成数字化形式，需要对音频信号进行采样并将其转换成数字。语音通常以44100 Hz的频率进行采样，这就意味着每秒钟信号被分解成44100份，然后这些抽样值被保存。

代码实现：

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile

# Load the WAV file; wavfile.read returns the sampling rate and the sample array
sampling_freq, audio = wavfile.read('input_read.wav')

# Report basic properties of the signal
num_samples = audio.shape[0]
print('\nShape:', audio.shape)
print('Datatype:', audio.dtype)
print('Duration:', round(num_samples / float(sampling_freq), 3), 'seconds')

# Keep only the first 30 samples for plotting and scale them by 2**15
# (presumably the input is 16-bit PCM, which maps it into [-1, 1) -- confirm)
audio = audio[:30] / (2.**15)

# Time axis: sample index divided by the sampling rate gives seconds ...
x_values = np.arange(len(audio)) / float(sampling_freq)

# ... and multiplying by 1000 converts seconds to milliseconds
x_values *= 1000

# Plot amplitude against time
plt.plot(x_values, audio, color='black')
plt.xlabel('Time (ms)')
plt.ylabel('Amplitude')
plt.title('Audio signal')
plt.show()

将音频信号转化为频域

音频信号是不同频率、幅度和相位的正弦波的复杂混合。正弦波也称作正弦曲线。音频信号的频率内容中隐藏了很多信息。事实上，一个音频信号的性质由其频率内容决定。世界上的语音和音乐都是基于这个事实的。

//Todo 傅立叶变换

import numpy as np
from scipy.io import wavfile
import matplotlib.pyplot as plt

# Read the input file; wavfile.read returns the sampling rate and the sample array
sampling_freq, audio = wavfile.read('input_freq.wav')

# Scale the samples by 2**15
# (presumably the input is 16-bit PCM -- confirm for other sample widths)
audio = audio / (2.**15)

# Number of samples in the signal
# NOTE(review): the steps below assume a mono (1-D) signal -- confirm
len_audio = len(audio)

# Fourier transform. The spectrum of a real signal is symmetric, so only the
# first half (including the centre bin) is kept. The end goal is a power
# signal, so the magnitudes are scaled by the signal length and squared.
transformed_signal = np.fft.fft(audio)
# np.ceil returns a float; slice bounds must be integers, so cast explicitly
half_length = int(np.ceil((len_audio + 1) / 2.0))
transformed_signal = abs(transformed_signal[0:half_length])
transformed_signal /= float(len_audio)
transformed_signal **= 2

# Length of the retained half-spectrum
len_ts = len(transformed_signal)

# Double the bins that had a mirrored counterpart in the discarded half.
# Bin 0 is never doubled; for an even-length signal the last retained bin
# is not doubled either.
if len_audio % 2:
    transformed_signal[1:len_ts] *= 2
else:
    transformed_signal[1:len_ts-1] *= 2

# Convert power to decibels
power = 10 * np.log10(transformed_signal)

# Build the frequency axis in kHz: bin index * (sampling rate / signal length).
# float() keeps the bin width exact even under integer division semantics.
x_values = np.arange(0, half_length, 1) * (sampling_freq / float(len_audio)) / 1000.0

# Plot the figure
plt.figure()
plt.plot(x_values, power, color='black')
plt.xlabel('Freq (in kHz)')
plt.ylabel('Power (in dB)')
plt.show()

提取频域特征

将信号转换为频域之后，还需要将其转换成有用的形式。梅尔频率倒谱系数(Mel Frequency Cepstrum Coefficient，MFCC)可以解决这个问题。MFCC首先计算信号的功率谱，然后用滤波器组和离散余弦变换的组合来提取特征。

//Todo 梅尔频率倒谱系数

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile 
from features import mfcc, logfbank

# Load the input sound file (sampling rate and sample array)
sampling_freq, audio = wavfile.read("input_freq.wav")

# Compute the MFCC features and the log filter bank features of the signal
mfcc_features = mfcc(audio, sampling_freq)
filterbank_features = logfbank(audio, sampling_freq)

# Report how many windows were produced and how long each feature vector is
mfcc_shape = mfcc_features.shape
filterbank_shape = filterbank_features.shape
print('\nMFCC:\nNumber of windows =', mfcc_shape[0])
print('Length of each feature =', mfcc_shape[1])
print('\nFilter bank:\nNumber of windows =', filterbank_shape[0])
print('Length of each feature =', filterbank_shape[1])

# Show the MFCC matrix; it is transposed first so the window axis runs horizontally
mfcc_features = np.transpose(mfcc_features)
plt.matshow(mfcc_features)
plt.title('MFCC')

# Same visualization for the filter bank features, again transposed
filterbank_features = np.transpose(filterbank_features)
plt.matshow(filterbank_features)
plt.title('Filter bank')

plt.show()

创建一个隐马尔可夫模型

本例将用到隐马尔科夫模型(Hidden Markov Models，HMMs)来做语音识别。隐马尔科夫模型非常擅长建立时间序列数据模型。因为一个音频信号同时也是一个时间序列信号，因此隐马尔科夫模型也同样适用于音频信号的处理。假定输出是通过隐藏状态生成的，我们的目标是找到这些隐藏状态，以便对信号建模。

// Todo 隐马尔科夫模型

创建一个语音识别器

我们需要为每一类构建一个隐马尔科夫模型。如果想识别新的输入文件中的单词，需要对该文件运行所有的模型，并找出最佳分数的结果。下面将用到在前一节构建的隐马尔科夫类。

import os
import argparse 

import numpy as np
from scipy.io import wavfile 
from hmmlearn import hmm
from features import mfcc

# Builds the parser for the command-line arguments of the training script
def build_arg_parser():
    """Create the command-line argument parser.

    Returns:
        argparse.ArgumentParser: parser with a single required option,
        ``--input-folder``, whose value is stored as ``input_folder``.
    """
    arg_parser = argparse.ArgumentParser(
        description='Trains the HMM classifier')
    arg_parser.add_argument(
        "--input-folder",
        dest="input_folder",
        required=True,
        help="Input folder containing the audio files in subfolders")
    return arg_parser

# Class that wraps the HMM-related steps: model creation, training and scoring
class HMMTrainer(object):
    """Thin wrapper around a single ``hmm.GaussianHMM`` model.

    Args:
        model_name: model family to build; only ``'GaussianHMM'`` is accepted.
        n_components: number of hidden states, passed as ``n_components``.
        cov_type: passed to the model as ``covariance_type``.
        n_iter: number of training iterations, passed as ``n_iter``.

    Raises:
        TypeError: if ``model_name`` is not ``'GaussianHMM'``.
    """

    def __init__(self, model_name='GaussianHMM', n_components=4, cov_type='diag', n_iter=1000):
        # Remember the configuration
        self.model_name = model_name
        self.n_components = n_components
        self.cov_type = cov_type
        self.n_iter = n_iter
        # Every model returned by fit() is collected here
        self.models = []

        # Reject unsupported model families before building anything
        if self.model_name != 'GaussianHMM':
            raise TypeError('Invalid model type')

        self.model = hmm.GaussianHMM(
            n_components=self.n_components,
            covariance_type=self.cov_type,
            n_iter=self.n_iter)

    def train(self, X):
        """Fit the wrapped model on ``X`` and record the result in ``self.models``.

        ``X`` is expected to be a 2D NumPy array with one feature vector per
        row (13-dimensional in this example, per the original note).

        Note: this switches NumPy's floating-point error handling to
        'ignore' via ``np.seterr`` and does not restore it afterwards.
        """
        np.seterr(all='ignore')
        fitted_model = self.model.fit(X)
        self.models.append(fitted_model)

    def get_score(self, input_data):
        """Run the wrapped model on ``input_data`` and return its score."""
        model_score = self.model.score(input_data)
        return model_score

    
# Script entry point: parse the arguments, train one HMM per class subfolder,
# then classify a fixed list of test files with the trained models.
if __name__ == '__main__':
    args = build_arg_parser().parse_args()
    input_folder = args.input_folder

    # (HMMTrainer, label) pairs, one per class subfolder
    hmm_models = []

    # Walk the input folder; every subfolder holds the audio files of one class
    for dirname in os.listdir(input_folder):
        subfolder = os.path.join(input_folder, dirname)

        if not os.path.isdir(subfolder):
            continue

        # The subfolder name is the class label. Using the directory entry
        # directly avoids depending on '/' being the path separator.
        label = dirname

        # os.listdir returns names in arbitrary order; sorting makes the
        # held-out (last) file deterministic.
        # NOTE(review): the test list below assumes the held-out file is
        # '<label>15.wav' -- confirm the file naming matches.
        wav_names = sorted(x for x in os.listdir(subfolder) if x.endswith('.wav'))

        # Extract MFCC features from every file except the last one, which is
        # kept out of training so it can be used for testing
        feature_chunks = []
        for filename in wav_names[:-1]:
            filepath = os.path.join(subfolder, filename)
            sampling_freq, audio = wavfile.read(filepath)
            feature_chunks.append(mfcc(audio, sampling_freq))

        # A class without any training file cannot be fitted; skip it rather
        # than crash inside the model
        if not feature_chunks:
            print('Skipping', label, '- no training files found')
            continue

        # Stack the per-file feature matrices row-wise in a single step
        X = np.concatenate(feature_chunks, axis=0)
        print('X.shape =', X.shape)

        # Once the features of all files of the current class are extracted,
        # the HMM can be trained and saved. Because an HMM is a generative
        # model for unsupervised learning, the labels are not needed to build
        # the HMM of each class.
        hmm_trainer = HMMTrainer()
        hmm_trainer.train(X)
        hmm_models.append((hmm_trainer, label))

    # Test files (intended to be files that were not used for training)
    input_files = [
            'data/pineapple/pineapple15.wav',
            'data/orange/orange15.wav',
            'data/apple/apple15.wav',
            'data/kiwi/kiwi15.wav'
            ]

    # Classify each test file
    for input_file in input_files:
        # Read the file and extract its MFCC features
        sampling_freq, audio = wavfile.read(input_file)
        mfcc_features = mfcc(audio, sampling_freq)

        max_score = None
        output_label = None

        # Score the features with every model and keep the best-scoring label
        for hmm_model, label in hmm_models:
            score = hmm_model.get_score(mfcc_features)
            # Compare against None explicitly: ordering a number against None
            # is an error on Python 3
            if max_score is None or score > max_score:
                max_score = score
                output_label = label

        # The true label is the name of the folder that contains the file
        print("\nTrue:", os.path.basename(os.path.dirname(input_file)))
        print("Predicted:", output_label)