【深度学习-语音分类】语种识别挑战赛Baseline
程序员文章站
2023-02-02 18:54:43
【深度学习-语音分类】语种识别挑战赛 Baseline:提取音频 log-mel 特征并保存(训练集/测试集),转换数据格式,搭建并训练 1D 卷积 + 双向 LSTM 模型,生成预测结果。
比赛地址:http://challenge.xfyun.cn/topic/info?type=multilingual
提取特征并保存:
这里提取音频文件logmel特征并保存:
训练集:
import os
import wave
import librosa
import numpy as np
from tqdm import tqdm
import pickle as pkl
import librosa
from sklearn.preprocessing import normalize
def extract_logmel(y, sr, size=3):
    """Extract a log-mel spectrogram from a random fixed-length crop of audio.

    :param y: input signal (audio time series), any numeric dtype
    :param sr: sample rate of ``y`` in Hz
    :param size: length (seconds) of the random crop, default 3
    :return: log-mel spectrogram, shape (60, n_frames)
    """
    # Peak-normalize to [-1, 1]; guard the all-zero signal (the original
    # divided by max(|y|) unconditionally, which divides by zero on silence).
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak
    # Zero-pad clips shorter than the crop length.
    crop_len = int(size * sr)
    if len(y) < crop_len:
        padded = np.zeros((crop_len,), dtype=y.dtype)
        padded[:len(y)] = y
        y = padded
    # Random crop.  randint's upper bound is exclusive, so "+ 1" is required
    # to allow start=0 when len(y) == crop_len; the original called
    # randint(0, 0) in that case and raised ValueError.
    start = np.random.randint(0, len(y) - crop_len + 1)
    y = y[start:start + crop_len]
    # 60 mel bands, 2048-sample FFT window, 1024-sample hop.
    melspectrogram = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=2048, hop_length=1024, n_mels=60)
    return librosa.power_to_db(melspectrogram)
def get_wave_norm(file):
    """Load an audio file resampled to 22.05 kHz; return (samples, sample_rate)."""
    signal, sample_rate = librosa.load(file, sr=22050)
    return signal, sample_rate
# Class labels L001..L017 and the training-data layout.
LABELS = ['L{:03}'.format(i) for i in range(1, 18)]
N_CLASS = len(LABELS)
DATA_DIR = './train'
SUB_DIR = ['train', 'dev']

# Collect (file_path, label_index) pairs from ./train/Lxxx*/{train,dev}/.
file_glob = []
for i, cls_fold in enumerate(os.listdir(DATA_DIR)):
    if not cls_fold.startswith('L'):
        continue
    cls_base = os.path.join(DATA_DIR, cls_fold)
    lbl = cls_fold.split('-')[0]  # folder name begins with the label code
    for type_fold in SUB_DIR:
        base = os.path.join(cls_base, type_fold)
        files = os.listdir(base)
        print('{} {} num:'.format(lbl, type_fold), len(files))
        for pt in files:
            file_pt = os.path.join(base, pt)
            file_glob.append((file_pt, LABELS.index(lbl)))
print('done.')
print(len(file_glob))

# Extract one log-mel feature per file and pickle the (feature, label) list.
data = []
for file, lbl in tqdm(file_glob):
    try:
        raw, sr = get_wave_norm(file)
    except Exception as e:
        print(e, file)
        # Skip unreadable files.  The original fell through here, so `raw`/`sr`
        # were undefined (first failure) or stale from the previous iteration.
        continue
    feature = extract_logmel(y=raw, sr=sr, size=3)
    data.append((feature, lbl))

with open('./data.pkl', 'wb') as f:
    pkl.dump(data, f)
测试集:
from audiomentations import *
import os
import wave
import librosa
import numpy as np
from tqdm import tqdm
import pickle as pkl
import librosa
# Root directory of the test-set audio files.
DATA_DIR = './test'
# NOTE(review): leftover from the training script — never used below.
file_glob = []
def track_features(y, sr):
    """Build a per-frame feature matrix for one audio segment.

    Concatenates MFCC (13) + spectral centroid (1) + chroma (12) +
    spectral contrast (7) into 33 columns per frame.
    """
    extracted = (
        librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13),
        librosa.feature.spectral_centroid(y=y, sr=sr),
        librosa.feature.chroma_stft(y=y, sr=sr),
        librosa.feature.spectral_contrast(y=y, sr=sr),
    )
    # Each feature arrives as (bands, frames); transpose to (frames, bands)
    # and stack along the feature axis.
    return np.concatenate([feat.T for feat in extracted], axis=-1)
def get_wave_norm(file):
    """Read a 16-bit PCM WAV file and peak-normalize it.

    :param file: path or file-like object accepted by ``wave.open``
    :return: (samples as float64 array in [-1, 1], frame rate in Hz)

    NOTE(review): channel count is read but ignored — multichannel files come
    back interleaved; presumably the data set is mono. Verify against the data.
    """
    with wave.open(file, 'rb') as f:
        nchannels, sampwidth, framerate, nframes = f.getparams()[:4]
        frames = f.readframes(nframes)
    # np.fromstring is deprecated (and removed for binary input in newer
    # NumPy); frombuffer is the supported equivalent.  Cast to float64 before
    # abs() so that int16 -32768 cannot overflow to a negative "peak".
    data = np.frombuffer(frames, dtype=np.int16).astype(np.float64)
    peak = np.max(np.abs(data)) if data.size else 0.0
    if peak > 0:
        data = data / peak  # original divided unconditionally -> /0 on silence
    return data, framerate
# Slide a window of `seg` samples (50% overlap) over each test clip and
# extract per-window features.
seg = 150000
data = {}
for cls_fold in tqdm(os.listdir(DATA_DIR)):
    if not cls_fold.startswith('t'):
        continue
    cls_base = os.path.join(DATA_DIR, cls_fold)
    try:
        raw, sr = get_wave_norm(cls_base)
    except Exception as e:
        print(e, cls_base)
        # Skip unreadable files; the original fell through and then used an
        # undefined (or stale) `raw`, crashing with NameError on first failure.
        continue
    length = raw.shape[0]
    temp = []
    for i in range((length // seg) * 2 + 1):
        start = i * int(seg / 2)
        end = start + seg
        if end > length:
            end = length
        # Original compared against a hard-coded 50000 although seg is 150000,
        # so the fast path was effectively dead and every full window went
        # through the zero-pad copy below.  Compare against `seg` instead.
        if end - start == seg:
            x = raw[start:end]
        else:
            # Short tail window: left-pad with zeros (chunk sits at the end
            # of the buffer), matching the original `x[start-end:]` placement.
            x = np.zeros(seg)
            x[start - end:] = raw[start:end]
        r = track_features(x, sr)
        temp.append(r)
    data[cls_fold] = temp

with open('./test_data.pkl', 'wb') as f:
    pkl.dump(data, f)

# Quick sanity check on the extracted feature value range.
test = []
for key, value in data.items():
    test.append(value)
print(np.max(test))
print(np.min(test))
转换数据格式:
将数据另存为npy格式:
import pickle as pkl
from tqdm import tqdm
import numpy as np
# Load the pickled (feature, label) pairs, shuffle them, and save the
# features/labels as parallel .npy arrays for training.
with open('data.pkl', 'rb') as f:
    data = pkl.load(f)

np.random.shuffle(data)

# Split the shuffled pairs into feature and label sequences.
if data:
    features, labels = zip(*data)
else:
    features, labels = (), ()
train_x = np.array(list(features))
train_y = np.array(list(labels))

np.save('train_x.npy', train_x)
np.save('train_y.npy', train_y)
print(train_x.shape)
print(train_y.shape)
搭建并训练模型:
使用1D卷积+双向LSTM:
import keras.backend as K
from keras import regularizers
from keras import layers
from keras.models import Sequential
import keras
import os
import wave
import numpy as np
import pickle as pkl
# Language-ID classifier: stacked 1-D convolutions + bidirectional LSTMs over
# (293 frames x 33 features) inputs, 17-way softmax output.
LABELS = ['L{:03}'.format(i) for i in range(1, 18)]
N_CLASS = len(LABELS)

train_x = np.load('train_x.npy')
train_y = np.load('train_y.npy')
print(train_x.shape)
print(train_y.shape)

model = Sequential()
model.add(layers.Conv1D(16, 5, input_shape=(293, 33),
                        kernel_regularizer=regularizers.l1_l2(1e-7)))
model.add(layers.Conv1D(16, 3, activation='elu',
                        kernel_regularizer=regularizers.l1_l2(1e-7)))
model.add(layers.Conv1D(16, 3, activation='elu',
                        kernel_regularizer=regularizers.l1_l2(1e-7)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPool1D())
model.add(layers.Dropout(0.5))
model.add(layers.Conv1D(32, 3, activation='elu',
                        kernel_regularizer=regularizers.l1_l2(1e-7)))
model.add(layers.Conv1D(32, 3, activation='elu',
                        kernel_regularizer=regularizers.l1_l2(1e-7)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPool1D())
model.add(layers.Dropout(0.5))
model.add(layers.Conv1D(64, 3, activation='elu',
                        kernel_regularizer=regularizers.l1_l2(1e-7)))
model.add(layers.Conv1D(64, 3, activation='elu',
                        kernel_regularizer=regularizers.l1_l2(1e-7)))
model.add(layers.BatchNormalization())
model.add(layers.Bidirectional(layers.LSTM(128, dropout=0.5, return_sequences=True,
                                           kernel_regularizer=regularizers.l1_l2(1e-7))))
model.add(layers.Bidirectional(layers.LSTM(128, dropout=0.5, return_sequences=True,
                                           kernel_regularizer=regularizers.l1_l2(1e-7))))
model.add(layers.LSTM(128,
                      kernel_regularizer=regularizers.l1_l2(1e-7)))
model.add(layers.Dense(128, activation='elu',
                       kernel_regularizer=regularizers.l1_l2(1e-7)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(N_CLASS, activation="softmax"))
model.summary()

# `keras.optimizers.adam` was a lowercase alias removed in modern Keras;
# `Adam` is the canonical class name.
adam = keras.optimizers.Adam(2e-4)
# `train_y` holds integer class indices (LABELS.index(...)), not one-hot
# vectors, so the sparse variant of the loss is required — plain
# categorical_crossentropy expects labels shaped (N, N_CLASS) and would fail.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=adam, metrics=['accuracy'])

# Train model on dataset.
batch_size = 128
model.fit(x=train_x, y=train_y, batch_size=batch_size,
          epochs=200, validation_split=0.1, shuffle=True)
model.save('my_model.h5')
生成预测结果:
import keras.backend as K
from keras import regularizers
from keras import layers
from keras.models import Sequential
import keras
import os
import wave
import numpy as np
import pickle as pkl
from tqdm import tqdm
import pandas as pd
from keras.models import load_model
# Predict one language label per test clip by averaging the model's softmax
# outputs across that clip's feature windows, then write the submission CSV.
LABELS = ['L{:03}'.format(i) for i in range(1, 18)]
N_CLASS = len(LABELS)

test_pt = 'test_data.pkl'
with open(test_pt, 'rb') as f:
    raw_data = pkl.load(f)

model = load_model('my_model.h5')

result = {'id': [], 'label': []}
for clip_id, windows in tqdm(raw_data.items()):
    batch = np.array(windows)
    probs = model.predict(batch)
    # Average window-level probabilities to get a clip-level prediction.
    mean_probs = np.mean(probs, axis=0)
    result['id'].append(clip_id)
    result['label'].append(LABELS[np.argmax(mean_probs)])

result = pd.DataFrame(result)
result.to_csv('./submission.csv', index=False)
本文地址:https://blog.csdn.net/weixin_44936889/article/details/107446945