Commit 2c550016 authored by Sting

Cleanup obsolete code

parent 8a884653
1 merge request: !3 Madmom
@@ -6,7 +6,7 @@ import shlex
from pathlib import Path
from assUtils import AssWriter, getSyls
-from cnn_madmom.segment import segment
+from autosyl.segment import segment
parser = argparse.ArgumentParser(description='AutoKara - Automatic karaoke timing tool')
File moved
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm import tqdm
from cnn.music_processor import *
import pickle
"""
From the paper:
starting from a stack of three spectrogram excerpts,
convolution and max-pooling in turn compute a set of 20 feature maps
that are classified with a fully-connected network.
"""
class convNet(nn.Module):
"""
Reimplements the neural net from the paper
"Improved musical onset detection with Convolutional Neural Networks".
src: https://ieeexplore.ieee.org/document/6854953
"""
def __init__(self):
super(convNet, self).__init__()
# model
self.conv1 = nn.Conv2d(3, 10, (3, 7))
self.conv2 = nn.Conv2d(10, 20, 3)
self.fc1 = nn.Linear(1120, 256)
self.fc2 = nn.Linear(256, 120)
self.fc3 = nn.Linear(120, 1)
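# Shape check (assuming the (3, 80, 15) input windows built by train_data_builder below):
#   conv1 (3 -> 10, kernel (3, 7)): (10, 78, 9)  -> max_pool (3, 1): (10, 26, 9)
#   conv2 (10 -> 20, kernel 3):     (20, 24, 7)  -> max_pool (3, 1): (20, 8, 7)
#   flattened: 20 * 8 * 7 = 1120, which is why fc1 takes 1120 inputs.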
def forward(self, x, istraining=False, minibatch=1):
x = F.max_pool2d(F.relu(self.conv1(x)), (3, 1))
x = F.max_pool2d(F.relu(self.conv2(x)), (3, 1))
x = F.dropout(x.view(minibatch, -1), training=istraining)
x = F.dropout(F.relu(self.fc1(x)), training=istraining)
x = F.dropout(F.relu(self.fc2(x)), training=istraining)
return torch.sigmoid(self.fc3(x))
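# Minimal usage sketch (illustrative only; the (3, 80, 15) window shape is an assumption
# based on the feature extraction in music_processor):
#   net = convNet()
#   x = torch.randn(1, 3, 80, 15)   # one spectrogram window
#   p = net(x)                      # -> tensor of shape (1, 1): onset probability for this window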
def train_data_builder(self, feats, answer, major_note_index, samplerate, soundlen=15, minibatch=1, split=0.2):
"""
Args:
feats: song.feats; Audio module
answer: song.answer; Audio module
major_note_index: frame indices of the onset labels, aligned with feats
samplerate: song.samplerate; Audio module
soundlen: =15. Horizontal length (in frames) of the image data passed to the model; (80 * 15)-sized windows are used here
minibatch: training minibatch size
split: fraction of the candidate indices actually used (defaults to 0.2)
Variables:
minspace: minimum spacing between major note indices
maxspace: maximum spacing between major note indices
idx: index into major_note_index or feats
dist: distance between two notes
"""
# acceptable interval in seconds
minspace = 0.1
maxspace = 0.7
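# For example, at a 44.1 kHz samplerate these bounds translate to roughly
# 0.1 * 44100 / 512 ≈ 8.6 frames and 0.7 * 44100 / 512 ≈ 60.3 frames between onsets.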
idx = np.random.permutation(major_note_index.shape[0] - soundlen) + soundlen // 2
X, y = [], []
cnt = 0
for i in range(int(idx.shape[0] * split)):
dist = major_note_index[idx[i] + 1] - major_note_index[idx[i]] # distinguish by this value
if dist < maxspace * samplerate / 512 and dist > minspace * samplerate / 512:
for j in range(-1, dist + 2):
X.append(feats[:, :, major_note_index[idx[i]] - soundlen // 2 + j : major_note_index[idx[i]] + soundlen // 2 + j + 1])
y.append(answer[major_note_index[idx[i]] + j])
cnt += 1
if cnt % minibatch == 0:
yield (torch.from_numpy(np.array(X)).float(), torch.from_numpy(np.array(y)).float())
X, y = [], []
def infer_data_builder(self, feats, soundlen=15, minibatch=1):
x = []
for i in range(feats.shape[2] - soundlen):
x.append(feats[:, :, i:i+soundlen])
if (i + 1) % minibatch == 0:
yield (torch.from_numpy(np.array(x)).float())
x = []
if len(x) != 0:
yield (torch.from_numpy(np.array(x)).float())
def train(self, songs, minibatch, epoch, device, soundlen=15, val_song=None, save_place='./models/model.pth', log='./log/log.txt'):
"""
Args:
songs: the list of songs
minibatch: minibatch size
epoch: number of training epochs
device: cpu / gpu
soundlen: width (in frames) of one training example's image
val_song: validation song; to validate while training, pass the validation song's Audio object
save_place: path where the trained model is saved
log: path to the log file
"""
for song in songs:
timing = np.array([syl[0] for syl in song.timestamp])
syllable = np.array([syl[1] for syl in song.timestamp])
song.answer = np.zeros((song.feats.shape[2]))
song.major_note_index = np.rint(timing[np.where(syllable != 0)] * song.samplerate/512).astype(np.int32)
song.major_note_index = np.delete(song.major_note_index, np.where(song.major_note_index >= song.feats.shape[2]))
song.answer[song.major_note_index] = 1
song.answer = milden(song.answer)
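# At this point song.answer is a soft per-frame target: 1.0 on onset frames,
# 0.25 on the frames directly before and after them (see milden()), and 0 elsewhere.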
# training
optimizer = optim.SGD(self.parameters(), lr=0.02)
criterion = nn.MSELoss()
running_loss = 0
val_loss = 0
for i in range(epoch):
for song in songs:
for X, y in self.train_data_builder(song.feats, song.answer, song.major_note_index, song.samplerate, soundlen, minibatch, split=0.2):
optimizer.zero_grad()
output = self(X.to(device), istraining=True, minibatch=minibatch)
target = y.to(device)
loss = criterion(output.squeeze(), target)
loss.backward()
optimizer.step()
running_loss += loss.data.item()
with open(log, 'a') as f:
print("epoch: %.d running_loss: %.10f " % (i+1, running_loss), file=f)
print("epoch: %.d running_loss: %.10f" % (i+1, running_loss))
running_loss = 0
if val_song:
inference = torch.from_numpy(self.infer(val_song.feats, device, minibatch=512)).to(device)
target = torch.from_numpy(val_song.answer[:-soundlen]).float().to(device)
loss = criterion(inference.squeeze(), target)
val_loss = loss.data.item()
with open(log, 'a') as f:
print("val_loss: %.10f " % (val_loss), file=f)
torch.save(self.state_dict(), save_place)
def infer(self, feats, device, minibatch=1):
with torch.no_grad():
inference = None
for x in tqdm(self.infer_data_builder(feats, minibatch=minibatch), total=feats.shape[2]//minibatch):
output = self(x.to(device), minibatch=x.shape[0])
if inference is not None:
inference = np.concatenate((inference, output.cpu().numpy().reshape(-1)))
else:
inference = output.cpu().numpy().reshape(-1)
return np.array(inference).reshape(-1)
if __name__ == '__main__':
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = convNet()
net = net.to(device)
with open('./data/pickles/train_data.pickle', mode='rb') as f:
songs = pickle.load(f)
minibatch = 128
soundlen = 15
epoch = 100
net.train(songs=songs, minibatch=minibatch, val_song=None, epoch=epoch, device=device, soundlen=soundlen, save_place='./models/model.pth', log='./data/log/log.txt')
\ No newline at end of file
import soundfile as sf
import matplotlib.pyplot as plt
import numpy as np
import os
from glob import glob
from scipy import signal
from scipy.fftpack import fft
from librosa.filters import mel
from librosa.display import specshow
from librosa import stft
from librosa.effects import pitch_shift
import pickle
import sys
from numba import jit, prange
from sklearn.preprocessing import normalize
import re
from assUtils import dateToTime, timeToDate
class Audio:
"""
Audio class that holds the music data and the note timestamps.
Args:
filename: file name.
stereo: True or False; whether the audio file is stereo. Normally True.
Variables:
Example:
>>>from music_processor import *
>>>song = Audio(filename)
>>># to get audio data
>>>song.data
>>># to import .tja files:
>>>song.import_tja(filename)
>>># to get data converted
>>>song.data = (song.data[:,0]+song.data[:,1])/2
>>>fft_and_melscale(song, include_zero_cross=False)
"""
def __init__(self, filename, stereo=True):
self.data, self.samplerate = sf.read(filename, always_2d=True)
if stereo is False:
self.data = (self.data[:, 0]+self.data[:, 1])/2
self.timestamp = []
def plotaudio(self, start_t, stop_t):
plt.plot(np.linspace(start_t, stop_t, stop_t-start_t), self.data[start_t:stop_t, 0])
plt.show()
def save(self, filename="./savedmusic.wav", start_t=0, stop_t=None):
if stop_t is None:
stop_t = self.data.shape[0]
sf.write(filename, self.data[start_t:stop_t], self.samplerate)
def import_ass(self, filename):
with open(filename, 'r') as f:
CONTENT = f.read()
LINES_KARA = re.compile(r"Comment:.*(\d+:\d{2}:\d{2}\.\d{2}),(\d+:\d{2}:\d{2}\.\d{2}),.*,karaoke,(.*)\n")
RGX_TAGS = re.compile(r"\{\\k(\d+)\}([^\{\n\r]*)")
SYLS = []
for line in LINES_KARA.findall(CONTENT):
lastTime = dateToTime(line[0])
for couple in RGX_TAGS.findall(line[2]):
self.timestamp.append((lastTime/100, 1 if len(couple[1]) > 0 else 0))
lastTime += int(couple[0])
self.timestamp = np.array(self.timestamp, dtype='float, int')
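# Illustrative example (hypothetical line; assumes dateToTime() returns centiseconds, as the /100 above suggests):
#   "Comment: 1,0:00:05.00,0:00:08.00,Default,,0,0,0,karaoke,{\k25}ka{\k30}ra{\k45}o{\k50}ke"
#   yields timestamps (5.00, 1), (5.25, 1), (5.55, 1), (6.00, 1)  # (seconds, 1 = non-empty syllable)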
def make_frame(data, nhop, nfft):
"""
helper function for fft_and_melscale.
To build training data from short time slices, this returns an array of nfft-sized windows taken from the signal, shifted by nhop (512) samples each.
"""
length = data.shape[0]
framedata = np.concatenate((data, np.zeros(nfft))) # zero padding
return np.array([framedata[i*nhop:i*nhop+nfft] for i in range(length//nhop)])
#@jit
def fft_and_melscale(song, nhop=512, nffts=[1024, 2048, 4096], mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0, include_zero_cross=False):
"""
fft and melscale method.
fft: nfft = [1024, 2048, 4096]; extracts np.arrays from the data with several window lengths and applies the fast Fourier transform.
melscale: reduces the frequency dimension to mel bands and takes log10 values.
"""
feat_channels = []
for nfft in nffts:
feats = []
window = signal.blackmanharris(nfft)
filt = mel(sr=song.samplerate, n_fft=nfft, n_mels=mel_nband, fmin=mel_freqlo, fmax=mel_freqhi)
# get normal frame
frame = make_frame(song.data, nhop, nfft)
# print(frame.shape)
# melscaling
processedframe = fft(window*frame)[:, :nfft//2+1]
processedframe = np.dot(filt, np.transpose(np.abs(processedframe)**2))
processedframe = 20*np.log10(processedframe+0.1)
# print(processedframe.shape)
feat_channels.append(processedframe)
if include_zero_cross:
song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0]
print(song.zero_crossing)
return np.array(feat_channels)
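# With the defaults, the returned feats array has shape (len(nffts), mel_nband, n_frames) = (3, 80, T),
# matching the 3-channel spectrogram stack convNet expects as input.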
#@jit(parallel=True)
def multi_fft_and_melscale(songs, nhop=512, nffts=[1024, 2048, 4096], mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0, include_zero_cross=False):
for i in prange(len(songs)):
songs[i].feats = fft_and_melscale(songs[i], nhop, nffts, mel_nband, mel_freqlo, mel_freqhi)
def milden(data):
"""put smaller value(0.25) to plus minus 1 frame."""
for i in range(data.shape[0]):
if data[i] == 1:
if i > 0:
data[i-1] = 0.25
if i < data.shape[0] - 1:
data[i+1] = 0.25
if data[i] == 0.26:
if i > 0:
data[i-1] = 0.1
if i < data.shape[0] - 1:
data[i+1] = 0.1
return data
def smooth(x, window_len=11, window='hanning'):
if x.ndim != 1:
raise ValueError
if x.size < window_len:
raise ValueError
if window_len < 3:
return x
if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
raise ValueError
s = np.r_[x[window_len-1:0:-1], x, x[-2:-window_len-1:-1]]
# print(len(s))
if window == 'flat': # moving average
w = np.ones(window_len, 'd')
else:
w = getattr(np, window)(window_len)
y = np.convolve(w/w.sum(), s, mode='valid')
return y
def music_for_train(serv, deletemusic=True, verbose=False, nhop=512, nffts=[1024, 2048, 4096], mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0, include_zero_cross=False):
songplaces = glob(serv)
songs = []
for songplace in songplaces:
if verbose:
print(songplace)
songname = songplace.split("/")[-1]
song = Audio(glob(songplace+"/*.ogg")[0])
song.import_ass(glob(songplace+"/*.ass")[-1])
song.data = (song.data[:, 0]+song.data[:, 1])/2
song.feats = fft_and_melscale(song, nhop, nffts, mel_nband, mel_freqlo, mel_freqhi)
if deletemusic:
song.data = None
with open(f'./data/pickles/{songname:s}.pickle', mode='wb') as f:
pickle.dump(song, f)
def music_for_test(serv, deletemusic=True, verbose=False):
song = Audio(glob(serv+"/*.ogg")[0], stereo=False)
# song.import_tja(glob(serv+"/*.tja")[-1])
song.feats = fft_and_melscale(song, include_zero_cross=False)
with open('./data/pickles/test_data.pickle', mode='wb') as f:
pickle.dump(song, f)
if __name__ == "__main__":
if sys.argv[1] == 'train':
print("preparing all train data processing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_train(serv, verbose=True)
print("all train data processing done!")
if sys.argv[1] == 'test':
print("test data proccesing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_test(serv)
print("test data processing done!")
from cnn.model import *
from cnn.music_processor import *
from assUtils import AssWriter
import pickle
import numpy as np
from scipy.signal import argrelmax
from librosa.util import peak_pick
from librosa.onset import onset_detect
def segment(songfile):
song = Audio(songfile, stereo=False)
song.feats = fft_and_melscale(song, include_zero_cross=False)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = convNet()
net = net.to(device)
if torch.cuda.is_available():
net.load_state_dict(torch.load('./models/model.pth'))
else:
net.load_state_dict(torch.load('./models/model.pth', map_location='cpu'))
inference = net.infer(song.feats, device, minibatch=4192)
inference = np.reshape(inference, (-1))
return detection(inference, song.samplerate)
def detection(inference, samplerate):
inference = smooth(inference, 5)
timestamp = peak_pick(inference, pre_max=1, post_max=2, pre_avg=4, post_avg=5, delta=0.05, wait=3) # in practice the detected onset corresponds to the sound around the 7th frame of the window
timestamp = timestamp*512/samplerate
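# peak_pick returns frame indices; multiplying by 512/samplerate converts them to seconds,
# e.g. a peak at frame 430 with a 44.1 kHz samplerate maps to 430 * 512 / 44100 ≈ 4.99 s.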
return timestamp
if __name__ == '__main__':
onsets = segment(sys.argv[1])
syls = [[t, ''] for t in onsets]
print(syls)
writer = AssWriter()
writer.openAss("./media/test.ass")
writer.writeHeader()
writer.writeSyls(syls)
writer.closeAss()
from cnn.music_processor import *
if sys.argv[1] == 'train':
print("preparing all train data processing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_train(serv, verbose=True)
print("all train data processing done!")
if sys.argv[1] == 'test':
print("test data proccesing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_test(serv)
print("test data processing done!")
\ No newline at end of file
from cnn.model import *
from cnn.music_processor import *
from glob import glob
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = convNet()
net = net.to(device)
songplaces = glob('./data/pickles/*.pickle')
songs = []
for songplace in songplaces:
with open(songplace, mode='rb') as f:
song = pickle.load(f)
songs.append(song)
minibatch = 128
soundlen = 15
epoch = 100
net.train(songs=songs, minibatch=minibatch, val_song=None, epoch=epoch, device=device, soundlen=soundlen, save_place='./models/model.pth', log='./data/log/log.txt')
\ No newline at end of file
File moved
import librosa
import numpy as np
# import matplotlib.pyplot as plt
import sys
class Segment:
def __init__(self, file):
self.file = file
def onsets(self):
'''
Use librosa's onset detection to detect syllable start times
'''
y, sr = librosa.load(self.file)
o_env = librosa.onset.onset_strength(y=y, sr=sr)
times = librosa.times_like(o_env, sr=sr)
onset_raw = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
onset_bt = librosa.onset.onset_backtrack(onset_raw, o_env)
S = np.abs(librosa.stft(y=y))
rms = librosa.feature.rms(S=S)
onset_bt_rms = librosa.onset.onset_backtrack(onset_raw, rms[0])
onset_bt_times = librosa.frames_to_time(onset_bt, sr=sr)
onset_bt_rms_times = librosa.frames_to_time(onset_bt_rms, sr=sr)
onset_raw_times = librosa.frames_to_time(onset_raw, sr=sr)
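# Only the raw onset times (in seconds) are returned below; the backtracked variants
# are computed for the plotting code that is kept commented out underneath.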
# print(onset_bt_rms_times)
'''
fig, ax = plt.subplots(nrows=3, sharex=True)
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),y_axis='log', x_axis='time', ax=ax[0])
ax[0].label_outer()
ax[1].plot(times, o_env, label='Onset strength')
ax[1].vlines(librosa.frames_to_time(onset_raw), 0, o_env.max(), label='Raw onsets')
ax[1].vlines(librosa.frames_to_time(onset_bt), 0, o_env.max(), label='Backtracked', color='r')
ax[1].legend()
ax[1].label_outer()
ax[2].plot(times, rms[0], label='RMS')
ax[2].vlines(librosa.frames_to_time(onset_bt_rms), 0, rms.max(), label='Backtracked (RMS)', color='r')
ax[2].legend()
plt.show()
'''
return onset_raw_times
if __name__ == "__main__":
seg = Segment(sys.argv[1])
seg.onsets()
\ No newline at end of file