Commit 916c9275 authored by Sting

Merge branch 'cnn-segmentation' into 'master'

CNN segmentation

See merge request !1
parents e8416a99 4f733b30
*
!.gitignore
!README.md
!requirements.txt
!extractWav.sh
!extractAss.sh
!karaUtils.py
!autokara.py
!segment.py
!assUtils.py
!process_train_data.sh
!cnn_prepare_data.py
!cnn_train.py
!*/cnn/segment.py
!*/cnn/music_processor.py
!*/cnn/model.py
!rosa/*.py
media/
\ No newline at end of file
@@ -45,10 +45,9 @@ If we ever want to use an AI to identify syllables without a reference lyrics fi
- MKVToolnix (at least the CLI utils)
- Python >= 3.8
- PyTorch : follow the instructions [here](https://pytorch.org/get-started/locally/)
Having a CUDA-capable GPU is optional, but can greatly reduce processing time.
## Setup
All other Python modules can be installed directly through pip; see below.
This project requires at least Python 3.8, and using a virtual environment is strongly recommended.
To install the dependencies, execute in the project directory:
@@ -56,16 +55,43 @@ To install the dependencies, execute in the project directory :
$ python -m venv env # create the virtual environment, do it once
$ source env/bin/activate # use the virtual environment
# Install Demucs (the vocal separation tool)
$ pip install -U demucs
$ pip install librosa
# Install the required python modules
$ pip install -r requirements.txt
# To exit the virtual environment
$ deactivate
```
Having a CUDA-capable GPU is optional, but can greatly reduce processing time.
# Use
## Training
To extract vocals and ASS from MKV video files:
```bash
$ ./process_train_data.sh video_folder train_folder
```
To prepare the training data for the model:
```bash
$ python cnn_prepare_data.py train train_folder
```
Prepared data will be stored in `./data/pickles/train_data.pickle`
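To sanity-check the prepared data, the pickle can be loaded back. A minimal sketch, assuming it is run from the project root so that `cnn.music_processor` (which defines the `Audio` class) is importable:

```python
import pickle
from cnn.music_processor import Audio  # lets pickle rebuild the Audio objects

with open('./data/pickles/train_data.pickle', 'rb') as f:
    songs = pickle.load(f)

# each entry carries its mel-spectrogram stack and its (time, syllable) timestamps
print(len(songs), songs[0].feats.shape, songs[0].timestamp[:5])
```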
To train the model on the prepared data:
```bash
$ python cnn_train.py
```
The model will be written to `./models/model.pth`
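The file holds only the network's weights (its `state_dict`), so it must be loaded into a fresh `convNet` instance, as `cnn/segment.py` does. A minimal sketch:

```python
import torch
from cnn.model import convNet

net = convNet()
# map_location='cpu' lets the weights load on machines without CUDA
net.load_state_dict(torch.load('./models/model.pth', map_location='cpu'))
net.eval()
```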
## Inference
To execute AutoKara on an MKV video file:
```bash
$ python autokara.py video.mkv output.ass
```
@@ -11,6 +11,15 @@ def timeToDate(time):
return f'{hours:02d}:{remainder_mins:02d}:{remainder_sec:.2f}'
def dateToTime(date):
    """
    Convert an ASS timestamp, formatted as H:MM:SS.cs, to an integer number of centiseconds.
    """
    hoursInMinutes = int(date[0:1]) * 60
    minutesInSeconds = (int(date[2:4]) + hoursInMinutes) * 60
    secondsInCentiseconds = (int(date[5:7]) + minutesInSeconds) * 100
    return int(date[8:10]) + secondsInCentiseconds
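# Worked example: dateToTime("0:01:23.45") == 45 + (23 + (1 + 0 * 60) * 60) * 100 == 8345 centiseconds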
class AssWriter:
@@ -44,12 +53,12 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
    def writeSyls(self, syl_timings):
        last_syl_dur = 500
-       start_time = timeToDate(syl_timings[0])
-       end_time = timeToDate(syl_timings[-1] + last_syl_dur//100)
+       start_time = timeToDate(syl_timings[0][0])
+       end_time = timeToDate(syl_timings[-1][0] + last_syl_dur//100)
        line = f'Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,'
        for i in range(len(syl_timings) - 1):
-           syl_dur = round((syl_timings[i+1] - syl_timings[i]) * 100)
-           line += f'{{\k{syl_dur:d}}}'
-       line += f'{{\k{last_syl_dur:d}}}\n'
+           syl_dur = round((syl_timings[i+1][0] - syl_timings[i][0]) * 100)
+           line += f'{{\k{syl_dur:d}}}{syl_timings[i][1]:s}'
+       line += f'{{\k{last_syl_dur:d}}}{syl_timings[-1][1]:s}\n'
        self.file.write(line)
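        # e.g. writeSyls([[12.0, 'ka'], [12.5, 'ra'], [13.0, 'oke']]) writes:
        # Dialogue: 0,00:00:12.00,00:00:18.00,Default,,0,0,0,,{\k50}ka{\k50}ra{\k500}oke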
\ No newline at end of file
@@ -6,7 +6,7 @@ import shlex
from pathlib import Path
from assUtils import AssWriter
-from segment import Segment
+from cnn.segment import segment
parser = argparse.ArgumentParser(description='AutoKara - Automatic karaoke timing tool')
@@ -41,13 +41,13 @@ else:
print("Identifying syl starts...")
-seg = Segment(vocals_file)
-onset_times = seg.onsets()
+onsets = segment(vocals_file)
+syls = [[t, ''] for t in onsets]
print("Syls found, writing ASS file...")
writer = AssWriter()
writer.openAss(ass_file)
writer.writeHeader()
-writer.writeSyls(onset_times)
+writer.writeSyls(syls)
writer.closeAss()
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm import tqdm
from cnn.music_processor import *
"""
On the paper,
Starting from a stack of three spectrogram excerpts,
convolution and max-pooling in turns compute a set of 20 feature maps
classified with a fully-connected network.
"""
class convNet(nn.Module):
"""
copies the neural net used in a paper.
"Improved musical onset detection with Convolutional Neural Networks".
src: https://ieeexplore.ieee.org/document/6854953
"""
def __init__(self):
super(convNet, self).__init__()
# model
self.conv1 = nn.Conv2d(3, 10, (3, 7))
self.conv2 = nn.Conv2d(10, 20, 3)
self.fc1 = nn.Linear(1120, 256)
self.fc2 = nn.Linear(256, 120)
self.fc3 = nn.Linear(120, 1)
def forward(self, x, istraining=False, minibatch=1):
x = F.max_pool2d(F.relu(self.conv1(x)), (3, 1))
x = F.max_pool2d(F.relu(self.conv2(x)), (3, 1))
x = F.dropout(x.view(minibatch, -1), training=istraining)
x = F.dropout(F.relu(self.fc1(x)), training=istraining)
x = F.dropout(F.relu(self.fc2(x)), training=istraining)
return torch.sigmoid(self.fc3(x))  # torch.sigmoid: F.sigmoid is deprecated
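# Shape check (assuming 80 mel bands and soundlen=15, i.e. input (N, 3, 80, 15)):
#   conv1 (3x7) -> (N, 10, 78, 9); max_pool (3,1) -> (N, 10, 26, 9)
#   conv2 (3x3) -> (N, 20, 24, 7); max_pool (3,1) -> (N, 20, 8, 7)
#   flatten -> (N, 20*8*7) = (N, 1120), matching fc1's input size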
def train_data_builder(self, feats, answer, major_note_index, samplerate, soundlen=15, minibatch=1, split=0.2):
"""
Args:
feats: song.feats; Audio module
answer: song.answer; Audio module
major_note_index: indices of the labelled onsets, corresponding to feats
samplerate: song.samplerate; Audio module
soundlen: =15. Horizontal length of the spectrogram excerpt passed to the model; each training sample has size (80 x 15)
minibatch: training minibatch size
split: fraction of the candidate indices actually used (default 0.2)
Variables:
minspace: minimum space between major note indices
maxspace: maximum space between major note indices
idx: index into major_note_index or feats
dist: distance between two notes
"""
# acceptable interval in seconds
minspace = 0.1
maxspace = 0.7
idx = np.random.permutation(major_note_index.shape[0] - soundlen) + soundlen // 2
X, y = [], []
cnt = 0
for i in range(int(idx.shape[0] * split)):
dist = major_note_index[idx[i] + 1] - major_note_index[idx[i]] # distinguish by this value
if dist < maxspace * samplerate / 512 and dist > minspace * samplerate / 512:
for j in range(-1, dist + 2):
X.append(feats[:, :, major_note_index[idx[i]] - soundlen // 2 + j : major_note_index[idx[i]] + soundlen // 2 + j + 1])
y.append(answer[major_note_index[idx[i]] + j])
cnt += 1
if cnt % minibatch == 0:
yield (torch.from_numpy(np.array(X)).float(), torch.from_numpy(np.array(y)).float())
X, y = [], []
def infer_data_builder(self, feats, soundlen=15, minibatch=1):
x = []
for i in range(feats.shape[2] - soundlen):
x.append(feats[:, :, i:i+soundlen])
if (i + 1) % minibatch == 0:
yield (torch.from_numpy(np.array(x)).float())
x = []
if len(x) != 0:
yield (torch.from_numpy(np.array(x)).float())
def train(self, songs, minibatch, epoch, device, soundlen=15, val_song=None, save_place='./models/model.pth', log='./log/log.txt'):
"""
Args:
songs: the list of song
minibatch: minibatch value
epoch: number of train
device: cpu / gpu
soundlen: width of one train data's image
val_song: validation song, if you wanna validation while training, give a path of validation song data.
save_place: save place path
log: log place path
don-ka: don(1) or ka(2) or both(0), usually, firstly, train don, then, train ka.
"""
for song in songs:
timing = np.array([syl[0] for syl in song.timestamp])
syllable = np.array([syl[1] for syl in song.timestamp])
song.answer = np.zeros((song.feats.shape[2]))
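# song.timestamp holds (seconds, syllable) pairs; multiplying by samplerate/512
# maps each labelled onset onto the nhop=512 feature-frame grid of fft_and_melscale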
song.major_note_index = np.rint(timing[np.where(syllable != "")] * song.samplerate/512).astype(np.int32)
song.major_note_index = np.delete(song.major_note_index, np.where(song.major_note_index >= song.feats.shape[2]))
song.answer[song.major_note_index] = 1
song.answer = milden(song.answer)
# training
optimizer = optim.SGD(self.parameters(), lr=0.02)
criterion = nn.MSELoss()
running_loss = 0
val_loss = 0
for i in range(epoch):
for song in songs:
for X, y in self.train_data_builder(song.feats, song.answer, song.major_note_index, song.samplerate, soundlen, minibatch, split=0.2):
optimizer.zero_grad()
output = self(X.to(device), istraining=True, minibatch=minibatch)
target = y.to(device)
loss = criterion(output.squeeze(), target)
loss.backward()
optimizer.step()
running_loss += loss.data.item()
with open(log, 'a') as f:
print("epoch: %.d running_loss: %.10f " % (i+1, running_loss), file=f)
print("epoch: %.d running_loss: %.10f" % (i+1, running_loss))
running_loss = 0
if val_song:
inference = torch.from_numpy(self.infer(val_song.feats, device, minibatch=512)).to(device)
target = torch.from_numpy(val_song.answer[:-soundlen]).float().to(device)
loss = criterion(inference.squeeze(), target)
val_loss = loss.data.item()
with open(log, 'a') as f:
print("val_loss: %.10f " % (val_loss), file=f)
torch.save(self.state_dict(), save_place)
def infer(self, feats, device, minibatch=1):
with torch.no_grad():
inference = None
for x in tqdm(self.infer_data_builder(feats, minibatch=minibatch), total=feats.shape[2]//minibatch):
output = self(x.to(device), minibatch=x.shape[0])
if inference is not None:
inference = np.concatenate((inference, output.cpu().numpy().reshape(-1)))
else:
inference = output.cpu().numpy().reshape(-1)
return np.array(inference).reshape(-1)
if __name__ == '__main__':
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = convNet()
net = net.to(device)
with open('./data/pickles/train_data.pickle', mode='rb') as f:
songs = pickle.load(f)
minibatch = 128
soundlen = 15
epoch = 100
net.train(songs=songs, minibatch=minibatch, val_song=None, epoch=epoch, device=device, soundlen=soundlen, save_place='./models/model.pth', log='./data/log/log.txt')
\ No newline at end of file
import soundfile as sf
import matplotlib.pyplot as plt
import numpy as np
import os
from glob import glob
from scipy import signal
from scipy.fftpack import fft
from librosa.filters import mel
from librosa.display import specshow
from librosa import stft
from librosa.effects import pitch_shift
import pickle
import sys
from numba import jit, prange
from sklearn.preprocessing import normalize
import re
from assUtils import dateToTime, timeToDate
class Audio:
"""
Audio class which holds music data and timestamps for notes.
Args:
filename: file name.
stereo: True or False; whether the file is stereo or not. Normally True.
Example:
>>> from music_processor import *
>>> song = Audio(filename)
>>> # to get audio data
>>> song.data
>>> # to import .ass files:
>>> song.import_ass(filename)
>>> # to get data converted
>>> song.data = (song.data[:,0]+song.data[:,1])/2
>>> fft_and_melscale(song, include_zero_cross=False)
"""
def __init__(self, filename, stereo=True):
self.data, self.samplerate = sf.read(filename, always_2d=True)
if stereo is False:
self.data = (self.data[:, 0]+self.data[:, 1])/2
self.timestamp = []
def plotaudio(self, start_t, stop_t):
plt.plot(np.linspace(start_t, stop_t, stop_t-start_t), self.data[start_t:stop_t, 0])
plt.show()
def save(self, filename="./savedmusic.wav", start_t=0, stop_t=None):
if stop_t is None:
stop_t = self.data.shape[0]
sf.write(filename, self.data[start_t:stop_t], self.samplerate)
def import_ass(self, filename):
with open(filename, 'r') as f:
CONTENT = f.read()
LINES_KARA = re.compile(r"Comment:.*(\d+:\d{2}:\d{2}\.\d{2}),(\d+:\d{2}:\d{2}\.\d{2}),.*,karaoke,(.*)\n")
RGX_TAGS = re.compile(r"\{\\k(\d+)\}([^\{\n\r]*)")
SYLS = []
for line in LINES_KARA.findall(CONTENT):
lastTime = dateToTime(line[0])
for couple in RGX_TAGS.findall(line[2]):
self.timestamp.append((lastTime/100, couple[1]))
lastTime += int(couple[0])
self.timestamp = np.array(self.timestamp, dtype='float, object')
def make_frame(data, nhop, nfft):
"""
Helper function for fft_and_melscale.
To produce training data cut into short time slices, returns an array of nfft-sized windows, each shifted nhop (512) samples from the previous one.
"""
length = data.shape[0]
framedata = np.concatenate((data, np.zeros(nfft))) # zero padding
return np.array([framedata[i*nhop:i*nhop+nfft] for i in range(length//nhop)])
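# Example (illustrative numbers): 60 s of audio at 44.1 kHz with nhop=512 gives
# 44100*60 // 512 = 5167 frames, each nfft samples long (zero-padded at the end)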
#@jit
def fft_and_melscale(song, nhop=512, nffts=[1024, 2048, 4096], mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0, include_zero_cross=False):
"""
fft and melscale method.
fft: nfft = [1024, 2048, 4096]; サンプルの切り取る長さを変えながらデータからnp.arrayを抽出して高速フーリエ変換を行う.
melscale: 周波数の次元を削減するとともに,log10の値を取っている.
"""
feat_channels = []
for nfft in nffts:
feats = []
window = signal.blackmanharris(nfft)
filt = mel(sr=song.samplerate, n_fft=nfft, n_mels=mel_nband, fmin=mel_freqlo, fmax=mel_freqhi)
# get normal frame
frame = make_frame(song.data, nhop, nfft)
# print(frame.shape)
# melscaling
processedframe = fft(window*frame)[:, :nfft//2+1]
processedframe = np.dot(filt, np.transpose(np.abs(processedframe)**2))
processedframe = 20*np.log10(processedframe+0.1)
# print(processedframe.shape)
feat_channels.append(processedframe)
if include_zero_cross:
song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0]
print(song.zero_crossing)
return np.array(feat_channels)
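# The returned array has shape (len(nffts), mel_nband, n_frames), i.e. (3, 80, n_frames)
# with the defaults: the three input channels that convNet.forward expects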
#@jit(parallel=True)
def multi_fft_and_melscale(songs, nhop=512, nffts=[1024, 2048, 4096], mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0, include_zero_cross=False):
for i in prange(len(songs)):
songs[i].feats = fft_and_melscale(songs[i], nhop, nffts, mel_nband, mel_freqlo, mel_freqhi)
def milden(data):
"""put smaller value(0.25) to plus minus 1 frame."""
for i in range(data.shape[0]):
if data[i] == 1:
if i > 0:
data[i-1] = 0.25
if i < data.shape[0] - 1:
data[i+1] = 0.25
if data[i] == 0.26:
if i > 0:
data[i-1] = 0.1
if i < data.shape[0] - 1:
data[i+1] = 0.1
return data
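# e.g. milden(np.array([0., 0., 1., 0., 0.])) -> [0., 0.25, 1., 0.25, 0.]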
def smooth(x, window_len=11, window='hanning'):
if x.ndim != 1:
raise ValueError
if x.size < window_len:
raise ValueError
if window_len < 3:
return x
if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
raise ValueError
s = np.r_[x[window_len-1:0:-1], x, x[-2:-window_len-1:-1]]
# print(len(s))
if window == 'flat': # moving average
w = np.ones(window_len, 'd')
else:
w = getattr(np, window)(window_len)  # e.g. np.hanning(window_len), without eval
y = np.convolve(w/w.sum(), s, mode='valid')
return y
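# Note: with mode='valid' on the reflection-padded signal, smooth() returns
# len(x) + window_len - 1 samples, slightly longer than its input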
def music_for_train(serv, deletemusic=True, verbose=False, nhop=512, nffts=[1024, 2048, 4096], mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0, include_zero_cross=False):
songplaces = glob(serv)
songs = []
for songplace in songplaces:
if verbose:
print(songplace)
song = Audio(glob(songplace+"/*.ogg")[0])
song.import_ass(glob(songplace+"/*.ass")[-1])
song.data = (song.data[:, 0]+song.data[:, 1])/2
songs.append(song)
multi_fft_and_melscale(songs, nhop, nffts, mel_nband, mel_freqlo, mel_freqhi, include_zero_cross=include_zero_cross)
if deletemusic:
for song in songs:
song.data = None
with open('./data/pickles/train_data.pickle', mode='wb') as f:
pickle.dump(songs, f)
def music_for_test(serv, deletemusic=True, verbose=False):
song = Audio(glob(serv+"/*.ogg")[0], stereo=False)
# song.import_tja(glob(serv+"/*.tja")[-1])
song.feats = fft_and_melscale(song, include_zero_cross=False)
with open('./data/pickles/test_data.pickle', mode='wb') as f:
pickle.dump(song, f)
if __name__ == "__main__":
if sys.argv[1] == 'train':
print("preparing all train data processing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_train(serv, verbose=True)
print("all train data processing done!")
if sys.argv[1] == 'test':
print("test data proccesing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_test(serv)
print("test data processing done!")
from cnn.model import *
from cnn.music_processor import *
from assUtils import AssWriter
import pickle
import numpy as np
from scipy.signal import argrelmax
from librosa.util import peak_pick
from librosa.onset import onset_detect
def segment(songfile):
song = Audio(songfile, stereo=False)
song.feats = fft_and_melscale(song, include_zero_cross=False)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = convNet()
net = net.to(device)
if torch.cuda.is_available():
net.load_state_dict(torch.load('./models/model.pth'))
else:
net.load_state_dict(torch.load('./models/model.pth', map_location='cpu'))
inference = net.infer(song.feats, device, minibatch=4192)
inference = np.reshape(inference, (-1))
return detection(inference, song.samplerate)
def detection(inference, samplerate):
inference = smooth(inference, 5)
timestamp = peak_pick(inference, pre_max=1, post_max=2, pre_avg=4, post_avg=5, delta=0.05, wait=3)  # in practice this picks the sound around the 7th frame
timestamp = timestamp*512/samplerate  # frame indices -> seconds (nhop = 512)
return timestamp
if __name__ == '__main__':
onsets = segment(sys.argv[1])
syls = [[t, ''] for t in onsets]
print(syls)
writer = AssWriter()
writer.openAss("./media/test.ass")
writer.writeHeader()
writer.writeSyls(syls)
writer.closeAss()
from cnn.music_processor import *
if sys.argv[1] == 'train':
print("preparing all train data processing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_train(serv, verbose=True)
print("all train data processing done!")
if sys.argv[1] == 'test':
print("test data proccesing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_test(serv)
print("test data processing done!")
\ No newline at end of file
from cnn.model import *
from cnn.music_processor import *
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = convNet()
net = net.to(device)
with open('./data/pickles/train_data.pickle', mode='rb') as f:
songs = pickle.load(f)
minibatch = 128
soundlen = 15
epoch = 100
net.train(songs=songs, minibatch=minibatch, val_song=None, epoch=epoch, device=device, soundlen=soundlen, save_place='./models/model.pth', log='./data/log/log.txt')
\ No newline at end of file
#!/usr/bin/env python3
import re
import sys
try:
    FILE = sys.argv[1]
except IndexError:
    print("usage : %s input.ass" % sys.argv[0])
    sys.exit(1)  # without this the script would continue with FILE undefined
with open(FILE, 'r') as f:
CONTENT = f.read()
LINES_KARA = re.compile(r"Comment:.*(\d+:\d{2}:\d{2}\.\d{2}),(\d+:\d{2}:\d{2}\.\d{2}),.*,karaoke,(.*)\n")
RGX_TAGS = re.compile(r"\{\\k(\d+)\}([^\{\n\r]*)")
SYLS = []
def dateToTime(date):
    """
    Convert an ASS timestamp, formatted as H:MM:SS.cs, to an integer number of centiseconds.
    """
    hoursInMinutes = int(date[0:1]) * 60
    minutesInSeconds = (int(date[2:4]) + hoursInMinutes) * 60
    secondsInCentiseconds = (int(date[5:7]) + minutesInSeconds) * 100
    return int(date[8:10]) + secondsInCentiseconds
for line in LINES_KARA.findall(CONTENT):
lastTime = dateToTime(line[0])
for couple in RGX_TAGS.findall(line[2]):
SYLS.append((lastTime, couple[1], couple[0]))
lastTime += int(couple[0])
print(SYLS)
#!/bin/bash
USAGE_MESSAGE="usage : $0 video_folder train_folder"
if [ $# != 2 ]; then
echo "$USAGE_MESSAGE"; exit 1;
fi
video_folder=$1
train_folder=$2
for filename in "$video_folder"/*.mkv; do
name=${filename##*/}
base=${name%.mkv}
mkdir -p "$train_folder/$base"
./extractWav.sh "$filename" "$train_folder/$base/$base.wav"
demucs --two-stems vocals -o "$train_folder/$base" "$train_folder/$base/$base.wav"
rm "$train_folder/$base/$base.wav"
ffmpeg -i "$train_folder/$base/htdemucs/$base/vocals.wav" "$train_folder/$base/vocals.ogg"
rm -r "$train_folder/$base/htdemucs"
./extractAss.sh "$filename" "$train_folder/$base/vocals.ass"
done;
\ No newline at end of file
librosa
demucs
chainer
soundfile
scikit-learn
matplotlib
numpy
tqdm
\ No newline at end of file