Commit 2c550016 authored by Sting

Cleanup obsolete code

parent 8a884653
1 merge request: !3 Madmom
@@ -6,7 +6,7 @@ import shlex
from pathlib import Path
from assUtils import AssWriter, getSyls
-from cnn_madmom.segment import segment
+from autosyl.segment import segment
parser = argparse.ArgumentParser(description='AutoKara - Automatic karaoke timing tool')
File moved
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from tqdm import tqdm
from cnn.music_processor import *
import pickle
"""
From the paper:
starting from a stack of three spectrogram excerpts,
convolution and max-pooling in turn compute a set of 20 feature maps
that are classified with a fully-connected network.
"""
class convNet(nn.Module):
"""
Reimplements the neural net from the paper
"Improved musical onset detection with Convolutional Neural Networks".
src: https://ieeexplore.ieee.org/document/6854953
"""
def __init__(self):
super(convNet, self).__init__()
# model
self.conv1 = nn.Conv2d(3, 10, (3, 7))
self.conv2 = nn.Conv2d(10, 20, 3)
self.fc1 = nn.Linear(1120, 256)
self.fc2 = nn.Linear(256, 120)
self.fc3 = nn.Linear(120, 1)
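# Shape check (assuming the (3, 80, 15) input windows built by train_data_builder below):
#   conv1 (3 -> 10, kernel (3, 7)): (10, 78, 9)  -> max_pool (3, 1): (10, 26, 9)
#   conv2 (10 -> 20, kernel 3):     (20, 24, 7)  -> max_pool (3, 1): (20, 8, 7)
#   flattened: 20 * 8 * 7 = 1120, which is why fc1 takes 1120 inputs.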
def forward(self, x, istraining=False, minibatch=1):
x = F.max_pool2d(F.relu(self.conv1(x)), (3, 1))
x = F.max_pool2d(F.relu(self.conv2(x)), (3, 1))
x = F.dropout(x.view(minibatch, -1), training=istraining)
x = F.dropout(F.relu(self.fc1(x)), training=istraining)
x = F.dropout(F.relu(self.fc2(x)), training=istraining)
return torch.sigmoid(self.fc3(x))
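# Minimal usage sketch (illustrative only; the (3, 80, 15) window shape is an assumption
# based on the feature extraction in music_processor):
#   net = convNet()
#   x = torch.randn(1, 3, 80, 15)   # one spectrogram window
#   p = net(x)                      # -> tensor of shape (1, 1): onset probability for this window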
def train_data_builder(self, feats, answer, major_note_index, samplerate, soundlen=15, minibatch=1, split=0.2):
"""
Args:
feats: song.feats; Audio module
answer: song.answer; Audio module
major_note_index: frame indices of the onset labels, aligned with feats
samplerate: song.samplerate; Audio module
soundlen: =15. Horizontal length (in frames) of the image data passed to the model; (80 * 15)-sized windows are used here
minibatch: training minibatch size
split: fraction of the candidate indices actually used (defaults to 0.2)
Variables:
minspace: minimum spacing between major note indices
maxspace: maximum spacing between major note indices
idx: index into major_note_index or feats
dist: distance between two notes
"""
# acceptable interval in seconds
minspace = 0.1
maxspace = 0.7
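# For example, at a 44.1 kHz samplerate these bounds translate to roughly
# 0.1 * 44100 / 512 ≈ 8.6 frames and 0.7 * 44100 / 512 ≈ 60.3 frames between onsets.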
idx = np.random.permutation(major_note_index.shape[0] - soundlen) + soundlen // 2
X, y = [], []
cnt = 0
for i in range(int(idx.shape[0] * split)):
dist = major_note_index[idx[i] + 1] - major_note_index[idx[i]] # distinguish by this value
if dist < maxspace * samplerate / 512 and dist > minspace * samplerate / 512:
for j in range(-1, dist + 2):
X.append(feats[:, :, major_note_index[idx[i]] - soundlen // 2 + j : major_note_index[idx[i]] + soundlen // 2 + j + 1])
y.append(answer[major_note_index[idx[i]] + j])
cnt += 1
if cnt % minibatch == 0:
yield (torch.from_numpy(np.array(X)).float(), torch.from_numpy(np.array(y)).float())
X, y = [], []
def infer_data_builder(self, feats, soundlen=15, minibatch=1):
x = []
for i in range(feats.shape[2] - soundlen):
x.append(feats[:, :, i:i+soundlen])
if (i + 1) % minibatch == 0:
yield (torch.from_numpy(np.array(x)).float())
x = []
if len(x) != 0:
yield (torch.from_numpy(np.array(x)).float())
def train(self, songs, minibatch, epoch, device, soundlen=15, val_song=None, save_place='./models/model.pth', log='./log/log.txt'):
"""
Args:
songs: the list of songs
minibatch: minibatch size
epoch: number of training epochs
device: cpu / gpu
soundlen: width (in frames) of one training example's image
val_song: validation song; to validate while training, pass the validation song's Audio object
save_place: path where the trained model is saved
log: path to the log file
"""
for song in songs:
timing = np.array([syl[0] for syl in song.timestamp])
syllable = np.array([syl[1] for syl in song.timestamp])
song.answer = np.zeros((song.feats.shape[2]))
song.major_note_index = np.rint(timing[np.where(syllable != 0)] * song.samplerate/512).astype(np.int32)
song.major_note_index = np.delete(song.major_note_index, np.where(song.major_note_index >= song.feats.shape[2]))
song.answer[song.major_note_index] = 1
song.answer = milden(song.answer)
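# At this point song.answer is a soft per-frame target: 1.0 on onset frames,
# 0.25 on the frames directly before and after them (see milden()), and 0 elsewhere.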
# training
optimizer = optim.SGD(self.parameters(), lr=0.02)
criterion = nn.MSELoss()
running_loss = 0
val_loss = 0
for i in range(epoch):
for song in songs:
for X, y in self.train_data_builder(song.feats, song.answer, song.major_note_index, song.samplerate, soundlen, minibatch, split=0.2):
optimizer.zero_grad()
output = self(X.to(device), istraining=True, minibatch=minibatch)
target = y.to(device)
loss = criterion(output.squeeze(), target)
loss.backward()
optimizer.step()
running_loss += loss.data.item()
with open(log, 'a') as f:
print("epoch: %.d running_loss: %.10f " % (i+1, running_loss), file=f)
print("epoch: %.d running_loss: %.10f" % (i+1, running_loss))
running_loss = 0
if val_song:
inference = torch.from_numpy(self.infer(val_song.feats, device, minibatch=512)).to(device)
target = torch.from_numpy(val_song.answer[:-soundlen]).float().to(device)
loss = criterion(inference.squeeze(), target)
val_loss = loss.data.item()
with open(log, 'a') as f:
print("val_loss: %.10f " % (val_loss), file=f)
torch.save(self.state_dict(), save_place)
def infer(self, feats, device, minibatch=1):
with torch.no_grad():
inference = None
for x in tqdm(self.infer_data_builder(feats, minibatch=minibatch), total=feats.shape[2]//minibatch):
output = self(x.to(device), minibatch=x.shape[0])
if inference is not None:
inference = np.concatenate((inference, output.cpu().numpy().reshape(-1)))
else:
inference = output.cpu().numpy().reshape(-1)
return np.array(inference).reshape(-1)
if __name__ == '__main__':
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = convNet()
net = net.to(device)
with open('./data/pickles/train_data.pickle', mode='rb') as f:
songs = pickle.load(f)
minibatch = 128
soundlen = 15
epoch = 100
net.train(songs=songs, minibatch=minibatch, val_song=None, epoch=epoch, device=device, soundlen=soundlen, save_place='./models/model.pth', log='./data/log/log.txt')
\ No newline at end of file
import soundfile as sf
import matplotlib.pyplot as plt
import numpy as np
import os
from glob import glob
from scipy import signal
from scipy.fftpack import fft
from librosa.filters import mel
from librosa.display import specshow
from librosa import stft
from librosa.effects import pitch_shift
import pickle
import sys
from numba import jit, prange
from sklearn.preprocessing import normalize
import re
from assUtils import dateToTime, timeToDate
class Audio:
"""
Audio class that holds the music data and the note timestamps.
Args:
filename: file name.
stereo: True or False; whether the audio file is stereo. Normally True.
Variables:
Example:
>>>from music_processor import *
>>>song = Audio(filename)
>>># to get audio data
>>>song.data
>>># to import .tja files:
>>>song.import_tja(filename)
>>># to get data converted
>>>song.data = (song.data[:,0]+song.data[:,1])/2
>>>fft_and_melscale(song, include_zero_cross=False)
"""
def __init__(self, filename, stereo=True):
self.data, self.samplerate = sf.read(filename, always_2d=True)
if stereo is False:
self.data = (self.data[:, 0]+self.data[:, 1])/2
self.timestamp = []
def plotaudio(self, start_t, stop_t):
plt.plot(np.linspace(start_t, stop_t, stop_t-start_t), self.data[start_t:stop_t, 0])
plt.show()
def save(self, filename="./savedmusic.wav", start_t=0, stop_t=None):
if stop_t is None:
stop_t = self.data.shape[0]
sf.write(filename, self.data[start_t:stop_t], self.samplerate)
def import_ass(self, filename):
with open(filename, 'r') as f:
CONTENT = f.read()
LINES_KARA = re.compile(r"Comment:.*(\d+:\d{2}:\d{2}\.\d{2}),(\d+:\d{2}:\d{2}\.\d{2}),.*,karaoke,(.*)\n")
RGX_TAGS = re.compile(r"\{\\k(\d+)\}([^\{\n\r]*)")
SYLS = []
for line in LINES_KARA.findall(CONTENT):
lastTime = dateToTime(line[0])
for couple in RGX_TAGS.findall(line[2]):
self.timestamp.append((lastTime/100, 1 if len(couple[1]) > 0 else 0))
lastTime += int(couple[0])
self.timestamp = np.array(self.timestamp, dtype='float, int')
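# Illustrative example (hypothetical line; assumes dateToTime() returns centiseconds, as the /100 above suggests):
#   "Comment: 1,0:00:05.00,0:00:08.00,Default,,0,0,0,karaoke,{\k25}ka{\k30}ra{\k45}o{\k50}ke"
#   yields timestamps (5.00, 1), (5.25, 1), (5.55, 1), (6.00, 1)  # (seconds, 1 = non-empty syllable)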
def make_frame(data, nhop, nfft):
"""
helper function for fft_and_melscale.
To build training data from short time slices, this returns an array of nfft-sized windows taken from the signal, shifted by nhop (512) samples each.
"""
length = data.shape[0]
framedata = np.concatenate((data, np.zeros(nfft))) # zero padding
return np.array([framedata[i*nhop:i*nhop+nfft] for i in range(length//nhop)])
#@jit
def fft_and_melscale(song, nhop=512, nffts=[1024, 2048, 4096], mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0, include_zero_cross=False):
"""
fft and melscale method.
fft: nfft = [1024, 2048, 4096]; extracts np.arrays from the data with several window lengths and applies the fast Fourier transform.
melscale: reduces the frequency dimension to mel bands and takes log10 values.
"""
feat_channels = []
for nfft in nffts:
feats = []
window = signal.blackmanharris(nfft)
filt = mel(sr=song.samplerate, n_fft=nfft, n_mels=mel_nband, fmin=mel_freqlo, fmax=mel_freqhi)
# get normal frame
frame = make_frame(song.data, nhop, nfft)
# print(frame.shape)
# melscaling
processedframe = fft(window*frame)[:, :nfft//2+1]
processedframe = np.dot(filt, np.transpose(np.abs(processedframe)**2))
processedframe = 20*np.log10(processedframe+0.1)
# print(processedframe.shape)
feat_channels.append(processedframe)
if include_zero_cross:
song.zero_crossing = np.where(np.diff(np.sign(song.data)))[0]
print(song.zero_crossing)
return np.array(feat_channels)
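# With the defaults, the returned feats array has shape (len(nffts), mel_nband, n_frames) = (3, 80, T),
# matching the 3-channel spectrogram stack convNet expects as input.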
#@jit(parallel=True)
def multi_fft_and_melscale(songs, nhop=512, nffts=[1024, 2048, 4096], mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0, include_zero_cross=False):
for i in prange(len(songs)):
songs[i].feats = fft_and_melscale(songs[i], nhop, nffts, mel_nband, mel_freqlo, mel_freqhi)
def milden(data):
"""put smaller value(0.25) to plus minus 1 frame."""
for i in range(data.shape[0]):
if data[i] == 1:
if i > 0:
data[i-1] = 0.25
if i < data.shape[0] - 1:
data[i+1] = 0.25
if data[i] == 0.26:
if i > 0:
data[i-1] = 0.1
if i < data.shape[0] - 1:
data[i+1] = 0.1
return data
def smooth(x, window_len=11, window='hanning'):
if x.ndim != 1:
raise ValueError
if x.size < window_len:
raise ValueError
if window_len < 3:
return x
if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
raise ValueError
s = np.r_[x[window_len-1:0:-1], x, x[-2:-window_len-1:-1]]
# print(len(s))
if window == 'flat': # moving average
w = np.ones(window_len, 'd')
else:
w = getattr(np, window)(window_len)
y = np.convolve(w/w.sum(), s, mode='valid')
return y
def music_for_train(serv, deletemusic=True, verbose=False, nhop=512, nffts=[1024, 2048, 4096], mel_nband=80, mel_freqlo=27.5, mel_freqhi=16000.0, include_zero_cross=False):
songplaces = glob(serv)
songs = []
for songplace in songplaces:
if verbose:
print(songplace)
songname = songplace.split("/")[-1]
song = Audio(glob(songplace+"/*.ogg")[0])
song.import_ass(glob(songplace+"/*.ass")[-1])
song.data = (song.data[:, 0]+song.data[:, 1])/2
song.feats = fft_and_melscale(song, nhop, nffts, mel_nband, mel_freqlo, mel_freqhi)
if deletemusic:
song.data = None
with open(f'./data/pickles/{songname:s}.pickle', mode='wb') as f:
pickle.dump(song, f)
def music_for_test(serv, deletemusic=True, verbose=False):
song = Audio(glob(serv+"/*.ogg")[0], stereo=False)
# song.import_tja(glob(serv+"/*.tja")[-1])
song.feats = fft_and_melscale(song, include_zero_cross=False)
with open('./data/pickles/test_data.pickle', mode='wb') as f:
pickle.dump(song, f)
if __name__ == "__main__":
if sys.argv[1] == 'train':
print("preparing all train data processing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_train(serv, verbose=True)
print("all train data processing done!")
if sys.argv[1] == 'test':
print("test data proccesing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_test(serv)
print("test data processing done!")
from cnn.model import *
from cnn.music_processor import *
from assUtils import AssWriter
import pickle
import numpy as np
from scipy.signal import argrelmax
from librosa.util import peak_pick
from librosa.onset import onset_detect
def segment(songfile):
song = Audio(songfile, stereo=False)
song.feats = fft_and_melscale(song, include_zero_cross=False)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = convNet()
net = net.to(device)
if torch.cuda.is_available():
net.load_state_dict(torch.load('./models/model.pth'))
else:
net.load_state_dict(torch.load('./models/model.pth', map_location='cpu'))
inference = net.infer(song.feats, device, minibatch=4192)
inference = np.reshape(inference, (-1))
return detection(inference, song.samplerate)
def detection(inference, samplerate):
inference = smooth(inference, 5)
timestamp = peak_pick(inference, pre_max=1, post_max=2, pre_avg=4, post_avg=5, delta=0.05, wait=3) # in practice the detected onset corresponds to the sound around the 7th frame of the window
timestamp = timestamp*512/samplerate
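# peak_pick returns frame indices; multiplying by 512/samplerate converts them to seconds,
# e.g. a peak at frame 430 with a 44.1 kHz samplerate maps to 430 * 512 / 44100 ≈ 4.99 s.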
return timestamp
if __name__ == '__main__':
onsets = segment(sys.argv[1])
syls = [[t, ''] for t in onsets]
print(syls)
writer = AssWriter()
writer.openAss("./media/test.ass")
writer.writeHeader()
writer.writeSyls(syls)
writer.closeAss()
from cnn.music_processor import *
if sys.argv[1] == 'train':
print("preparing all train data processing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_train(serv, verbose=True)
print("all train data processing done!")
if sys.argv[1] == 'test':
print("test data proccesing...")
serv = f'./{sys.argv[2]:s}/*'
music_for_test(serv)
print("test data processing done!")
\ No newline at end of file
from cnn.model import *
from cnn.music_processor import *
from glob import glob
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = convNet()
net = net.to(device)
songplaces = glob('./data/pickles/*.pickle')
songs = []
for songplace in songplaces:
with open(songplace, mode='rb') as f:
song = pickle.load(f)
songs.append(song)
minibatch = 128
soundlen = 15
epoch = 100
net.train(songs=songs, minibatch=minibatch, val_song=None, epoch=epoch, device=device, soundlen=soundlen, save_place='./models/model.pth', log='./data/log/log.txt')
\ No newline at end of file
File moved
import librosa
import numpy as np
# import matplotlib.pyplot as plt
import sys
class Segment:
def __init__(self, file):
self.file = file
def onsets(self):
'''
Use librosa's onset detection to detect syllable start times
'''
y, sr = librosa.load(self.file)
o_env = librosa.onset.onset_strength(y=y, sr=sr)
times = librosa.times_like(o_env, sr=sr)
onset_raw = librosa.onset.onset_detect(onset_envelope=o_env, sr=sr)
onset_bt = librosa.onset.onset_backtrack(onset_raw, o_env)
S = np.abs(librosa.stft(y=y))
rms = librosa.feature.rms(S=S)
onset_bt_rms = librosa.onset.onset_backtrack(onset_raw, rms[0])
onset_bt_times = librosa.frames_to_time(onset_bt, sr=sr)
onset_bt_rms_times = librosa.frames_to_time(onset_bt_rms, sr=sr)
onset_raw_times = librosa.frames_to_time(onset_raw, sr=sr)
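# Only the raw onset times (in seconds) are returned below; the backtracked variants
# are computed for the plotting code that is kept commented out underneath.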
# print(onset_bt_rms_times)
'''
fig, ax = plt.subplots(nrows=3, sharex=True)
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),y_axis='log', x_axis='time', ax=ax[0])
ax[0].label_outer()
ax[1].plot(times, o_env, label='Onset strength')
ax[1].vlines(librosa.frames_to_time(onset_raw), 0, o_env.max(), label='Raw onsets')
ax[1].vlines(librosa.frames_to_time(onset_bt), 0, o_env.max(), label='Backtracked', color='r')
ax[1].legend()
ax[1].label_outer()
ax[2].plot(times, rms[0], label='RMS')
ax[2].vlines(librosa.frames_to_time(onset_bt_rms), 0, rms.max(), label='Backtracked (RMS)', color='r')
ax[2].legend()
plt.show()
'''
return onset_raw_times
if __name__ == "__main__":
seg = Segment(sys.argv[1])
seg.onsets()
\ No newline at end of file