diff --git a/.gitignore b/.gitignore
index 0b531c7db05713c9016533ac057376e8a061e62d..b1db3f291983c114d523b100e9c3ae149e731c23 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,17 +1,6 @@
-*
-!.gitignore
-!README.md
-!requirements.txt
-!extractWav.sh
-!extractAss.sh
-!karaUtils.py
-!autokara.py
-!assUtils.py
-!process_train_data.sh
-!cnn_prepare_data.py
-!cnn_train.py
-!*/cnn/segment.py
-!*/cnn/music_processor.py
-!*/cnn/model.py
-!rosa/*.py
-media/
\ No newline at end of file
+__pycache__/
+data/
+env/
+media/
+models/
+
diff --git a/README.md b/README.md
index d22d58f3a0514c7ab184a0aa956d5de0c9e97d5c..04b43ea3eeac6eb6cdaf19d7e733738be4f5a9a6 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ An introduction to neural networks and deep learning:
 
 ### Machine Learning & Deep Learning methods
 [Using CNNs on spectrogram images](https://www.ofai.at/~jan.schlueter/pubs/2014_icassp.pdf) (Schlüter, Böck, 2014) :
+- [MADMOM implementation](https://madmom.readthedocs.io/en/v0.16/modules/features/onsets.html)
 - Python implementation for Taiko rythm games : https://github.com/seiichiinoue/odcnn
 
 ### Other methods
@@ -45,7 +46,9 @@ If we ever want to use an AI to identify syllables without a reference lyrics fi
 
 - MKVToolnix (at least the CLI utils)
 - Python >= 3.8
-- PyTorch : follow the instructions [here](https://pytorch.org/get-started/locally/)
+
+Optional :
+- PyTorch for custom model training : follow the instructions [here](https://pytorch.org/get-started/locally/)
 
 All other python modules can be installed directly through pip, see further.
 
@@ -62,35 +65,12 @@ $ pip install -r requirements.txt
 $ deactivate
 ```
 
-Having a CUDA-capable GPU is optional, but can greatly reduce processing time.
+Having a CUDA-capable GPU is optional, but can greatly reduce processing time in some situations.
 
 # Use
 
-## Training
-
-To extract vocals and ASS from MKV video files:
-```bash
-$ ./process_train_data video_folder train_folder
-```
-
-To prepare the training data for the model :
-```bash
-$ python cnn_prepare_data.py train train_folder
-```
-
-Prepared data will be stored in `./data/pickles/train_data.pickle`
-
-To train the model on the prepared data :
-```bash
-$ python cnn_train.py
-```
-
-The model will be written to `./models/model.pth`
-
-
-## Infer
+## Inference
 
 To execute AutoKara on a MKV video file :
 ```bash
diff --git a/assUtils.py b/assUtils.py
index 510e639f8c31ea0d41365f84233cf7abffabe374..3345d5b1665b0c9b21812ab3fcfbf6a9ee3a82c8 100644
--- a/assUtils.py
+++ b/assUtils.py
@@ -51,12 +51,25 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 '''
         self.file.write(header)
 
-    def writeSyls(self, syl_timings):
+    def writeSyls(self, syl_timings, syls_per_line=10000):
         last_syl_dur = 500
-        start_time = timeToDate(syl_timings[0][0])
+        syl_index = 0
+        # Write a full Dialogue line every syls_per_line syllables
+        while syl_index < (len(syl_timings) - syls_per_line):
+            start_time = timeToDate(syl_timings[syl_index][0])
+            end_time = timeToDate(syl_timings[syl_index + syls_per_line][0])
+            line = f'Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,'
+            for i in range(syl_index, syl_index + syls_per_line):
+                syl_dur = round((syl_timings[i+1][0] - syl_timings[i][0]) * 100)
+                line += f'{{\k{syl_dur:d}}}{syl_timings[i][1]:s}'
+            line += '\n'
+            self.file.write(line)
+            syl_index += syls_per_line
+
+        start_time = timeToDate(syl_timings[syl_index][0])
         end_time = timeToDate(syl_timings[-1][0] + last_syl_dur//100)
         line = f'Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,'
-        for i in range(len(syl_timings) - 1):
+        for i in range(syl_index, len(syl_timings) - 1):
             syl_dur = round((syl_timings[i+1][0] - syl_timings[i][0]) * 100)
             line += f'{{\k{syl_dur:d}}}{syl_timings[i][1]:s}'
         line += f'{{\k{last_syl_dur:d}}}{syl_timings[-1][1]:s}\n'
diff --git a/autokara.py b/autokara.py
index c62053a37ca55ae6803f47f29606f6df79bbd07b..e7f69b174336edf669cc884c52275ae95fdea7f7 100644
--- a/autokara.py
+++ b/autokara.py
@@ -6,7 +6,7 @@ import shlex
 from pathlib import Path
 
 from assUtils import AssWriter
-from cnn.segment import segment
+from cnn_madmom.segment import segment
 
 
 parser = argparse.ArgumentParser(description='AutoKara - Automatic karaoke timing tool')
@@ -42,12 +42,12 @@ else:
 print("Identifying syl starts...")
 onsets = segment(vocals_file)
-syls = [[t, ''] for t in onsets]
+syls = [[t, 'la'] for t in onsets]
 
 print("Syls found, writing ASS file...")
 
 writer = AssWriter()
 writer.openAss(ass_file)
 writer.writeHeader()
-writer.writeSyls(syls)
+writer.writeSyls(syls, syls_per_line=10)
 
 writer.closeAss()
diff --git a/cnn_madmom/segment.py b/cnn_madmom/segment.py
new file mode 100644
index 0000000000000000000000000000000000000000..a86921df036ac72943a271da2440cbfae1b1e0a4
--- /dev/null
+++ b/cnn_madmom/segment.py
@@ -0,0 +1,96 @@
+import madmom
+import numpy as np
+import sys
+import matplotlib.pyplot as plt
+
+
+def segment(songfile):
+
+    delay = -4    # constant shift applied to all onsets, in frames
+    backtrack = False
+
+    cnn = madmom.features.onsets.CNNOnsetProcessor()
+    spectral = madmom.features.onsets.SpectralOnsetProcessor('complex_domain')
+
+
+    spec = madmom.audio.spectrogram.Spectrogram(songfile, num_channels=1)
+    filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24)
+    log_spec = madmom.audio.spectrogram.LogarithmicSpectrogram(filt_spec, add=1)
+
+    cnn_function = cnn(songfile, num_channels=1)
+    spectral_function = spectral(songfile, num_channels=1)
+    spectral_function = spectral_function/(spectral_function.max())
+
+    #activation_function = 0.5*cnn_function + 0.5*spectral_function
+    # Harmonic mean: only high where CNN and spectral activations agree
+    activation_function = (2 * cnn_function * spectral_function)/(cnn_function + spectral_function)
+    #activation_function = np.where(spectral_function > 0.14, cnn_function, 0)
+    #onsets = proc(activation_function)
+
+    activation_smoothed = madmom.audio.signal.smooth(activation_function, 20)
+    cnn_smoothed = madmom.audio.signal.smooth(cnn_function, 20)
+    onsets = madmom.features.onsets.peak_picking(activation_smoothed, threshold=0.6, smooth=0)
+    onsets = np.array([o for o in onsets if cnn_smoothed[o] > 0.2])    # keep only peaks the CNN supports
+
+
+    if backtrack:
+        # Backtrack onsets to closest earlier local minimum
+        backtrack_max_frames = 50
+        for i in range(len(onsets)):
+            initial_onset = onsets[i]
+            while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames):
+                onsets[i] -= 1
+
+    onsets = (onsets + delay)/100    # frame indices -> seconds
+
+    print(onsets)
+
+    return onsets
+
+
+
+if __name__ == "__main__":
+    songfile = sys.argv[1]
+
+    backtrack = False
+
+    cnn = madmom.features.onsets.CNNOnsetProcessor()
+    spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler')
+
+
+    spec = madmom.audio.spectrogram.Spectrogram(songfile, num_channels=1)
+    filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24)
+    log_spec = madmom.audio.spectrogram.LogarithmicSpectrogram(filt_spec, add=1)
+
+    cnn_function = cnn(songfile, num_channels=1)
+    spectral_function = spectral(songfile, num_channels=1)
+    spectral_function = spectral_function/(spectral_function.max())
+
+    #activation_function = 0.5*cnn_function + 0.5*spectral_function
+    activation_function = (2 * cnn_function * spectral_function)/(cnn_function + spectral_function)
+    #activation_function = np.where(spectral_function > 0.14, cnn_function, 0)
+    #onsets = proc(activation_function)
+
+    activation_smoothed = madmom.audio.signal.smooth(activation_function, 20)
+    cnn_smoothed = madmom.audio.signal.smooth(cnn_function, 20)
+    onsets = madmom.features.onsets.peak_picking(activation_smoothed, threshold=0.6, smooth=0)
+    onsets = np.array([o for o in onsets if cnn_smoothed[o] > 0.2])
+
+    # Backtrack onsets to closest earlier local minimum
+    if backtrack:
+        backtrack_max_frames = 50
+        for i in range(len(onsets)):
+            initial_onset = onsets[i]
+            while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames):
+                onsets[i] -= 1
+
+    print(onsets/100)
+
+    fig, axs = plt.subplots(nrows=2, sharex=True)
+    axs[0].imshow(log_spec.T, origin='lower', aspect='auto')
+    axs[1].plot(cnn_smoothed)
+    axs[1].plot(spectral_function, color='green')
+    axs[1].plot(activation_smoothed, color='orange')
+    axs[1].vlines(onsets, 0, 1, colors='red')
+
+    plt.show()
diff --git a/requirements.txt b/requirements.txt
index 4a1d9cbfadfab62043cc52f5798be97e66bc7b97..770a61b27edbe79b68aa14593a431c0a841062bf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,8 @@ soundfile
 sklearn
 matplotlib
 numpy
-tqdm
\ No newline at end of file
+tqdm
+scipy
+cython
+mido
+git+https://github.com/CPJKU/madmom.git
\ No newline at end of file
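
For reference, the pieces added above fit together the same way the updated `autokara.py` uses them: `segment()` returns onset times in seconds, and `AssWriter.writeSyls()` groups them into `{\k}`-timed ASS Dialogue lines. A minimal end-to-end sketch (the `vocals.wav` and `out.ass` paths are placeholders, not files shipped with this repo):

```python
from assUtils import AssWriter
from cnn_madmom.segment import segment

onsets = segment('vocals.wav')              # placeholder input; onset times in seconds
syls = [[t, 'la'] for t in onsets]          # dummy 'la' syllable at each onset

writer = AssWriter()
writer.openAss('out.ass')                   # placeholder output path
writer.writeHeader()
writer.writeSyls(syls, syls_per_line=10)    # wrap to a new Dialogue line every 10 syls
writer.closeAss()
```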