diff --git a/.gitignore b/.gitignore index b1db3f291983c114d523b100e9c3ae149e731c23..08ab719c7ab57a2c8438443652cd11f1572c5f0e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ data/ env/ media/ models/ - +build/ +*.egg-info \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..7c6af3d33e05eaea206a2d73856b5224bcbe2b3d --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include requirements.txt +include README.md +include preprocess_media.sh +recursive-include autokara * diff --git a/README.md b/README.md index 5239f2f3ddba3a304e9d93170c8130eeeb04b4f6..793b0e5c15056a43ba456f54c0197e92b010e443 100644 --- a/README.md +++ b/README.md @@ -46,21 +46,36 @@ If we ever want to use an AI to identify syllables without a reference lyrics fi ## Requirements - MKVToolnix (at least the CLI utils) +- FFmpeg - Python >= 3.8 -Optional : -- PyTorch for custom model training : follow the instructions [here](https://pytorch.org/get-started/locally/) - All other python modules can be installed directly through pip, see further. -This project requires at least Python 3.8, and using a virtual environment is strongly recommended. -To install the dependencies, execute in the project directory : +## Install + +The simplest way to install Autokara is through PIP : +```bash +# Using HTTPS +$ pip install git+https://git.iiens.net/bakaclub/autokara.git + +# Or SSH +$ pip install git+ssh://git@git.iiens.net:bakaclub/autokara.git +``` + +Or you can clone the repo and use `pip install <repo_directory>` if you prefer. + + +To use the custom phonetic mappings for Japanese Romaji and other non-English languages, you need to update manually (for now) the g2p DB (within the venv): +```bash +$ autokara-gen-lang +``` + + +If you plan on contributing to development, the use of a virtual environment is recommended : ```bash $ python -m venv env # create the virtual environment, do it once $ source env/bin/activate # use the virtual environement - -# Install the required python modules -$ pip install -r requirements.txt +$ pip install git+ssh://git@git.iiens.net:bakaclub/autokara.git # install autokara # To exit the virtual environment $ deactivate @@ -68,15 +83,11 @@ $ deactivate Having a CUDA-capable GPU is optional, but can greatly reduce processing time in some situations. +## Configuration -To use the custom phonetic mapping for Japanese Romaji, you need to update manually (for now) the g2p DB (within the venv): -```bash -$ cp g2p/mappings/langs/rji/* env/lib/python3.11/site-packages/g2p/mappings/langs/rji/ - -#Then update : -$ g2p update -``` - +Autokara comes with a default config file in `autokara/default.conf`. +If you want to tweak some values (enable CUDA, for example), you should add them to a new config file in your personal config directory : `~/.config/autokara/autokara.conf`. +This new file has priority over the default one, which is used only as fallback for unspecified values. # Use @@ -89,22 +100,22 @@ To use Autokara, you need : To execute AutoKara on a MKV video file and an ASS file containing the lyrics (ASS will be overwritten): ```bash -$ python autokara.py video.mkv lyrics.ass +$ autokara video.mkv lyrics.ass ``` To output to a different file (and keep the original) : ```bash -$ python autokara.py video.mkv lyrics.ass -o output.ass +$ autokara video.mkv lyrics.ass -o output.ass ``` To execute AutoKara on a (pre-extracted) WAV (or OGG, MP3, ...) 
vocals file, pass the `--vocals` flag : ```bash -$ python autokara.py vocals.wav output.ass --vocals +$ autokara vocals.wav output.ass --vocals ``` To use a phonetic transcription optimized for a specific language, use `--lang` (or `-l`) : ```bash -$ python autokara.py vocals.wav output.ass --lang jp +$ autokara vocals.wav output.ass --lang jp ``` Available languages are : ``` @@ -114,7 +125,7 @@ en : English Full help for all options is available with : ```bash -$ python autokara.py -h +$ autokara -h ``` ## Useful scripts @@ -143,7 +154,7 @@ A visualization tool, mainly intended for debug. Does the same as autokara.py, but instead of writing to a file, plots a graphic with onset times, spectrogram, probability curves,... Does not work on video files, only separated vocals audio files ```bash -$ python plot_syls.py vocals.wav lyrics.ass +$ autokara-plot vocals.wav lyrics.ass ``` diff --git a/autokara.py b/autokara.py deleted file mode 100644 index 8fe6f2e6eafc35e54f1f1c6dab83e3f1bab72dce..0000000000000000000000000000000000000000 --- a/autokara.py +++ /dev/null @@ -1,68 +0,0 @@ -import sys -import argparse -import demucs.separate -import subprocess -import shlex -from pathlib import Path - -from autosyl.assUtils import AssWriter, getSyls, getHeader -from autosyl.segment import segment - - -parser = argparse.ArgumentParser(description='AutoKara - Automatic karaoke timing tool') -parser.add_argument("source_file", type=str, help="The video/audio file to time") -parser.add_argument("ass_file", type=str, help="The ASS file with lyrics to time") -parser.add_argument("--vocals", action="store_true", help="Treat the input as vocals file, i.e. do not perform vocals extraction") -parser.add_argument("-o", "--output", help="Write output to specified file. If absent, overwrite source file") -parser.add_argument("-v","--verbose", action="store_true", help="Increased verbosity") -parser.add_argument("-l","--lang", help="Select language to use (default is Japanese Romaji)") - -args = parser.parse_args() - -ass_file = args.ass_file -verbose = args.verbose - -if not args.vocals : - print("Extracting audio from video file...") - Path("./media/audio").mkdir(parents=True, exist_ok=True) - basename = Path(args.source_file).stem - audio_file = "media/audio/%s.wav" % basename - - subprocess.call(shlex.split('./extractWav.sh "%s" "%s"' % (args.source_file, audio_file))) - - Path("./media/vocals").mkdir(parents=True, exist_ok=True) - output_folder = "./media/vocals" - - print("Isolating vocals...") - - # Not working, don't know why - # demucs.separate.main(shlex.split('--two-stems vocals -o "%s" "%s"' % (output_folder, audio_file))) - subprocess.call(shlex.split('demucs --two-stems vocals -o "%s" "%s"' % (output_folder, audio_file))) - - vocals_file = "./media/vocals/htdemucs/%s/vocals.wav" % basename -else: - vocals_file = args.source_file - - - -print("Identifying syl starts...") - - -if verbose: - print("Retrieving syls from lyrics...") -reference_syls, line_meta = getSyls(ass_file) - -if verbose: - print("Starting syl detection...") -syls = segment(vocals_file, reference_syls=reference_syls, verbose=verbose, language=args.lang) -print(syls) -print(line_meta) - -print("Syls found, writing ASS file...") -header = getHeader(ass_file) -writer = AssWriter() -writer.openAss(args.output if args.output else ass_file) -writer.writeHeader(header=header) -writer.writeSyls(syls, line_meta) -writer.closeAss() - diff --git a/autokara/__init__.py b/autokara/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..b31ecc406afe6e17bb27fe80f20ed10e8516b16f --- /dev/null +++ b/autokara/__init__.py @@ -0,0 +1,2 @@ + +__version__ = "0.1.0" \ No newline at end of file diff --git a/autokara/autokara.py b/autokara/autokara.py new file mode 100644 index 0000000000000000000000000000000000000000..ffb7edace7fad76076009591ed2a71cc06cc01b6 --- /dev/null +++ b/autokara/autokara.py @@ -0,0 +1,93 @@ +import sys +import argparse +import demucs.separate +import subprocess +import shlex +from pathlib import Path +from configparser import ConfigParser + +from .autosyl.assUtils import AssWriter, getSyls, getHeader +from .autosyl.segment import segment + + + +def main(opts=None): + + parser = argparse.ArgumentParser(description='AutoKara - Automatic karaoke timing tool') + parser.add_argument("source_file", type=str, help="The video/audio file to time") + parser.add_argument("ass_file", type=str, help="The ASS file with lyrics to time") + parser.add_argument("--vocals", action="store_true", help="Treat the input as vocals file, i.e. do not perform vocals extraction") + parser.add_argument("-o", "--output", help="Write output to specified file. If absent, overwrite source file") + parser.add_argument("-v","--verbose", action="store_true", help="Increased verbosity") + parser.add_argument("-l","--lang", help="Select language to use (default is Japanese Romaji)") + + args = parser.parse_args(opts) + + ass_file = args.ass_file + verbose = args.verbose + + here = Path(__file__).parent + + config = ConfigParser() + config.read([ + str(here / "default.conf"), # Default config file + str(Path().home()/ ".config" / "autokara"/ "autokara.conf") # User config file + ]) + + media_dir = config['Media']['media_dir'] + segment_config = { + 'model': config['Segment']['model'], + 'bdr': config['Segment'].getboolean('bdr'), + 'cuda': config['Segment'].getboolean('cuda'), + 'syl_delay': config['Segment'].getint('syl_delay') + } + + + if not args.vocals : + print("Extracting audio from video file...") + Path(media_dir + "/audio").mkdir(parents=True, exist_ok=True) + basename = Path(args.source_file).stem + audio_file = f"{media_dir:s}/audio/{basename:s}.wav" + + subprocess.call(shlex.split(f'{str(here)}/extractWav.sh "{args.source_file:s}" "{audio_file}"')) + + Path(f"{media_dir:s}/vocals").mkdir(parents=True, exist_ok=True) + output_folder = f"{media_dir:s}/vocals" + + print("Isolating vocals...") + + # Not working, don't know why + # demucs.separate.main(shlex.split('--two-stems vocals -o "%s" "%s"' % (output_folder, audio_file))) + subprocess.call(shlex.split(f'demucs --two-stems vocals -o "{output_folder:s}" "{audio_file:s}"')) + + vocals_file = f"{media_dir:s}/vocals/htdemucs/{basename:s}/vocals.wav" + else: + vocals_file = args.source_file + + + + print("Identifying syl starts...") + + + if verbose: + print("Retrieving syls from lyrics...") + reference_syls, line_meta = getSyls(ass_file) + + if verbose: + print("Starting syl detection...") + syls = segment(vocals_file, segment_config, reference_syls=reference_syls, verbose=verbose, language=args.lang) + print(syls) + print(line_meta) + + print("Syls found, writing ASS file...") + header = getHeader(ass_file) + writer = AssWriter() + writer.openAss(args.output if args.output else ass_file) + writer.writeHeader(header=header) + writer.writeSyls(syls, line_meta) + writer.closeAss() + + +if __name__ == "__main__": + main() + diff --git a/autosyl/LyricsAlignment/LICENSE b/autokara/autosyl/LyricsAlignment/LICENSE similarity index 100% rename from 
autosyl/LyricsAlignment/LICENSE rename to autokara/autosyl/LyricsAlignment/LICENSE diff --git a/autosyl/LyricsAlignment/checkpoints/checkpoint_BDR b/autokara/autosyl/LyricsAlignment/checkpoints/checkpoint_BDR similarity index 100% rename from autosyl/LyricsAlignment/checkpoints/checkpoint_BDR rename to autokara/autosyl/LyricsAlignment/checkpoints/checkpoint_BDR diff --git a/autosyl/LyricsAlignment/checkpoints/checkpoint_Baseline b/autokara/autosyl/LyricsAlignment/checkpoints/checkpoint_Baseline similarity index 100% rename from autosyl/LyricsAlignment/checkpoints/checkpoint_Baseline rename to autokara/autosyl/LyricsAlignment/checkpoints/checkpoint_Baseline diff --git a/autosyl/LyricsAlignment/checkpoints/checkpoint_MTL b/autokara/autosyl/LyricsAlignment/checkpoints/checkpoint_MTL similarity index 100% rename from autosyl/LyricsAlignment/checkpoints/checkpoint_MTL rename to autokara/autosyl/LyricsAlignment/checkpoints/checkpoint_MTL diff --git a/autosyl/LyricsAlignment/model.py b/autokara/autosyl/LyricsAlignment/model.py similarity index 99% rename from autosyl/LyricsAlignment/model.py rename to autokara/autosyl/LyricsAlignment/model.py index f6fd66b10cb934ee1911b50d35b6bbcd13a9320f..50120c275151ad17e553ee01eb53a9d76c420962 100644 --- a/autosyl/LyricsAlignment/model.py +++ b/autokara/autosyl/LyricsAlignment/model.py @@ -4,7 +4,7 @@ import torch.nn.functional as F import torchaudio import warnings -from autosyl.LyricsAlignment.utils import notes_to_pc +from .utils import notes_to_pc # following FFT parameters are designed for a 22.5k sampling rate sr = 22050 diff --git a/autosyl/LyricsAlignment/utils.py b/autokara/autosyl/LyricsAlignment/utils.py similarity index 100% rename from autosyl/LyricsAlignment/utils.py rename to autokara/autosyl/LyricsAlignment/utils.py diff --git a/autosyl/LyricsAlignment/wrapper.py b/autokara/autosyl/LyricsAlignment/wrapper.py similarity index 97% rename from autosyl/LyricsAlignment/wrapper.py rename to autokara/autosyl/LyricsAlignment/wrapper.py index 9e43b4bbe9c59074141669cb7aa11f9e357f91a6..f308ef9872955f4fdf4d45ce384c814fbee8664d 100644 --- a/autosyl/LyricsAlignment/wrapper.py +++ b/autokara/autosyl/LyricsAlignment/wrapper.py @@ -5,8 +5,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -import autosyl.LyricsAlignment.utils as utils -from autosyl.LyricsAlignment.model import train_audio_transforms, AcousticModel, BoundaryDetection +from . 
import utils +from .model import train_audio_transforms, AcousticModel, BoundaryDetection np.random.seed(7) diff --git a/autosyl/assUtils.py b/autokara/autosyl/assUtils.py similarity index 100% rename from autosyl/assUtils.py rename to autokara/autosyl/assUtils.py diff --git a/autosyl/segment.py b/autokara/autosyl/segment.py similarity index 75% rename from autosyl/segment.py rename to autokara/autosyl/segment.py index 6a54e36c971c2c60596bae459872a58418b63000..46155073b8c10738a55de11981295909f38c91e9 100644 --- a/autosyl/segment.py +++ b/autokara/autosyl/segment.py @@ -5,24 +5,25 @@ import re import matplotlib.pyplot as plt import scipy.signal as sg import parselmouth +from pathlib import Path -from autosyl.assUtils import getSyls, timeToDate, dateToTime -from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file +from .assUtils import getSyls, timeToDate, dateToTime +from .LyricsAlignment.wrapper import align, preprocess_from_file -def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, verbose=False, language="jp"): +def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur=500, verbose=False, language="jp"): - delay = -4 + delay = config['syl_delay'] backtrack = False print(reference_syls) - audio_file = songfile # pre-computed source-separated vocals; These models do not work with mixture input. - word_file = None # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it - method = "MTL_BDR" # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR" - cuda=False # set True if you have access to a GPU - checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints" + audio_file = songfile # pre-computed source-separated vocals; + word_file = None # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it + method = config['model'] + ("_BDR" if config['bdr'] else "") # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR" + cuda = config['cuda'] # set True if you have access to a GPU + checkpoint_folder = f"{str(Path(__file__).parent):s}/LyricsAlignment/checkpoints" language = language diff --git a/autokara/default.conf b/autokara/default.conf new file mode 100644 index 0000000000000000000000000000000000000000..82d914ce4a69e2792135de3cb620ee67f36e604e --- /dev/null +++ b/autokara/default.conf @@ -0,0 +1,20 @@ + +[General] + + + +[Media] + +# Where to store temporary media files (such as extracted vocals or ASS tracks) +media_dir = /tmp/autokara + +[Segment] + +# Which model to use. Options are "Baseline" (base) and "MTL" (better) +model = MTL +# Whether to use BDR model for boundary recognition. 
Better, but resource-intensive +bdr = true +# Whether to use CUDA +cuda = false +# Default delay applied to detected syls, in centiseconds +syl_delay = -4 diff --git a/extractAss.sh b/autokara/extractAss.sh similarity index 100% rename from extractAss.sh rename to autokara/extractAss.sh diff --git a/extractWav.sh b/autokara/extractWav.sh similarity index 100% rename from extractWav.sh rename to autokara/extractWav.sh diff --git a/g2p/mappings/langs/rji/config.yaml b/autokara/g2p/mappings/langs/rji/config.yaml similarity index 100% rename from g2p/mappings/langs/rji/config.yaml rename to autokara/g2p/mappings/langs/rji/config.yaml diff --git a/g2p/mappings/langs/rji/rji_abbs.csv b/autokara/g2p/mappings/langs/rji/rji_abbs.csv similarity index 100% rename from g2p/mappings/langs/rji/rji_abbs.csv rename to autokara/g2p/mappings/langs/rji/rji_abbs.csv diff --git a/g2p/mappings/langs/rji/romaji_to_eng-arpa.csv b/autokara/g2p/mappings/langs/rji/romaji_to_eng-arpa.csv similarity index 100% rename from g2p/mappings/langs/rji/romaji_to_eng-arpa.csv rename to autokara/g2p/mappings/langs/rji/romaji_to_eng-arpa.csv diff --git a/autokara/plot_syls.py b/autokara/plot_syls.py new file mode 100644 index 0000000000000000000000000000000000000000..9383639fc6cf3ab2a285eeaf9f8518ae8c53f9b5 --- /dev/null +++ b/autokara/plot_syls.py @@ -0,0 +1,195 @@ +import madmom +import numpy as np +import sys +import re +import matplotlib.pyplot as plt +import scipy.signal as sg +import parselmouth +import argparse + +from .autosyl.assUtils import getSyls, timeToDate, dateToTime +from .autosyl.LyricsAlignment.wrapper import align, preprocess_from_file + + +############################################################################## +# +# This is a test script to visualize extracted onsets and other audio features +# It is mainly intended for development/debug +# +# If you just want to detect the syllables, use the autokara command instead +# +############################################################################## +def main(opts=None): + parser = argparse.ArgumentParser(description='AutoKara - Automatic karaoke timing tool') + parser.add_argument("vocals_file", type=str, help="The audio file to time") + parser.add_argument("ass_file", type=str, help="The ASS file with lyrics to time") + + args = parser.parse_args(opts) + + + songfile = args.vocals_file + reference_syls, line_meta = getSyls(args.ass_file) + + + print(reference_syls) + + backtrack = False + + + + + audio_file = songfile # pre-computed source-separated vocals; These models do not work with mixture input.
+ word_file = None # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it + method = "MTL_BDR" # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR" + cuda=True # set True if you have access to a GPU + checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints" + + pred_file = "./MTL.csv" # saved alignment results, "(float) start_time, (float) end_time, (string) word" + + + lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls] + #print(lyrics_lines) + + + # load audio and lyrics + # words: a list of words + # lyrics_p: phoneme sequence of the target lyrics + # idx_word_p: indices of word start in lyrics_p + # idx_line_p: indices of line start in lyrics_p + audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file) + + # compute alignment + # word_align: a list of frame indices aligned to each word + # words: a list of words + word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=False, checkpoint_folder=checkpoint_folder) + + + print([[word_align[i][0], word_align[i][1], words[i]] for i in range(len(word_align))]) + words_onsets = np.array([word_align[i][0] for i in range(len(word_align))]) + + + cnn = madmom.features.onsets.CNNOnsetProcessor() + spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler') + + sig = madmom.audio.signal.Signal(songfile, num_channels=1) + parsel = parselmouth.Sound(sig) + + spec = madmom.audio.spectrogram.Spectrogram(sig) + filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24) + log_spec = madmom.audio.spectrogram.LogarithmicSpectrogram(filt_spec, add=1) + + magnitude = np.max(log_spec[:,:100], axis=1) + + cnn_function = cnn(sig) + spectral_function = spectral(sig) + spectral_function = spectral_function/(spectral_function.max()) + + #activation_function = 0.5*cnn_function + 0.5*spectral_function + activation_function = (2 * cnn_function * spectral_function)/(cnn_function + spectral_function) + #activation_function = np.where(spectral_function > 0.14, cnn_function, 0) + #onsets = proc(activation_function) + + + if reference_syls: + activation_threshold = 0.1 + else: + activation_threshold = 0.2 + + activation_smoothed = madmom.audio.signal.smooth(activation_function, 20) + cnn_smoothed = madmom.audio.signal.smooth(cnn_function, 20) + onsets = madmom.features.onsets.peak_picking(activation_smoothed, threshold=activation_threshold, smooth=0) + #onsets = np.array([o for o in onsets if cnn_smoothed[o] > 0.1]) + + pitch = parsel.to_pitch() + pitch_values = pitch.selected_array['frequency'] + + pad_before = round(pitch.xs()[0]*100) + pad_after = len(magnitude) - len(pitch_values) - pad_before + + pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0)) + + mask_function = magnitude * pitch_values + mask_function = mask_function/np.max(mask_function) + mask_threshold = 0.15 + mask_window = [1,6] + invalid_onsets_idx = [] + + for i in range(len(onsets)): + if np.max(mask_function[onsets[i]+mask_window[0]:onsets[i]+mask_window[1]]) < mask_threshold: + invalid_onsets_idx.append(i) + + onsets = np.delete(onsets, invalid_onsets_idx) + + + + if reference_syls: + filtered_onsets = [] + line_index = 0 + for line in reference_syls: + line_index += 1 + syl_number = len(line) - 1 + line_onsets = [o for o in onsets if (o >= line[0][0] and o <= line[-1][0])] + line_onsets.sort(reverse=True, key=(lambda x: 
activation_smoothed[x])) + if syl_number > len(line_onsets): + print("WARNING : failed to detect enough onsets in line %d (%d, %d)" % (line_index, line[0][0], line[-1][0])) + filtered_onsets += line_onsets[0:syl_number] + + onsets = np.array(sorted(filtered_onsets)) + + + """ + if word_index > 0: + word_start = max(word_align[word_index][0] - 5, line[0][0], previous_onset+1) + else: + word_start = line[0][0] + if word_index < len(words) - 1 and syl_index < len(line) - 2: + word_end = min(line[-1][0], word_align[word_index + 1][0] - 5) + else: + word_end = line[-1][0] + + word_onsets = [o for o in onsets if (o >= word_start and o <= word_end)] + word_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x])) + if word_syl_count > len(word_onsets): + print("WARNING : failed to detect enough onsets in word %s (%d, %d)" % (word_tmp, word_start, word_end)) + filtered_onsets += word_onsets[0:word_syl_count] + print(word_onsets[0:word_syl_count]) + previous_onset = max(word_onsets[0:word_syl_count] + [0]) + """ + + # Backtrack onsets to closest earlier local minimum + if backtrack: + backtrack_max_frames = 50 + for i in range(len(onsets)): + initial_onset = onsets[i] + while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames): + onsets[i] -= 1 + + #print(onsets/100) + print(words_onsets/100) + + if reference_syls: + reference_onsets = [syl[0]+8 for line in reference_syls for syl in line[:-1]] + + fig, axs = plt.subplots(nrows=2, sharex=True) + axs[0].imshow(log_spec.T, origin='lower', aspect='auto') + if reference_syls: + axs[0].vlines(reference_onsets, 0, 140, colors='red') + axs[0].plot((pitch_values/np.max(pitch_values))*140, color='yellow') + axs[1].plot(mask_function) + #axs[1].plot(cnn_smoothed) + #axs[1].plot(spectral_function, color='green') + axs[1].plot(activation_smoothed, color='orange') + axs[1].vlines(onsets, 0, 2, colors='red') + axs[1].vlines(words_onsets, 0, 3, colors='m') + axs[1].hlines([max(mask_threshold, 0), activation_threshold], 0, onsets[-1]+100, colors='black') + + #bins = np.arange(0, 1, 0.02) + #hist, hist_axs = plt.subplots(nrows=1) + #hist_axs.hist(mask_function, bins=bins) + + plt.show() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/autokara/update_lang_db.py b/autokara/update_lang_db.py new file mode 100644 index 0000000000000000000000000000000000000000..6e0c63e926ebef29287967a04d7fbed6c53cb9f8 --- /dev/null +++ b/autokara/update_lang_db.py @@ -0,0 +1,28 @@ +import subprocess +import shlex +from pathlib import Path +import glob + + +def main(): + from g2p import __path__ as g2p_path + + HERE = Path(__file__).parent + g2p_base = Path(g2p_path[0]) + + print("Regenerating custom language mappings...") + mapping_dir = HERE / "g2p/mappings/langs/" + mappings = glob.glob(f"{str(mapping_dir):s}/*") + for map in mappings: + subprocess.check_call(shlex.split(f'cp -r {map:s} {str(g2p_base):s}/mappings/langs/')) + subprocess.check_call(shlex.split(f'g2p update')) + + if not Path.exists(g2p_base / "mappings/langs/rji"): + print("ERROR : Failed to find language mapping") + else: + print("Setup successful") + + + +if __name__ == "__main__": + main() diff --git a/plot_syls.py b/plot_syls.py deleted file mode 100644 index e86960430761fc37dcc54ebfc60dbcc1a1361da1..0000000000000000000000000000000000000000 --- a/plot_syls.py +++ /dev/null @@ -1,191 +0,0 @@ -import madmom -import numpy as np -import sys -import re -import matplotlib.pyplot as plt -import scipy.signal as sg 
-import parselmouth -import argparse - -from autosyl.assUtils import getSyls, timeToDate, dateToTime -from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file - - -############################################################################## -# -# This is a test script to visualize extracted onsets and other audio features -# It is mainly intended for development/debug -# -# If you just want to detect the syllables, use autokara.py instead -# -############################################################################## - - -parser = argparse.ArgumentParser(description='AutoKara - Automatic karaoke timing tool') -parser.add_argument("vocals_file", type=str, help="The audio file to time") -parser.add_argument("ass_file", type=str, help="The ASS file with lyrics to time") - -args = parser.parse_args() - - -songfile = args.vocals_file -reference_syls, line_meta = getSyls(sys.argv[2]) - - -print(reference_syls) - -backtrack = False - - - - -audio_file = songfile # pre-computed source-separated vocals; These models do not work with mixture input. -word_file = None # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it -method = "MTL_BDR" # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR" -cuda=True # set True if you have access to a GPU -checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints" - -pred_file = "./MTL.csv" # saved alignment results, "(float) start_time, (float) end_time, (string) word" - - -lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls] -#print(lyrics_lines) - - -# load audio and lyrics -# words: a list of words -# lyrics_p: phoneme sequence of the target lyrics -# idx_word_p: indices of word start in lyrics_p -# idx_line_p: indices of line start in lyrics_p -audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file) - -# compute alignment -# word_align: a list of frame indices aligned to each word -# words: a list of words -word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=False, checkpoint_folder=checkpoint_folder) - - -print([[word_align[i][0], word_align[i][1], words[i]] for i in range(len(word_align))]) -words_onsets = np.array([word_align[i][0] for i in range(len(word_align))]) - - -cnn = madmom.features.onsets.CNNOnsetProcessor() -spectral = madmom.features.onsets.SpectralOnsetProcessor('modified_kullback_leibler') - -sig = madmom.audio.signal.Signal(songfile, num_channels=1) -parsel = parselmouth.Sound(sig) - -spec = madmom.audio.spectrogram.Spectrogram(sig) -filt_spec = madmom.audio.spectrogram.FilteredSpectrogram(spec, filterbank=madmom.audio.filters.LogFilterbank, num_bands=24) -log_spec = madmom.audio.spectrogram.LogarithmicSpectrogram(filt_spec, add=1) - -magnitude = np.max(log_spec[:,:100], axis=1) - -cnn_function = cnn(sig) -spectral_function = spectral(sig) -spectral_function = spectral_function/(spectral_function.max()) - -#activation_function = 0.5*cnn_function + 0.5*spectral_function -activation_function = (2 * cnn_function * spectral_function)/(cnn_function + spectral_function) -#activation_function = np.where(spectral_function > 0.14, cnn_function, 0) -#onsets = proc(activation_function) - - -if reference_syls: - activation_threshold = 0.1 -else: - activation_threshold = 0.2 - -activation_smoothed = madmom.audio.signal.smooth(activation_function, 20) -cnn_smoothed = madmom.audio.signal.smooth(cnn_function, 20) -onsets = madmom.features.onsets.peak_picking(activation_smoothed, 
threshold=activation_threshold, smooth=0) -#onsets = np.array([o for o in onsets if cnn_smoothed[o] > 0.1]) - -pitch = parsel.to_pitch() -pitch_values = pitch.selected_array['frequency'] - -pad_before = round(pitch.xs()[0]*100) -pad_after = len(magnitude) - len(pitch_values) - pad_before - -pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0)) - -mask_function = magnitude * pitch_values -mask_function = mask_function/np.max(mask_function) -mask_threshold = 0.15 -mask_window = [1,6] -invalid_onsets_idx = [] - -for i in range(len(onsets)): - if np.max(mask_function[onsets[i]+mask_window[0]:onsets[i]+mask_window[1]]) < mask_threshold: - invalid_onsets_idx.append(i) - -onsets = np.delete(onsets, invalid_onsets_idx) - - - -if reference_syls: - filtered_onsets = [] - line_index = 0 - for line in reference_syls: - line_index += 1 - syl_number = len(line) - 1 - line_onsets = [o for o in onsets if (o >= line[0][0] and o <= line[-1][0])] - line_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x])) - if syl_number > len(line_onsets): - print("WARNING : failed to detect enough onsets in line %d (%d, %d)" % (line_index, line[0][0], line[-1][0])) - filtered_onsets += line_onsets[0:syl_number] - - onsets = np.array(sorted(filtered_onsets)) - - -""" - if word_index > 0: - word_start = max(word_align[word_index][0] - 5, line[0][0], previous_onset+1) - else: - word_start = line[0][0] - if word_index < len(words) - 1 and syl_index < len(line) - 2: - word_end = min(line[-1][0], word_align[word_index + 1][0] - 5) - else: - word_end = line[-1][0] - - word_onsets = [o for o in onsets if (o >= word_start and o <= word_end)] - word_onsets.sort(reverse=True, key=(lambda x: activation_smoothed[x])) - if word_syl_count > len(word_onsets): - print("WARNING : failed to detect enough onsets in word %s (%d, %d)" % (word_tmp, word_start, word_end)) - filtered_onsets += word_onsets[0:word_syl_count] - print(word_onsets[0:word_syl_count]) - previous_onset = max(word_onsets[0:word_syl_count] + [0]) -""" - -# Backtrack onsets to closest earlier local minimum -if backtrack: - backtrack_max_frames = 50 - for i in range(len(onsets)): - initial_onset = onsets[i] - while(activation_smoothed[onsets[i] - 1] < activation_smoothed[onsets[i]] and onsets[i] > initial_onset - backtrack_max_frames): - onsets[i] -= 1 - -#print(onsets/100) -print(words_onsets/100) - -if reference_syls: - reference_onsets = [syl[0]+8 for line in reference_syls for syl in line[:-1]] - -fig, axs = plt.subplots(nrows=2, sharex=True) -axs[0].imshow(log_spec.T, origin='lower', aspect='auto') -if reference_syls: - axs[0].vlines(reference_onsets, 0, 140, colors='red') -axs[0].plot((pitch_values/np.max(pitch_values))*140, color='yellow') -axs[1].plot(mask_function) -#axs[1].plot(cnn_smoothed) -#axs[1].plot(spectral_function, color='green') -axs[1].plot(activation_smoothed, color='orange') -axs[1].vlines(onsets, 0, 2, colors='red') -axs[1].vlines(words_onsets, 0, 3, colors='m') -axs[1].hlines([max(mask_threshold, 0), activation_threshold], 0, onsets[-1]+100, colors='black') - -#bins = np.arange(0, 1, 0.02) -#hist, hist_axs = plt.subplots(nrows=1) -#hist_axs.hist(mask_function, bins=bins) - -plt.show() \ No newline at end of file diff --git a/preprocess_media.sh b/preprocess_media.sh index a88ae744d830cadc1fdf2122e5c545180eaa4fb4..ec05203bec0a6563ab7adc7298b08c0fac3d2c4a 100755 --- a/preprocess_media.sh +++ b/preprocess_media.sh @@ -1,6 +1,20 @@ - - - +#!/bin/bash + + 
+########################################################################################################## +# +# COMMAND : preprocess_media.sh +# +# AUTHOR : Sting +# +# DESCRIPTION : CLI tool to batch extract ASS lyrics and vocals from a video folder +# +# USE : ./preprocess_media.sh input_folder output_folder +# +# REQUIREMENTS : FFMPEG, Demucs, extractAss and extractWav +# +# +########################################################################################################## USAGE_MESSAGE="usage : $0 video_folder train_folder" diff --git a/requirements.txt b/requirements.txt index 04299146d72785e9cf25e747f7bcea65dd11d05c..1574a1b402f57d64880c618eb05472c3fd8879fc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,9 @@ librosa demucs -chainer soundfile -sklearn matplotlib numpy -tqdm -scipy -cython -mido -git+https://github.com/CPJKU/madmom.git +madmom@git+https://github.com/CPJKU/madmom.git praat-parselmouth future musdb diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..c71a27b851a0ddfc76ffbe0efda851c06db10851 --- /dev/null +++ b/setup.py @@ -0,0 +1,69 @@ + +from pathlib import Path +from setuptools import setup, find_packages +import atexit +from setuptools.command.install import install +import subprocess +import shlex +import glob + + +NAME = 'autokara' +DESCRIPTION = 'Automatic karaoke timing' + +URL = 'https://git.iiens.net/bakaclub/autokara' +AUTHOR = 'Loïc "Sting" Allègre' +REQUIRES_PYTHON = '>=3.8.0' + +HERE = Path(__file__).parent + +# Get version without explicitly loading the module. +for line in open('autokara/__init__.py'): + line = line.strip() + if '__version__' in line: + context = {} + exec(line, context) + VERSION = context['__version__'] + + +def load_requirements(name): + required = [i.strip() for i in open(HERE / name)] + required = [i for i in required if not i.startswith('#')] + print(required) + return required + + +REQUIRED = load_requirements('requirements.txt') +ALL_REQUIRED = load_requirements('requirements.txt') + + +setup( + name=NAME, + version=VERSION, + description=DESCRIPTION, + author=AUTHOR, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages(), + install_requires=REQUIRED, + include_package_data=True, + entry_points={ + 'console_scripts': ['autokara=autokara.autokara:main', + 'autokara-plot=autokara.plot_syls:main', + 'autokara-gen-lang=autokara.update_lang_db:main' + ], + }, + scripts=[ + 'autokara/extractAss.sh', + 'autokara/extractWav.sh', + 'preprocess_media.sh' + ], + license='MIT License', + classifiers=[ + # Trove classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + 'License :: OSI Approved :: MIT License', + 'Topic :: Multimedia :: Sound/Audio', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + ], +)
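For reference, the configuration precedence this patch describes in the README (the packaged `autokara/default.conf` is read first, then `~/.config/autokara/autokara.conf` overrides it) is implemented in `autokara/autokara.py` as two layered `configparser` reads. A minimal sketch of that behaviour, using in-memory strings instead of the two files and an invented user override value purely for illustration:

```python
from configparser import ConfigParser

# Abridged copy of the packaged default.conf
default_conf = """
[Segment]
model = MTL
bdr = true
cuda = false
syl_delay = -4
"""

# Hypothetical ~/.config/autokara/autokara.conf overriding a single key
user_conf = """
[Segment]
cuda = true
"""

config = ConfigParser()
config.read_string(default_conf)  # defaults are loaded first...
config.read_string(user_conf)     # ...then user keys override them

print(config['Segment'].getboolean('cuda'))  # True  (taken from the user file)
print(config['Segment']['model'])            # MTL   (falls back to the default)
```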
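Because the `autokara` console script declared in `setup.py` maps to `autokara.autokara.main(opts=None)`, which forwards `opts` to `argparse`, the same command can also be driven from Python by passing the CLI arguments as a list. A small usage sketch (file names are placeholders, and the call needs the same runtime dependencies as the CLI: MKVToolnix/FFmpeg, demucs, the models, ...):

```python
from autokara.autokara import main

# Equivalent to running: autokara video.mkv lyrics.ass -o output.ass
main(["video.mkv", "lyrics.ass", "-o", "output.ass"])
```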