diff --git a/autokara/autokara.py b/autokara/autokara.py
index ffb7edace7fad76076009591ed2a71cc06cc01b6..6077478b26f8f00229a6fdc88849705184440dd4 100644
--- a/autokara/autokara.py
+++ b/autokara/autokara.py
@@ -39,7 +39,8 @@ def main(opts=None):
         'model': config['Segment']['model'],
         'bdr': config['Segment'].getboolean('bdr'),
         'cuda': config['Segment'].getboolean('cuda'),
-        'syl_delay': config['Segment'].getint('syl_delay')
+        'syl_delay': config['Segment'].getint('syl_delay'),
+        'uppercase_english': config['Segment'].getboolean('uppercase_as_english')
     }
diff --git a/autokara/autosyl/LyricsAlignment/utils.py b/autokara/autosyl/LyricsAlignment/utils.py
index ec8aab853dbd5f016f5e5e9d55d020c7f93cdd7a..d6f996cc86a18f583585ce9655d4387943a4bbb7 100644
--- a/autokara/autosyl/LyricsAlignment/utils.py
+++ b/autokara/autosyl/LyricsAlignment/utils.py
@@ -7,6 +7,7 @@ import string
 import warnings
 import g2p_en
 from g2p import make_g2p
+import re
@@ -110,11 +111,16 @@ def load_lyrics(lyrics_file):
 def write_wav(path, audio, sr):
     soundfile.write(path, audio.T, sr, "PCM_16")
 
-def gen_phone_gt(words, raw_lines, language="jp"):
+def gen_phone_gt(words, raw_lines, language="jp", uppercase_as_english=True):
 
-    print(f"Translating lyrics to phonemes, language chosen : {language:s}")
+    print(f"Translating lyrics to phonemes, language chosen : {language:s}, uppercase as English : {str(uppercase_as_english):s}")
 
     g2p = G2p_Wrapper(language=language)
+    if uppercase_as_english:
+        g2p_uppercase = G2p_Wrapper(language="en")
+    else:
+        g2p_uppercase = g2p
+    regex_uppercase = re.compile('[^a-z]')
 
     # helper function
     def getsubidx(x, y): # find y in x
@@ -125,7 +131,10 @@ def gen_phone_gt(words, raw_lines, language="jp"):
     words_p = []
     lyrics_p = []
     for word in words:
-        out = g2p(word)
+        if regex_uppercase.sub('', word) == '': # word is all uppercase or symbols
+            out = g2p_uppercase(word.lower())
+        else:
+            out = g2p(word.lower())
         out = [phone if phone[-1] not in string.digits else phone[:-1] for phone in out]
         words_p.append(out)
         if len(lyrics_p) > 0:
@@ -146,7 +155,7 @@ def gen_phone_gt(words, raw_lines, language="jp"):
     last_end = 0
     for i in range(len(raw_lines)):
         line = []
-        line_phone = [g2p(word) for word in raw_lines[i].split()]
+        line_phone = [(g2p_uppercase(word.lower()) if regex_uppercase.sub('', word) == '' else g2p(word.lower())) for word in raw_lines[i].split()]
         for l in line_phone:
             line += l + [' ']
         line = line[:-1]
diff --git a/autokara/autosyl/LyricsAlignment/wrapper.py b/autokara/autosyl/LyricsAlignment/wrapper.py
index f308ef9872955f4fdf4d45ce384c814fbee8664d..51cf853c1886c5def0ed49ca2d1341f67d05b3a1 100644
--- a/autokara/autosyl/LyricsAlignment/wrapper.py
+++ b/autokara/autosyl/LyricsAlignment/wrapper.py
@@ -10,10 +10,10 @@ from .model import train_audio_transforms, AcousticModel, BoundaryDetection
 
 np.random.seed(7)
 
-def preprocess_from_file(audio_file, lyrics_file, word_file=None, language="jp"):
+def preprocess_from_file(audio_file, lyrics_file, word_file=None, language="jp", uppercase_as_english=True):
     y, sr = preprocess_audio(audio_file)
 
-    words, lyrics_p, idx_word_p, idx_line_p = preprocess_lyrics(lyrics_file, word_file, language=language)
+    words, lyrics_p, idx_word_p, idx_line_p = preprocess_lyrics(lyrics_file, word_file, language=language, uppercase_as_english=uppercase_as_english)
 
     return y, words, lyrics_p, idx_word_p, idx_line_p
 
@@ -143,7 +143,7 @@ def preprocess_audio(audio_file, sr=22050):
     return y, curr_sr
 
 
-def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"):
+def preprocess_lyrics(lyrics_lines, word_file=None, language="jp", uppercase_as_english=True):
     from string import ascii_lowercase
     d = {ascii_lowercase[i]: i for i in range(26)}
     d["'"] = 26
@@ -155,7 +155,7 @@ def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"):
     # raw_lines = f.read().splitlines()
     raw_lines = lyrics_lines
 
-    raw_lines = ["".join([c for c in line.lower() if c in d.keys()]).strip() for line in raw_lines]
+    raw_lines = ["".join([c for c in line if c.lower() in d.keys()]).strip() for line in raw_lines]
     raw_lines = [" ".join(line.split()) for line in raw_lines if len(line) > 0]
     # concat
     full_lyrics = " ".join(raw_lines)
@@ -166,7 +166,7 @@ def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"):
     else:
         words_lines = full_lyrics.split()
 
-    lyrics_p, words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines, language=(language if language else "jp"))
+    lyrics_p, words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines, language=(language if language else "jp"), uppercase_as_english=uppercase_as_english)
 
     return words_lines, lyrics_p, idx_word_p, idx_line_p
 
diff --git a/autokara/autosyl/segment.py b/autokara/autosyl/segment.py
index 46155073b8c10738a55de11981295909f38c91e9..0376de6c3cb83eca56f7d269d184692e1e4c0b4f 100644
--- a/autokara/autosyl/segment.py
+++ b/autokara/autosyl/segment.py
@@ -26,6 +26,7 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur
     checkpoint_folder = f"{str(Path(__file__).parent):s}/LyricsAlignment/checkpoints"
     language = language
+    uppercase_as_english = config['uppercase_english']
 
     lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
     #print(lyrics_lines)
 
@@ -38,7 +39,7 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur
     # lyrics_p: phoneme sequence of the target lyrics
     # idx_word_p: indices of word start in lyrics_p
     # idx_line_p: indices of line start in lyrics_p
-    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file, language)
+    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file, language, uppercase_as_english=uppercase_as_english)
 
     if verbose:
         print(lyrics_p)
@@ -64,8 +65,20 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur
 
     magnitude = np.max(log_spec[:,:100], axis=1)
 
-    magnitude_threshold = 0.75
-    magnitude_start = 2
+    parsel = parselmouth.Sound(sig)
+
+    pitch = parsel.to_pitch()
+    pitch_values = pitch.selected_array['frequency']
+
+    pad_before = round(pitch.xs()[0]*100)
+    pad_after = len(magnitude) - len(pitch_values) - pad_before
+
+    pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0))
+
+    mask = pitch_values
+
+    mask_threshold = 10
+    mask_start = 2
 
     if reference_syls:
         syls = []
@@ -73,9 +86,12 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur
         for line in reference_syls:
             #print(onset_index, " : ", line)
             l = [[onsets[onset_index + i], line[i][1]] for i in range(len(line)-1)]
-            next_drop = words_onsets[onset_index + len(line) - 2] + magnitude_start
-            while magnitude[next_drop] > magnitude_threshold:
+            next_drop = words_onsets[onset_index + len(line) - 2] + mask_start
+            after_high = False
+            while mask[next_drop] > mask_threshold or not after_high:
                 next_drop += 1
+                if mask[next_drop] > mask_threshold:
+                    after_high = True
             l.append([min(word_align[onset_index + (len(line) - 2)][1]/100, next_drop/100), ''])
             syls.append(l)
             onset_index += (len(line) - 1)
diff --git a/autokara/default.conf b/autokara/default.conf
index 82d914ce4a69e2792135de3cb620ee67f36e604e..4d48d0e3f361374a99fab8bfa5295c60ac3370a9 100644
--- a/autokara/default.conf
+++ b/autokara/default.conf
@@ -18,3 +18,5 @@ bdr = true
 cuda = false
 # Default delay applied to detected syls, in centiseconds
 syl_delay = -4
+# Whether to treat uppercase words as English. If false, use song language everywhere
+uppercase_as_english = true
\ No newline at end of file
diff --git a/autokara/plot_syls.py b/autokara/plot_syls.py
index 9383639fc6cf3ab2a285eeaf9f8518ae8c53f9b5..b7c9ed46276c5601e2fec428da55cb411890a941 100644
--- a/autokara/plot_syls.py
+++ b/autokara/plot_syls.py
@@ -6,6 +6,7 @@ import matplotlib.pyplot as plt
 import scipy.signal as sg
 import parselmouth
 import argparse
+from pathlib import Path
 
 from .autosyl.assUtils import getSyls, timeToDate, dateToTime
 from .autosyl.LyricsAlignment.wrapper import align, preprocess_from_file
@@ -43,8 +44,7 @@ def main(opts=None):
     word_file = None                        # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it
     method = "MTL_BDR"                      # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
     cuda=True                               # set True if you have access to a GPU
-    checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
-
+    checkpoint_folder = f"{str(Path(__file__).parent):s}/autosyl/LyricsAlignment/checkpoints"
     pred_file = "./MTL.csv"                 # saved alignment results, "(float) start_time, (float) end_time, (string) word"