From b0f73d5d81a98f82ea3be3aec25761ad19e99e9d Mon Sep 17 00:00:00 2001 From: Sting <loic.allegre@ensiie.fr> Date: Tue, 25 Jul 2023 21:09:48 +0200 Subject: [PATCH] Treat uppercase as English + fix overly short syls at end of lines --- autokara/autokara.py | 3 ++- autokara/autosyl/LyricsAlignment/utils.py | 17 ++++++++++---- autokara/autosyl/LyricsAlignment/wrapper.py | 10 ++++---- autokara/autosyl/segment.py | 26 +++++++++++++++++---- autokara/default.conf | 2 ++ autokara/plot_syls.py | 4 ++-- 6 files changed, 45 insertions(+), 17 deletions(-) diff --git a/autokara/autokara.py b/autokara/autokara.py index ffb7eda..6077478 100644 --- a/autokara/autokara.py +++ b/autokara/autokara.py @@ -39,7 +39,8 @@ def main(opts=None): 'model': config['Segment']['model'], 'bdr': config['Segment'].getboolean('bdr'), 'cuda': config['Segment'].getboolean('cuda'), - 'syl_delay': config['Segment'].getint('syl_delay') + 'syl_delay': config['Segment'].getint('syl_delay'), + 'uppercase_english': config['Segment'].getboolean('uppercase_as_english') } diff --git a/autokara/autosyl/LyricsAlignment/utils.py b/autokara/autosyl/LyricsAlignment/utils.py index ec8aab8..d6f996c 100644 --- a/autokara/autosyl/LyricsAlignment/utils.py +++ b/autokara/autosyl/LyricsAlignment/utils.py @@ -7,6 +7,7 @@ import string import warnings import g2p_en from g2p import make_g2p +import re @@ -110,11 +111,16 @@ def load_lyrics(lyrics_file): def write_wav(path, audio, sr): soundfile.write(path, audio.T, sr, "PCM_16") -def gen_phone_gt(words, raw_lines, language="jp"): +def gen_phone_gt(words, raw_lines, language="jp", uppercase_as_english=True): - print(f"Translating lyrics to phonemes, language chosen : {language:s}") + print(f"Translating lyrics to phonemes, language chosen : {language:s}, uppercase as English : {str(uppercase_as_english):s}") g2p = G2p_Wrapper(language=language) + if uppercase_as_english: + g2p_uppercase = G2p_Wrapper(language="en") + else: + g2p_uppercase = g2p + regex_uppercase = re.compile('[^a-z]') # helper function def getsubidx(x, y): # find y in x @@ -125,7 +131,10 @@ def gen_phone_gt(words, raw_lines, language="jp"): words_p = [] lyrics_p = [] for word in words: - out = g2p(word) + if regex_uppercase.sub('', word) == '': # word is all uppercase or symbols + out = g2p_uppercase(word.lower()) + else: + out = g2p(word.lower()) out = [phone if phone[-1] not in string.digits else phone[:-1] for phone in out] words_p.append(out) if len(lyrics_p) > 0: @@ -146,7 +155,7 @@ def gen_phone_gt(words, raw_lines, language="jp"): last_end = 0 for i in range(len(raw_lines)): line = [] - line_phone = [g2p(word) for word in raw_lines[i].split()] + line_phone = [(g2p_uppercase(word.lower()) if regex_uppercase.sub('', word) == '' else g2p(word.lower())) for word in raw_lines[i].split()] for l in line_phone: line += l + [' '] line = line[:-1] diff --git a/autokara/autosyl/LyricsAlignment/wrapper.py b/autokara/autosyl/LyricsAlignment/wrapper.py index f308ef9..51cf853 100644 --- a/autokara/autosyl/LyricsAlignment/wrapper.py +++ b/autokara/autosyl/LyricsAlignment/wrapper.py @@ -10,10 +10,10 @@ from .model import train_audio_transforms, AcousticModel, BoundaryDetection np.random.seed(7) -def preprocess_from_file(audio_file, lyrics_file, word_file=None, language="jp"): +def preprocess_from_file(audio_file, lyrics_file, word_file=None, language="jp", uppercase_as_english=True): y, sr = preprocess_audio(audio_file) - words, lyrics_p, idx_word_p, idx_line_p = preprocess_lyrics(lyrics_file, word_file, language=language) + words, lyrics_p, idx_word_p, idx_line_p = preprocess_lyrics(lyrics_file, word_file, language=language, uppercase_as_english=uppercase_as_english) return y, words, lyrics_p, idx_word_p, idx_line_p @@ -143,7 +143,7 @@ def preprocess_audio(audio_file, sr=22050): return y, curr_sr -def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"): +def preprocess_lyrics(lyrics_lines, word_file=None, language="jp", uppercase_as_english=True): from string import ascii_lowercase d = {ascii_lowercase[i]: i for i in range(26)} d["'"] = 26 @@ -155,7 +155,7 @@ def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"): # raw_lines = f.read().splitlines() raw_lines = lyrics_lines - raw_lines = ["".join([c for c in line.lower() if c in d.keys()]).strip() for line in raw_lines] + raw_lines = ["".join([c for c in line if c.lower() in d.keys()]).strip() for line in raw_lines] raw_lines = [" ".join(line.split()) for line in raw_lines if len(line) > 0] # concat full_lyrics = " ".join(raw_lines) @@ -166,7 +166,7 @@ def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"): else: words_lines = full_lyrics.split() - lyrics_p, words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines, language=(language if language else "jp")) + lyrics_p, words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines, language=(language if language else "jp"), uppercase_as_english=uppercase_as_english) return words_lines, lyrics_p, idx_word_p, idx_line_p diff --git a/autokara/autosyl/segment.py b/autokara/autosyl/segment.py index 4615507..0376de6 100644 --- a/autokara/autosyl/segment.py +++ b/autokara/autosyl/segment.py @@ -26,6 +26,7 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur checkpoint_folder = f"{str(Path(__file__).parent):s}/LyricsAlignment/checkpoints" language = language + uppercase_as_english = config['uppercase_english'] lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls] #print(lyrics_lines) @@ -38,7 +39,7 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur # lyrics_p: phoneme sequence of the target lyrics # idx_word_p: indices of word start in lyrics_p # idx_line_p: indices of line start in lyrics_p - audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file, language) + audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file, language, uppercase_as_english=uppercase_as_english) if verbose: print(lyrics_p) @@ -64,8 +65,20 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur magnitude = np.max(log_spec[:,:100], axis=1) - magnitude_threshold = 0.75 - magnitude_start = 2 + parsel = parselmouth.Sound(sig) + + pitch = parsel.to_pitch() + pitch_values = pitch.selected_array['frequency'] + + pad_before = round(pitch.xs()[0]*100) + pad_after = len(magnitude) - len(pitch_values) - pad_before + + pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0)) + + mask = pitch_values + + mask_threshold = 10 + mask_start = 2 if reference_syls: syls = [] @@ -73,9 +86,12 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur for line in reference_syls: #print(onset_index, " : ", line) l = [[onsets[onset_index + i], line[i][1]] for i in range(len(line)-1)] - next_drop = words_onsets[onset_index + len(line) - 2] + magnitude_start - while magnitude[next_drop] > magnitude_threshold: + next_drop = words_onsets[onset_index + len(line) - 2] + mask_start + after_high = False + while mask[next_drop] > mask_threshold or not after_high: next_drop += 1 + if mask[next_drop] > mask_threshold: + after_high = True l.append([min(word_align[onset_index + (len(line) - 2)][1]/100, next_drop/100), '']) syls.append(l) onset_index += (len(line) - 1) diff --git a/autokara/default.conf b/autokara/default.conf index 82d914c..4d48d0e 100644 --- a/autokara/default.conf +++ b/autokara/default.conf @@ -18,3 +18,5 @@ bdr = true cuda = false # Default delay applied to detected syls, in centiseconds syl_delay = -4 +# Whether to treat uppercase words as English. If false, use song language everywhere +uppercase_as_english = true \ No newline at end of file diff --git a/autokara/plot_syls.py b/autokara/plot_syls.py index 9383639..b7c9ed4 100644 --- a/autokara/plot_syls.py +++ b/autokara/plot_syls.py @@ -6,6 +6,7 @@ import matplotlib.pyplot as plt import scipy.signal as sg import parselmouth import argparse +from pathlib import Path from .autosyl.assUtils import getSyls, timeToDate, dateToTime from .autosyl.LyricsAlignment.wrapper import align, preprocess_from_file @@ -43,8 +44,7 @@ def main(opts=None): word_file = None # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it method = "MTL_BDR" # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR" cuda=True # set True if you have access to a GPU - checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints" - + checkpoint_folder = f"{str(Path(__file__).parent):s}/autosyl/LyricsAlignment/checkpoints" pred_file = "./MTL.csv" # saved alignment results, "(float) start_time, (float) end_time, (string) word" -- GitLab