From b0f73d5d81a98f82ea3be3aec25761ad19e99e9d Mon Sep 17 00:00:00 2001
From: Sting <loic.allegre@ensiie.fr>
Date: Tue, 25 Jul 2023 21:09:48 +0200
Subject: [PATCH] Treat uppercase as English + fix overly short syls at end of
 lines

---
 autokara/autokara.py                        |  3 ++-
 autokara/autosyl/LyricsAlignment/utils.py   | 17 ++++++++++----
 autokara/autosyl/LyricsAlignment/wrapper.py | 10 ++++----
 autokara/autosyl/segment.py                 | 26 +++++++++++++++++----
 autokara/default.conf                       |  2 ++
 autokara/plot_syls.py                       |  4 ++--
 6 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/autokara/autokara.py b/autokara/autokara.py
index ffb7eda..6077478 100644
--- a/autokara/autokara.py
+++ b/autokara/autokara.py
@@ -39,7 +39,8 @@ def main(opts=None):
         'model': config['Segment']['model'],
         'bdr': config['Segment'].getboolean('bdr'),
         'cuda': config['Segment'].getboolean('cuda'),
-        'syl_delay': config['Segment'].getint('syl_delay')
+        'syl_delay': config['Segment'].getint('syl_delay'),
+        'uppercase_english': config['Segment'].getboolean('uppercase_as_english')
     }
 
 
diff --git a/autokara/autosyl/LyricsAlignment/utils.py b/autokara/autosyl/LyricsAlignment/utils.py
index ec8aab8..d6f996c 100644
--- a/autokara/autosyl/LyricsAlignment/utils.py
+++ b/autokara/autosyl/LyricsAlignment/utils.py
@@ -7,6 +7,7 @@ import string
 import warnings
 import g2p_en
 from g2p import make_g2p
+import re
 
 
 
@@ -110,11 +111,16 @@ def load_lyrics(lyrics_file):
 def write_wav(path, audio, sr):
     soundfile.write(path, audio.T, sr, "PCM_16")
 
-def gen_phone_gt(words, raw_lines, language="jp"):
+def gen_phone_gt(words, raw_lines, language="jp", uppercase_as_english=True):
 
-    print(f"Translating lyrics to phonemes, language chosen : {language:s}")
+    print(f"Translating lyrics to phonemes, language chosen : {language:s}, uppercase as English : {str(uppercase_as_english):s}")
     g2p = G2p_Wrapper(language=language)
+    if uppercase_as_english:
+        g2p_uppercase = G2p_Wrapper(language="en")
+    else:
+        g2p_uppercase = g2p
 
+    regex_uppercase = re.compile('[^a-z]')
 
     # helper function
     def getsubidx(x, y):  # find y in x
@@ -125,7 +131,10 @@ def gen_phone_gt(words, raw_lines, language="jp"):
     words_p = []
     lyrics_p = []
     for word in words:
-        out = g2p(word)
+        if regex_uppercase.sub('', word) == '':   # word is all uppercase or symbols
+            out = g2p_uppercase(word.lower())
+        else:
+            out = g2p(word.lower())
         out = [phone if phone[-1] not in string.digits else phone[:-1] for phone in out]
         words_p.append(out)
         if len(lyrics_p) > 0:
@@ -146,7 +155,7 @@ def gen_phone_gt(words, raw_lines, language="jp"):
         last_end = 0
         for i in range(len(raw_lines)):
             line = []
-            line_phone = [g2p(word) for word in raw_lines[i].split()]
+            line_phone = [(g2p_uppercase(word.lower()) if regex_uppercase.sub('', word) == '' else g2p(word.lower())) for word in raw_lines[i].split()]
             for l in line_phone:
                 line += l + [' ']
             line = line[:-1]
diff --git a/autokara/autosyl/LyricsAlignment/wrapper.py b/autokara/autosyl/LyricsAlignment/wrapper.py
index f308ef9..51cf853 100644
--- a/autokara/autosyl/LyricsAlignment/wrapper.py
+++ b/autokara/autosyl/LyricsAlignment/wrapper.py
@@ -10,10 +10,10 @@ from .model import train_audio_transforms, AcousticModel, BoundaryDetection
 
 np.random.seed(7)
 
-def preprocess_from_file(audio_file, lyrics_file, word_file=None, language="jp"):
+def preprocess_from_file(audio_file, lyrics_file, word_file=None, language="jp", uppercase_as_english=True):
     y, sr = preprocess_audio(audio_file)
 
-    words, lyrics_p, idx_word_p, idx_line_p = preprocess_lyrics(lyrics_file, word_file, language=language)
+    words, lyrics_p, idx_word_p, idx_line_p = preprocess_lyrics(lyrics_file, word_file, language=language, uppercase_as_english=uppercase_as_english)
 
     return y, words, lyrics_p, idx_word_p, idx_line_p
 
@@ -143,7 +143,7 @@ def preprocess_audio(audio_file, sr=22050):
 
     return y, curr_sr
 
-def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"):
+def preprocess_lyrics(lyrics_lines, word_file=None, language="jp", uppercase_as_english=True):
     from string import ascii_lowercase
     d = {ascii_lowercase[i]: i for i in range(26)}
     d["'"] = 26
@@ -155,7 +155,7 @@ def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"):
     #    raw_lines = f.read().splitlines()
     raw_lines = lyrics_lines
 
-    raw_lines = ["".join([c for c in line.lower() if c in d.keys()]).strip() for line in raw_lines]
+    raw_lines = ["".join([c for c in line if c.lower() in d.keys()]).strip() for line in raw_lines]
     raw_lines = [" ".join(line.split()) for line in raw_lines if len(line) > 0]
     # concat
     full_lyrics = " ".join(raw_lines)
@@ -166,7 +166,7 @@ def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"):
     else:
         words_lines = full_lyrics.split()
 
-    lyrics_p, words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines, language=(language if language else "jp"))
+    lyrics_p, words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines, language=(language if language else "jp"), uppercase_as_english=uppercase_as_english)
 
     return words_lines, lyrics_p, idx_word_p, idx_line_p
 
diff --git a/autokara/autosyl/segment.py b/autokara/autosyl/segment.py
index 4615507..0376de6 100644
--- a/autokara/autosyl/segment.py
+++ b/autokara/autosyl/segment.py
@@ -26,6 +26,7 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur
     checkpoint_folder = f"{str(Path(__file__).parent):s}/LyricsAlignment/checkpoints"
     language = language
 
+    uppercase_as_english = config['uppercase_english']
 
     lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
     #print(lyrics_lines)
@@ -38,7 +39,7 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur
     # lyrics_p:     phoneme sequence of the target lyrics
     # idx_word_p:   indices of word start in lyrics_p
     # idx_line_p:   indices of line start in lyrics_p
-    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file, language)
+    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file, language, uppercase_as_english=uppercase_as_english)
     if verbose:
         print(lyrics_p)
 
@@ -64,8 +65,20 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur
 
     magnitude = np.max(log_spec[:,:100], axis=1)
 
-    magnitude_threshold = 0.75
-    magnitude_start = 2
+    parsel = parselmouth.Sound(sig)
+
+    pitch = parsel.to_pitch()
+    pitch_values = pitch.selected_array['frequency']
+
+    pad_before = round(pitch.xs()[0]*100)
+    pad_after = len(magnitude) - len(pitch_values) - pad_before
+
+    pitch_values = np.pad(pitch_values, (pad_before, pad_after), 'constant', constant_values=(0,0))
+
+    mask = pitch_values
+
+    mask_threshold = 10
+    mask_start = 2
 
     if reference_syls:
         syls = []
@@ -73,9 +86,12 @@ def segment(songfile, config, reference_syls=None,syls_per_line=10, last_syl_dur
         for line in reference_syls:
             #print(onset_index, " : ", line)
             l = [[onsets[onset_index + i], line[i][1]] for i in range(len(line)-1)]
-            next_drop = words_onsets[onset_index + len(line) - 2] + magnitude_start
-            while magnitude[next_drop] > magnitude_threshold:
+            next_drop = words_onsets[onset_index + len(line) - 2] + mask_start
+            after_high = False
+            while mask[next_drop] > mask_threshold or not after_high:
                 next_drop += 1
+                if mask[next_drop] > mask_threshold:
+                    after_high = True
             l.append([min(word_align[onset_index + (len(line) - 2)][1]/100, next_drop/100), ''])
             syls.append(l)
             onset_index += (len(line) - 1)
diff --git a/autokara/default.conf b/autokara/default.conf
index 82d914c..4d48d0e 100644
--- a/autokara/default.conf
+++ b/autokara/default.conf
@@ -18,3 +18,5 @@ bdr = true
 cuda = false
 # Default delay applied to detected syls, in centiseconds
 syl_delay = -4
+# Whether to treat words written entirely in uppercase as English. If false, the song language is used for every word
+uppercase_as_english = true
\ No newline at end of file
diff --git a/autokara/plot_syls.py b/autokara/plot_syls.py
index 9383639..b7c9ed4 100644
--- a/autokara/plot_syls.py
+++ b/autokara/plot_syls.py
@@ -6,6 +6,7 @@ import matplotlib.pyplot as plt
 import scipy.signal as sg
 import parselmouth
 import argparse
+from pathlib import Path
 
 from .autosyl.assUtils import getSyls, timeToDate, dateToTime 
 from .autosyl.LyricsAlignment.wrapper import align, preprocess_from_file
@@ -43,8 +44,7 @@ def main(opts=None):
     word_file = None                           # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it
     method = "MTL_BDR"                             # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
     cuda=True                                 # set True if you have access to a GPU
-    checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
-
+    checkpoint_folder = f"{str(Path(__file__).parent):s}/autosyl/LyricsAlignment/checkpoints"
     pred_file = "./MTL.csv"                    # saved alignment results, "(float) start_time, (float) end_time, (string) word"
 
 
-- 
GitLab