From 98cfa51a5c9091aa7e722e605c93ed9613da5af1 Mon Sep 17 00:00:00 2001
From: Sting <loic.allegre@ensiie.fr>
Date: Fri, 29 Sep 2023 14:36:50 +0200
Subject: [PATCH] Add French support

---
 autokara/__init__.py                        |  2 +-
 autokara/autosyl/LyricsAlignment/utils.py   |  4 +++-
 autokara/autosyl/LyricsAlignment/wrapper.py | 15 +++++++++------
 autokara/autosyl/assUtils.py                |  2 +-
 autokara/update_lang_db.py                  |  9 ++++++++-
 5 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/autokara/__init__.py b/autokara/__init__.py
index b31ecc4..90276fd 100644
--- a/autokara/__init__.py
+++ b/autokara/__init__.py
@@ -1,2 +1,2 @@
 
-__version__ = "0.1.0"
\ No newline at end of file
+__version__ = "0.2.0"
\ No newline at end of file
diff --git a/autokara/autosyl/LyricsAlignment/utils.py b/autokara/autosyl/LyricsAlignment/utils.py
index d6f996c..09f4703 100644
--- a/autokara/autosyl/LyricsAlignment/utils.py
+++ b/autokara/autosyl/LyricsAlignment/utils.py
@@ -22,6 +22,8 @@ class G2p_Wrapper():
     def __init__(self, language="jp"):
         if language == "en":
             self.transducer = g2p_en.G2p()
+        elif language == "fr":
+            self.transducer = make_g2p('fra', 'eng-arpabet')
         else:                                                   # Only Japanese Romaji for now...
             self.transducer = make_g2p('rji', 'rji-eng-arpa')
 
@@ -120,7 +122,7 @@ def gen_phone_gt(words, raw_lines, language="jp", uppercase_as_english=True):
     else:
         g2p_uppercase = g2p
 
-    regex_uppercase = re.compile('[^a-z]')
+    regex_uppercase = re.compile('[^a-zà-ÿ]')
 
     # helper function
     def getsubidx(x, y):  # find y in x
diff --git a/autokara/autosyl/LyricsAlignment/wrapper.py b/autokara/autosyl/LyricsAlignment/wrapper.py
index 51cf853..d936e22 100644
--- a/autokara/autosyl/LyricsAlignment/wrapper.py
+++ b/autokara/autosyl/LyricsAlignment/wrapper.py
@@ -4,6 +4,7 @@ from time import time
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import re
 
 from . import utils
 from .model import train_audio_transforms, AcousticModel, BoundaryDetection
@@ -144,18 +145,20 @@ def preprocess_audio(audio_file, sr=22050):
     return y, curr_sr
 
 def preprocess_lyrics(lyrics_lines, word_file=None, language="jp", uppercase_as_english=True):
-    from string import ascii_lowercase
-    d = {ascii_lowercase[i]: i for i in range(26)}
-    d["'"] = 26
-    d[" "] = 27
-    d["~"] = 28
+    # Characters kept from the raw lyrics (anything else is filtered out
+    # below): ASCII letters, the Latin-1 range À-ÿ that holds the French
+    # accented letters, apostrophe, space and tilde.
+    # NOTE(review): À-ÿ also contains × and ÷ and lacks œ/Œ/Ÿ - confirm intent.
+    relevant_chars = re.compile("[a-zA-ZÀ-ÿ' ~]")
 
     # process raw
     #with open(lyrics_file, 'r') as f:
     #    raw_lines = f.read().splitlines()
     raw_lines = lyrics_lines
 
-    raw_lines = ["".join([c for c in line if c.lower() in d.keys()]).strip() for line in raw_lines]
+    raw_lines = [
+        "".join(relevant_chars.findall(line)).strip() for line in raw_lines
+    ]
     raw_lines = [" ".join(line.split()) for line in raw_lines if len(line) > 0]
     # concat
     full_lyrics = " ".join(raw_lines)
diff --git a/autokara/autosyl/assUtils.py b/autokara/autosyl/assUtils.py
index 4b819a1..0362147 100644
--- a/autokara/autosyl/assUtils.py
+++ b/autokara/autosyl/assUtils.py
@@ -28,7 +28,7 @@ def getSyls(ass_file):
     META = []
     with open(ass_file, 'r') as f:
         CONTENT = f.read()
-        strip_regex = re.compile('[^a-zA-Z]')
+        strip_regex = re.compile('[^a-zA-ZÀ-ÿ]')
         LINES_KARA = re.compile(r"(?:Comment|Dialogue):.*(\d+:\d{2}:\d{2}.\d{2}),(\d+:\d{2}:\d{2}.\d{2}),([^,]*),([^,]*),(\d+),(\d+),(\d+),(?:(?!fx|template|code)\w)*,(.*)\n")
         RGX_TAGS = re.compile(r"\{\\k(\d+)\}([^\{\n\r]*)")
         for line in LINES_KARA.findall(CONTENT):
diff --git a/autokara/update_lang_db.py b/autokara/update_lang_db.py
index 6e0c63e..88d0970 100644
--- a/autokara/update_lang_db.py
+++ b/autokara/update_lang_db.py
@@ -16,7 +16,14 @@ def main():
     for map in mappings:
         subprocess.check_call(shlex.split(f'cp -r {map:s} {str(g2p_base):s}/mappings/langs/'))
     subprocess.check_call(shlex.split(f'g2p update'))
-    
+
+    # Languages whose mapping is generated by g2p itself instead of being
+    # copied like the mappings above.
+    ipa_langs = ["fra"]
+    for lang in ipa_langs:
+        subprocess.check_call(["g2p", "generate-mapping", lang, "--ipa"])
+    subprocess.check_call(["g2p", "update"])
+
     if not Path.exists(g2p_base / "mappings/langs/rji"):
         print("ERROR : Failed to find language mapping")
     else:
-- 
GitLab