From 92d6f85ce72b92a6d71c7449b166a739db2f8e6f Mon Sep 17 00:00:00 2001 From: Jinwoo Bae Date: Thu, 16 Apr 2026 11:45:00 -0700 Subject: [PATCH 1/3] =?UTF-8?q?Add=20Korean=20TN=20post-processing=20rules?= =?UTF-8?q?=20for=20particle=20agreement=20and=20month=20=E2=80=A6=20(#409?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add Korean TN post-processing rules for particle agreement and month handling Signed-off-by: Jinwoo Bae * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add Korean TN fraction test cases for particle agreement Signed-off-by: Jinwoo Bae * Fix Korean fraction verbalization with particle-aware handling and remove post_processing dependency Signed-off-by: Jinwoo Bae * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix date and fraction normalization issues based on review feedback Signed-off-by: Jinwoo Bae * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jinwoo Bae Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../text_normalization/ko/taggers/date.py | 31 +++--- .../text_normalization/ko/taggers/fraction.py | 19 +++- .../ko/verbalizers/fraction.py | 104 +++++++++++++++++- .../test_cases_fraction.txt | 16 ++- 4 files changed, 146 insertions(+), 24 deletions(-) diff --git a/nemo_text_processing/text_normalization/ko/taggers/date.py b/nemo_text_processing/text_normalization/ko/taggers/date.py index 4f2da5702..9748abc49 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/date.py +++ b/nemo_text_processing/text_normalization/ko/taggers/date.py @@ -226,8 +226,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + insert_space + pynutil.insert("year: \"") + (YEAR_ERA_1TO4 @ graph_cardinal) - + pynutil.delete("년") - + pynutil.insert("년") + + pynini.accep("년") + pynutil.insert("\"") ) | @@ -235,27 +234,26 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ( pynutil.insert("year: \"") + (YEAR_NO_ERA_1TO4 @ graph_cardinal) - + pynutil.delete("년") - + pynutil.insert("년") + + pynini.accep("년") + pynutil.insert("\"") ) ).optimize() individual_month_component = ( - pynutil.insert("month: \"") - + month_cardinal - + pynutil.delete("월") - + pynutil.insert("월") - + pynutil.insert("\"") + pynutil.insert("month: \"") + month_cardinal + pynini.accep("월") + pynutil.insert("\"") ) - individual_day_component = ( - pynutil.insert("day: \"") - + cardinal_lz - + pynutil.delete("일") - + pynutil.insert("일") - + pynutil.insert("\"") - ) + month_josa = pynini.union("에", "은", "는", "에는") + + individual_month_component_with_josa = ( + pynutil.insert('month: "') + + month_cardinal + + pynini.accep("월") + + pynini.closure(month_josa, 0, 1) + + pynutil.insert('"') + ).optimize() + + individual_day_component = pynutil.insert("day: \"") + cardinal_lz + pynini.accep("일") + pynutil.insert("\"") week_full_word_acceptor = pynini.project(week, "output") week_component_full_word = pynutil.insert("weekday: \"") + week_full_word_acceptor + pynutil.insert("\"") @@ -272,6 +270,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): day_and_weekday_component | month_and_weekday_component | individual_year_component + | individual_month_component_with_josa | individual_month_component | individual_day_component | week_component diff --git a/nemo_text_processing/text_normalization/ko/taggers/fraction.py b/nemo_text_processing/text_normalization/ko/taggers/fraction.py index 2163f5f7f..64ea0c56e 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/taggers/fraction.py @@ -81,6 +81,23 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + numerator_component ) + # Optional particles following the fraction + particle_subject = pynutil.insert('morphosyntactic_features: "분의_subject"') + ( + pynutil.delete("이") | pynutil.delete("가") + ) + particle_topic = pynutil.insert('morphosyntactic_features: "분의_topic"') + ( + pynutil.delete("은") | pynutil.delete("는") + ) + particle_object = pynutil.insert('morphosyntactic_features: "분의_object"') + ( + pynutil.delete("을") | pynutil.delete("를") + ) + + optional_particle = pynini.closure( + pynutil.insert(NEMO_SPACE) + (particle_subject | particle_topic | particle_object), + 0, + 1, + ) + # Optional minus sign optional_sign = ( pynutil.insert(f'negative: {DOUBLE_QUOTE}') @@ -90,7 +107,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) # Combine full graph - graph = pynini.closure(optional_sign, 0, 1) + (graph_fraction_slash | graph_fraction_word) + graph = pynini.closure(optional_sign, 0, 1) + (graph_fraction_slash | graph_fraction_word) + optional_particle self.graph = graph.optimize() final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py index bafbf133d..472b8a86d 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py @@ -60,7 +60,43 @@ def __init__(self, deterministic: bool = True): + numerator_component ) - # Match and delete integer_part field (e.g., "2" in "2과3분의1") + # Handle subject particle feature (분의_subject) + # Insert default particle "이" (will be corrected later via rewrite rules) + subject_suffix = ( + pynutil.delete(NEMO_SPACE) + + pynutil.delete('morphosyntactic_features:') + + delete_space + + pynutil.delete('"분의_subject"') + + delete_space + + pynutil.insert("이") # 일단 기본값 + ) + + # Handle topic particle feature (분의_topic) + topic_suffix = ( + pynutil.delete(NEMO_SPACE) + + pynutil.delete('morphosyntactic_features:') + + delete_space + + pynutil.delete('"분의_topic"') + + delete_space + + pynutil.insert("은") + ) + + # Handle object particle feature (분의_object) + object_suffix = ( + pynutil.delete(NEMO_SPACE) + + pynutil.delete('morphosyntactic_features:') + + delete_space + + pynutil.delete('"분의_object"') + + delete_space + + pynutil.insert("을") + ) + + # Combine fraction + optional particle suffix + # Particle is always inserted first in default form and later corrected + graph_fraction_all = graph_fraction + pynini.closure(subject_suffix | topic_suffix | object_suffix, 0, 1) + + # Handle integer + fraction (e.g., "2과 3/4") + # integer_part is removed and replaced with proper spacing graph_integer = ( pynutil.delete('integer_part:') + delete_space @@ -69,9 +105,10 @@ def __init__(self, deterministic: bool = True): + pynutil.delete('"') + pynutil.insert(NEMO_SPACE) ) - graph_integer_fraction = graph_integer + delete_space + graph_fraction + # Combine integer part with fraction + graph_integer_fraction = graph_integer + delete_space + graph_fraction_all - # Match and delete optional negative field (e.g., "마이너스") + # Handle optional negative prefix (e.g., "마이너스") optional_sign = ( pynutil.delete('negative:') + delete_space @@ -82,9 +119,64 @@ def __init__(self, deterministic: bool = True): + pynutil.insert(NEMO_SPACE) ) - # Final graph handles optional negative + (integer + fraction | fraction only) - graph = pynini.closure(optional_sign, 0, 1) + (graph_integer_fraction | graph_fraction) + # Final structure: + # [optional negative] + (integer + fraction OR fraction only) + graph = pynini.closure(optional_sign, 0, 1) + (graph_integer_fraction | graph_fraction_all) - # Final optimized verbalizer FST + # Remove token wrappers final_graph = self.delete_tokens(graph) + + # Sigma for rewrite context (entire string) + sigma = pynini.closure(NEMO_NOT_QUOTE | NEMO_SPACE) + + # Fix subject particle agreement (이 → 가 for vowel-ending numerals) + # e.g., 사이 → 사가, 구이 → 구가 + subject_rewrite = pynini.cdrewrite( + pynini.string_map( + [ + ("이이", "이가"), + ("사이", "사가"), + ("오이", "오가"), + ("구이", "구가"), + ] + ), + "", + "", + sigma, + ) + + # Fix topic particle agreement (은 → 는) + # e.g., 이은 → 이는, 사은 → 사는 + topic_rewrite = pynini.cdrewrite( + pynini.string_map( + [ + ("이은", "이는"), + ("사은", "사는"), + ("오은", "오는"), + ("구은", "구는"), + ] + ), + "", + "", + sigma, + ) + + # Fix object particle agreement (을 → 를) + # e.g., 오을 → 오를, 이을 → 이를 + object_rewrite = pynini.cdrewrite( + pynini.string_map( + [ + ("이을", "이를"), + ("사을", "사를"), + ("오을", "오를"), + ("구을", "구를"), + ] + ), + "", + "", + sigma, + ) + + # Apply all rewrite rules sequentially and final optimized FST + final_graph = final_graph @ subject_rewrite @ topic_rewrite @ object_rewrite self.fst = final_graph.optimize() diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt index a183be59b..65e5049b8 100644 --- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt @@ -11,4 +11,18 @@ 1과1/3~일과 삼분의 일 1과√1/4~일과 사분의 루트 일 3분의1~삼분의 일 -121분의3221~백이십일분의 삼천이백이십일 \ No newline at end of file +121분의3221~백이십일분의 삼천이백이십일 +이번 경기의 3/5이 중요하다~이번 경기의 오분의 삼이 중요하다 +전체 구역의 4/7이 통제되었다~전체 구역의 칠분의 사가 통제되었다 +설문 응답자의 9/10이 찬성했다~설문 응답자의 십분의 구가 찬성했다 +그 중 2/3은 성공했다~그 중 삼분의 이는 성공했다 +참가자의 5/8이 탈락했다~참가자의 팔분의 오가 탈락했다 +참가자의 6/7 이 통과했다~참가자의 칠분의 육 이 통과했다 +전체의 3/4 이 감소했다~전체의 사분의 삼 이 감소했다 +응답자의 2/5이 반대했다~응답자의 오분의 이가 반대했다 +학생의 7/9 이 합격했다~학생의 구분의 칠 이 합격했다 +전체의 1/2 이 남았다~전체의 이분의 일 이 남았다 +그 중 4/5이 성공했다~그 중 오분의 사가 성공했다 +전체의 5/6이 완료되었다~전체의 육분의 오가 완료되었다 +참가자의 3/8이 탈락했다~참가자의 팔분의 삼이 탈락했다 +응답자의 6/10 이 동의했다~응답자의 십분의 육 이 동의했다 \ No newline at end of file From d35c205150eadc8643a40a06eedc576499c20c17 Mon Sep 17 00:00:00 2001 From: Mariana <47233618+mgrafu@users.noreply.github.com> Date: Thu, 23 Apr 2026 11:07:54 -0400 Subject: [PATCH 2/3] Jenkins fix (#419) Signed-off-by: Mariana Graterol Fuenmayor --- Jenkinsfile | 231 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 155 insertions(+), 76 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1219aae54..34b25bbbe 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,9 +1,9 @@ pipeline { agent { - docker { - image 'tnitn_ci_py310:24.07' - args '-v /mnt/jenkins/jenkinsci/TestData:/home/jenkins/TestData -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""' - } + docker { + image 'tnitn_ci_py310:24.07' + args '-v /mnt/jenkins/jenkinsci/TestData:/home/jenkins/TestData -v $HOME/.cache:/root/.cache --shm-size=4g --entrypoint=""' + } } options { timeout(time: 2, unit: 'HOURS') @@ -28,11 +28,12 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/02-18-26-0' - KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-03-25-0' + KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-22-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { + stage('PyTorch version') { steps { sh 'python -c "import torch; print(torch.__version__)"' @@ -46,6 +47,7 @@ pipeline { } } + stage('L0: Create EN TN/ITN Grammars') { when { anyOf { @@ -53,6 +55,7 @@ pipeline { branch 'staging/**' branch 'staging_*' changeRequest target: 'main' + } } failFast true @@ -77,10 +80,35 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir ${EN_TN_CACHE}' } } + + } + } + stage('L0: Create HI TN/ITN Grammars') { + when { + anyOf { + branch 'main' + branch 'staging/**' + branch 'staging_*' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: Hi TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hi --text="१" --cache_dir ${HI_TN_CACHE}' + } + } + stage('L0: Hi ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi --text="एक" --cache_dir ${HI_TN_CACHE}' + } + } + } } - stage('L0: Create DE/ES/FR TN/ITN Grammars') { + stage('L0: Create DE/ES TN/ITN Grammars') { when { anyOf { branch 'main' @@ -93,12 +121,12 @@ pipeline { parallel { stage('L0: DE TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=de --text="1" --cache_dir ${DEFAULT_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=de --text="1" --cache_dir ${DE_TN_CACHE}' } } stage('L0: DE ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=de --text="ein hundert " --cache_dir ${DEFAULT_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=de --text="ein hundert " --cache_dir ${DE_TN_CACHE}' } } stage('L0: ES TN grammars') { @@ -116,24 +144,38 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=es_en --text="ciento uno " --cache_dir ${ES_EN_TN_CACHE}' } } - stage('L0: FR TN grammars') { + } + } + + stage('L0: Create AR TN/ITN Grammars') { + when { + anyOf { + branch 'main' + branch 'staging/**' + branch 'staging_*' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: AR TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=fr --text="2" --cache_dir ${FR_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ar --text="2" --cache_dir ${AR_TN_CACHE}' } } - stage('L0: FR ITN grammars') { + stage('L0: AR ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=fr --text="cent " --cache_dir ${FR_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ar --text="اثنان " --cache_dir ${AR_TN_CACHE}' } } + } } - - stage('L0: Create HI/VI/RU TN/ITN') { + stage('L0: Create FR TN/ITN & VI TN/ITN & HU TN & IT TN') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -141,43 +183,43 @@ pipeline { } failFast true parallel { - stage('L0: VI ITN grammars') { + stage('L0: FR TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=fr --text="2" --cache_dir ${FR_TN_CACHE}' } } - stage('L0: VI TN grammars') { + stage('L0: FR ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=vi --text="100" --cache_dir ${VI_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=fr --text="cent " --cache_dir ${FR_TN_CACHE}' } } - stage('L0: RU TN grammars') { + stage('L0: VI ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --lang=ru --text="03" --cache_dir ${RU_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=vi --text="một ngàn " --cache_dir ${VI_TN_CACHE}' } } - stage('L0: RU ITN grammars') { + stage('L0: VI TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ru --text="три " --cache_dir ${RU_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=vi --text="100" --cache_dir ${VI_TN_CACHE}' } } - stage('L0: Hi TN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hi --text="१" --cache_dir ${HI_TN_CACHE}' + stage('L0: HU TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}' } } - stage('L0: Hi ITN grammars') { + stage('L0: IT TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hi --text="एक" --cache_dir ${HI_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=it --text="122" --cache_dir ${IT_TN_CACHE}' } } } } - stage('L0: Create AR/HU/SV/PT/IT TN/ITN Grammars') { + stage('L0: Create RU TN/ITN Grammars & SV & PT') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -185,53 +227,60 @@ pipeline { } failFast true parallel { - stage('L0: SV TN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=sv --text="100" --cache_dir ${SV_TN_CACHE}' - } - } - stage('L0: HU TN grammars') { + stage('L0: RU TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hu --text="100" --cache_dir ${HU_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --lang=ru --text="03" --cache_dir ${RU_TN_CACHE}' } } - stage('L0: AR TN grammars') { + stage('L0: RU ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ar --text="2" --cache_dir ${AR_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ru --text="три " --cache_dir ${RU_TN_CACHE}' } } - stage('L0: AR ITN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ar --text="اثنان " --cache_dir ${AR_TN_CACHE}' + stage('L0: SV TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=sv --text="100" --cache_dir ${SV_TN_CACHE}' } } - // stage('L0: SV ITN grammars') { - // steps { - // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=sv --text="hundra " --cache_dir ${SV_TN_CACHE}' - // } - // } - // stage('L0: PT TN grammars') { - // steps { - // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=pt --text="2" --cache_dir ${DEFAULT_TN_CACHE}' - // } - // } + // stage('L0: SV ITN grammars') { + // steps { + // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=sv --text="hundra " --cache_dir ${SV_TN_CACHE}' + // } + // } + // stage('L0: PT TN grammars') { + // steps { + // sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=pt --text="2" --cache_dir ${DEFAULT_TN_CACHE}' + // } + // } stage('L0: PT ITN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=pt --text="dez " --cache_dir ${PT_TN_CACHE}' } } - stage('L0: IT TN grammars') { + } + } + stage('L0: Create HE ITN Grammar') { + when { + anyOf { + branch 'main' + branch 'staging/**' + branch 'staging_*' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: HE ITN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=it --text="122" --cache_dir ${IT_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=he --text="ת " --cache_dir ${HE_TN_CACHE}' } } } } - - stage('L0: Create MR/HE/HY TN/ITN Grammars') { + stage('L0: Create HY TN/ITN Grammars & MR') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -254,18 +303,12 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=hy --text="վեց " --cache_dir ${HY_TN_CACHE}' } } - stage('L0: HE ITN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=he --text="ת " --cache_dir ${HE_TN_CACHE}' - } - } } } - - stage('L0: Create CJK TN/ITN Grammar') { + stage('L0: Create ZH TN/ITN Grammar') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -283,30 +326,57 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=zh --text="6" --cache_dir ${ZH_TN_CACHE}' } } + } + } + stage('L0: Create JA ITN Grammars') { + when { + anyOf { + branch 'main' + branch 'staging/**' + branch 'staging_*' + changeRequest target: 'main' + } + } + failFast true + parallel { stage('L0: JA ITN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ja --text="100" --cache_dir ${JA_TN_CACHE}' } } - stage('L0: KO TN grammars') { - steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}' - } + } + } + stage('L0: Create KO TN/ITN Grammars') { + when { + anyOf { + branch 'main' + branch 'staging/**' + branch 'staging_*' + changeRequest target: 'main' } + } + failFast true + parallel { stage('L0: KO ITN grammars') { steps { sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ko --text="백" --cache_dir ${KO_TN_CACHE}' } } + stage('L0: KO TN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}' + } + } } } - // L1 Tests starts here + +// L1 Tests starts here stage('L1: TN/ITN Tests CPU') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -378,7 +448,7 @@ pipeline { steps { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ja/ -m "not pleasefixme" --cpu --tn_cache_dir ${JA_TN_CACHE}' } - } + } stage('L1: Run all MR ITN tests (restore grammars from cache)') { steps { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/mr/ -m "not pleasefixme" --cpu --tn_cache_dir ${MR_TN_CACHE}' @@ -402,10 +472,10 @@ pipeline { } } - stage('L2: EN Sparrowhawk Tests') { + stage('L2: EN Sparrowhawk Tests') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -417,12 +487,14 @@ pipeline { steps { sh 'CUDA_VISIBLE_DEVICES="" cp -r /workspace/sparrowhawk/documentation/grammars /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_lower_cased && cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_itn_grammars" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_ITN --LANGUAGE="en"' sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_inverse_text_normalization.sh /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_lower_cased `pwd`' + } } stage('L2: EN ITN Run Sparrowhawk test - Cased Input') { steps { sh 'CUDA_VISIBLE_DEVICES="" cp -r /workspace/sparrowhawk/documentation/grammars /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_cased && cd tools/text_processing_deployment && bash sh_test.sh --MODE="test_itn_grammars" --INPUT_CASE="cased" --OVERWRITE_CACHE=False --FAR_PATH=${EN_TN_CACHE}/SH_ITN_cased --LANGUAGE="en"' sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_inverse_text_normalization_cased.sh /workspace/sparrowhawk/documentation/grammars_en_itn_grammars_cased `pwd`' + } } stage('L2: EN TN Run Sparrowhawk test') { @@ -431,13 +503,14 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" cd tests/nemo_text_processing/en && bash test_sparrowhawk_normalization.sh /workspace/sparrowhawk/documentation/grammars_en_tn_grammars_cased `pwd`' } } + } } - + stage('L2: NeMo text processing') { when { anyOf { - branch 'main' + branch 'main' branch 'staging/**' branch 'staging_*' changeRequest target: 'main' @@ -456,6 +529,7 @@ pipeline { rm -rf $NORM_OUTPUT_DIR' } } + stage('L2: Eng ITN export') { steps { sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkins/TestData/text_denorm/output_${TIME} && \ @@ -466,6 +540,8 @@ pipeline { rm -rf $DENORM_OUTPUT_DIR' } } + + stage('L2: Eng alignment TN') { steps { sh 'TIME=`date +"%Y-%m-%d-%T"` && NORM_OUTPUT_DIR=/home/jenkins/TestData/text_norm/output_${TIME} && mkdir $NORM_OUTPUT_DIR && \ @@ -474,6 +550,7 @@ pipeline { rm -rf $NORM_OUTPUT_DIR' } } + stage('L2: Eng alignment ITN') { steps { sh 'TIME=`date +"%Y-%m-%d-%T"` && DENORM_OUTPUT_DIR=/home/jenkins/TestData/text_denorm/output_${TIME} && mkdir $DENORM_OUTPUT_DIR && \ @@ -482,10 +559,12 @@ pipeline { rm -rf $DENORM_OUTPUT_DIR' } } + } } } + post { always { sh 'chmod -R 777 .' From 114b3db85aa78ef8453ad7336eaee2117d507942 Mon Sep 17 00:00:00 2001 From: Mariana Graterol Fuenmayor Date: Thu, 23 Apr 2026 08:17:00 -0700 Subject: [PATCH 3/3] update jenkins cache Signed-off-by: Mariana Graterol Fuenmayor --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 34b25bbbe..8f36c1fb2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/02-18-26-0' - KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-22-26-0' + KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages {